summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c6
-rw-r--r--fs/9p/vfs_dentry.c1
-rw-r--r--fs/9p/vfs_file.c13
-rw-r--r--fs/9p/vfs_inode.c7
-rw-r--r--fs/9p/vfs_inode_dotl.c10
-rw-r--r--fs/9p/vfs_super.c10
-rw-r--r--fs/Kconfig15
-rw-r--r--fs/Makefile5
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c9
-rw-r--r--fs/adfs/super.c2
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/file.c37
-rw-r--r--fs/affs/namei.c8
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/Makefile1
-rw-r--r--fs/afs/addr_list.c50
-rw-r--r--fs/afs/addr_prefs.c2
-rw-r--r--fs/afs/cell.c447
-rw-r--r--fs/afs/cm_security.c340
-rw-r--r--fs/afs/cmservice.c82
-rw-r--r--fs/afs/dir.c19
-rw-r--r--fs/afs/dir_silly.c6
-rw-r--r--fs/afs/dynroot.c501
-rw-r--r--fs/afs/file.c12
-rw-r--r--fs/afs/fs_probe.c34
-rw-r--r--fs/afs/fsclient.c4
-rw-r--r--fs/afs/internal.h120
-rw-r--r--fs/afs/main.c17
-rw-r--r--fs/afs/misc.c27
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/afs/proc.c19
-rw-r--r--fs/afs/rxrpc.c48
-rw-r--r--fs/afs/server.c604
-rw-r--r--fs/afs/server_list.c6
-rw-r--r--fs/afs/super.c29
-rw-r--r--fs/afs/vl_alias.c7
-rw-r--r--fs/afs/vl_rotate.c2
-rw-r--r--fs/afs/volume.c15
-rw-r--r--fs/afs/write.c9
-rw-r--r--fs/aio.c9
-rw-r--r--fs/anon_inodes.c72
-rw-r--r--fs/attr.c10
-rw-r--r--fs/autofs/autofs_i.h2
-rw-r--r--fs/autofs/dev-ioctl.c6
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs/root.c14
-rw-r--r--fs/backing-file.c8
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/bcachefs/Kconfig25
-rw-r--r--fs/bcachefs/Makefile7
-rw-r--r--fs/bcachefs/acl.c4
-rw-r--r--fs/bcachefs/alloc_background.c555
-rw-r--r--fs/bcachefs/alloc_background.h18
-rw-r--r--fs/bcachefs/alloc_foreground.c741
-rw-r--r--fs/bcachefs/alloc_foreground.h100
-rw-r--r--fs/bcachefs/alloc_types.h17
-rw-r--r--fs/bcachefs/async_objs.c132
-rw-r--r--fs/bcachefs/async_objs.h44
-rw-r--r--fs/bcachefs/async_objs_types.h25
-rw-r--r--fs/bcachefs/backpointers.c585
-rw-r--r--fs/bcachefs/backpointers.h43
-rw-r--r--fs/bcachefs/bcachefs.h323
-rw-r--r--fs/bcachefs/bcachefs_format.h124
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h29
-rw-r--r--fs/bcachefs/bkey.c47
-rw-r--r--fs/bcachefs/bkey.h4
-rw-r--r--fs/bcachefs/bkey_methods.c26
-rw-r--r--fs/bcachefs/bset.c64
-rw-r--r--fs/bcachefs/bset.h22
-rw-r--r--fs/bcachefs/btree_cache.c245
-rw-r--r--fs/bcachefs/btree_cache.h1
-rw-r--r--fs/bcachefs/btree_gc.c257
-rw-r--r--fs/bcachefs/btree_gc.h3
-rw-r--r--fs/bcachefs/btree_io.c718
-rw-r--r--fs/bcachefs/btree_io.h16
-rw-r--r--fs/bcachefs/btree_iter.c786
-rw-r--r--fs/bcachefs/btree_iter.h262
-rw-r--r--fs/bcachefs/btree_journal_iter.c104
-rw-r--r--fs/bcachefs/btree_journal_iter_types.h5
-rw-r--r--fs/bcachefs/btree_key_cache.c124
-rw-r--r--fs/bcachefs/btree_key_cache.h3
-rw-r--r--fs/bcachefs/btree_locking.c273
-rw-r--r--fs/bcachefs/btree_locking.h80
-rw-r--r--fs/bcachefs/btree_node_scan.c153
-rw-r--r--fs/bcachefs/btree_node_scan.h2
-rw-r--r--fs/bcachefs/btree_trans_commit.c255
-rw-r--r--fs/bcachefs/btree_types.h80
-rw-r--r--fs/bcachefs/btree_update.c175
-rw-r--r--fs/bcachefs/btree_update.h93
-rw-r--r--fs/bcachefs/btree_update_interior.c437
-rw-r--r--fs/bcachefs/btree_update_interior.h25
-rw-r--r--fs/bcachefs/btree_write_buffer.c67
-rw-r--r--fs/bcachefs/btree_write_buffer.h7
-rw-r--r--fs/bcachefs/buckets.c461
-rw-r--r--fs/bcachefs/buckets.h70
-rw-r--r--fs/bcachefs/buckets_types.h32
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.c15
-rw-r--r--fs/bcachefs/buckets_waiting_for_journal.h4
-rw-r--r--fs/bcachefs/chardev.c94
-rw-r--r--fs/bcachefs/checksum.c276
-rw-r--r--fs/bcachefs/checksum.h7
-rw-r--r--fs/bcachefs/clock.c47
-rw-r--r--fs/bcachefs/clock.h1
-rw-r--r--fs/bcachefs/compress.c82
-rw-r--r--fs/bcachefs/darray.h59
-rw-r--r--fs/bcachefs/data_update.c562
-rw-r--r--fs/bcachefs/data_update.h44
-rw-r--r--fs/bcachefs/debug.c160
-rw-r--r--fs/bcachefs/debug.h20
-rw-r--r--fs/bcachefs/dirent.c294
-rw-r--r--fs/bcachefs/dirent.h46
-rw-r--r--fs/bcachefs/dirent_format.h20
-rw-r--r--fs/bcachefs/disk_accounting.c210
-rw-r--r--fs/bcachefs/disk_accounting.h62
-rw-r--r--fs/bcachefs/disk_accounting_format.h90
-rw-r--r--fs/bcachefs/disk_groups.c177
-rw-r--r--fs/bcachefs/ec.c828
-rw-r--r--fs/bcachefs/ec.h56
-rw-r--r--fs/bcachefs/ec_types.h19
-rw-r--r--fs/bcachefs/enumerated_ref.c144
-rw-r--r--fs/bcachefs/enumerated_ref.h66
-rw-r--r--fs/bcachefs/enumerated_ref_types.h19
-rw-r--r--fs/bcachefs/errcode.c4
-rw-r--r--fs/bcachefs/errcode.h96
-rw-r--r--fs/bcachefs/error.c465
-rw-r--r--fs/bcachefs/error.h125
-rw-r--r--fs/bcachefs/extent_update.c80
-rw-r--r--fs/bcachefs/extent_update.h2
-rw-r--r--fs/bcachefs/extents.c418
-rw-r--r--fs/bcachefs/extents.h36
-rw-r--r--fs/bcachefs/extents_format.h24
-rw-r--r--fs/bcachefs/extents_types.h12
-rw-r--r--fs/bcachefs/eytzinger.c76
-rw-r--r--fs/bcachefs/eytzinger.h95
-rw-r--r--fs/bcachefs/fast_list.c156
-rw-r--r--fs/bcachefs/fast_list.h41
-rw-r--r--fs/bcachefs/fs-io-buffered.c93
-rw-r--r--fs/bcachefs/fs-io-buffered.h4
-rw-r--r--fs/bcachefs/fs-io-direct.c27
-rw-r--r--fs/bcachefs/fs-io-pagecache.c20
-rw-r--r--fs/bcachefs/fs-io.c106
-rw-r--r--fs/bcachefs/fs-ioctl.c223
-rw-r--r--fs/bcachefs/fs-ioctl.h73
-rw-r--r--fs/bcachefs/fs.c687
-rw-r--r--fs/bcachefs/fsck.c1137
-rw-r--r--fs/bcachefs/fsck.h6
-rw-r--r--fs/bcachefs/inode.c290
-rw-r--r--fs/bcachefs/inode.h62
-rw-r--r--fs/bcachefs/inode_format.h13
-rw-r--r--fs/bcachefs/io_misc.c50
-rw-r--r--fs/bcachefs/io_misc.h2
-rw-r--r--fs/bcachefs/io_read.c1084
-rw-r--r--fs/bcachefs/io_read.h118
-rw-r--r--fs/bcachefs/io_write.c547
-rw-r--r--fs/bcachefs/io_write.h38
-rw-r--r--fs/bcachefs/io_write_types.h34
-rw-r--r--fs/bcachefs/journal.c487
-rw-r--r--fs/bcachefs/journal.h58
-rw-r--r--fs/bcachefs/journal_io.c580
-rw-r--r--fs/bcachefs/journal_io.h1
-rw-r--r--fs/bcachefs/journal_reclaim.c143
-rw-r--r--fs/bcachefs/journal_sb.c2
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c21
-rw-r--r--fs/bcachefs/journal_seq_blacklist.h1
-rw-r--r--fs/bcachefs/journal_types.h45
-rw-r--r--fs/bcachefs/lru.c105
-rw-r--r--fs/bcachefs/lru.h22
-rw-r--r--fs/bcachefs/lru_format.h6
-rw-r--r--fs/bcachefs/migrate.c147
-rw-r--r--fs/bcachefs/migrate.h3
-rw-r--r--fs/bcachefs/move.c733
-rw-r--r--fs/bcachefs/move.h17
-rw-r--r--fs/bcachefs/move_types.h28
-rw-r--r--fs/bcachefs/movinggc.c268
-rw-r--r--fs/bcachefs/movinggc.h10
-rw-r--r--fs/bcachefs/namei.c (renamed from fs/bcachefs/fs-common.c)531
-rw-r--r--fs/bcachefs/namei.h (renamed from fs/bcachefs/fs-common.h)38
-rw-r--r--fs/bcachefs/nocow_locking.c4
-rw-r--r--fs/bcachefs/nocow_locking.h2
-rw-r--r--fs/bcachefs/opts.c314
-rw-r--r--fs/bcachefs/opts.h135
-rw-r--r--fs/bcachefs/printbuf.c19
-rw-r--r--fs/bcachefs/printbuf.h9
-rw-r--r--fs/bcachefs/progress.c61
-rw-r--r--fs/bcachefs/progress.h29
-rw-r--r--fs/bcachefs/quota.c8
-rw-r--r--fs/bcachefs/rcu_pending.c22
-rw-r--r--fs/bcachefs/rebalance.c303
-rw-r--r--fs/bcachefs/rebalance.h36
-rw-r--r--fs/bcachefs/rebalance_types.h6
-rw-r--r--fs/bcachefs/recovery.c222
-rw-r--r--fs/bcachefs/recovery.h3
-rw-r--r--fs/bcachefs/recovery_passes.c650
-rw-r--r--fs/bcachefs/recovery_passes.h40
-rw-r--r--fs/bcachefs/recovery_passes_format.h106
-rw-r--r--fs/bcachefs/recovery_passes_types.h93
-rw-r--r--fs/bcachefs/reflink.c96
-rw-r--r--fs/bcachefs/replicas.c35
-rw-r--r--fs/bcachefs/sb-counters.c90
-rw-r--r--fs/bcachefs/sb-counters.h4
-rw-r--r--fs/bcachefs/sb-counters_format.h34
-rw-r--r--fs/bcachefs/sb-downgrade.c25
-rw-r--r--fs/bcachefs/sb-errors.c22
-rw-r--r--fs/bcachefs/sb-errors.h1
-rw-r--r--fs/bcachefs/sb-errors_format.h59
-rw-r--r--fs/bcachefs/sb-members.c138
-rw-r--r--fs/bcachefs/sb-members.h110
-rw-r--r--fs/bcachefs/sb-members_format.h7
-rw-r--r--fs/bcachefs/sb-members_types.h1
-rw-r--r--fs/bcachefs/six.c12
-rw-r--r--fs/bcachefs/six.h7
-rw-r--r--fs/bcachefs/snapshot.c673
-rw-r--r--fs/bcachefs/snapshot.h115
-rw-r--r--fs/bcachefs/snapshot_format.h4
-rw-r--r--fs/bcachefs/snapshot_types.h57
-rw-r--r--fs/bcachefs/str_hash.c371
-rw-r--r--fs/bcachefs/str_hash.h51
-rw-r--r--fs/bcachefs/subvolume.c123
-rw-r--r--fs/bcachefs/subvolume.h19
-rw-r--r--fs/bcachefs/subvolume_types.h27
-rw-r--r--fs/bcachefs/super-io.c193
-rw-r--r--fs/bcachefs/super-io.h18
-rw-r--r--fs/bcachefs/super.c1075
-rw-r--r--fs/bcachefs/super.h13
-rw-r--r--fs/bcachefs/super_types.h8
-rw-r--r--fs/bcachefs/sysfs.c251
-rw-r--r--fs/bcachefs/sysfs.h5
-rw-r--r--fs/bcachefs/tests.c34
-rw-r--r--fs/bcachefs/thread_with_file.c4
-rw-r--r--fs/bcachefs/time_stats.c20
-rw-r--r--fs/bcachefs/time_stats.h1
-rw-r--r--fs/bcachefs/trace.h361
-rw-r--r--fs/bcachefs/util.c308
-rw-r--r--fs/bcachefs/util.h81
-rw-r--r--fs/bcachefs/xattr.c31
-rw-r--r--fs/bcachefs/xattr.h4
-rw-r--r--fs/bcachefs/xattr_format.h8
-rw-r--r--fs/bfs/file.c9
-rw-r--r--fs/bfs/inode.c30
-rw-r--r--fs/binfmt_elf.c194
-rw-r--r--fs/binfmt_elf_fdpic.c24
-rw-r--r--fs/binfmt_misc.c46
-rw-r--r--fs/bpf_fs_kfuncs.c259
-rw-r--r--fs/btrfs/Kconfig36
-rw-r--r--fs/btrfs/accessors.c162
-rw-r--r--fs/btrfs/accessors.h38
-rw-r--r--fs/btrfs/acl.h2
-rw-r--r--fs/btrfs/async-thread.c14
-rw-r--r--fs/btrfs/backref.c63
-rw-r--r--fs/btrfs/backref.h23
-rw-r--r--fs/btrfs/bio.c117
-rw-r--r--fs/btrfs/bio.h3
-rw-r--r--fs/btrfs/block-group.c395
-rw-r--r--fs/btrfs/block-group.h18
-rw-r--r--fs/btrfs/block-rsv.c11
-rw-r--r--fs/btrfs/block-rsv.h1
-rw-r--r--fs/btrfs/btrfs_inode.h37
-rw-r--r--fs/btrfs/compression.c130
-rw-r--r--fs/btrfs/compression.h40
-rw-r--r--fs/btrfs/ctree.c217
-rw-r--r--fs/btrfs/ctree.h39
-rw-r--r--fs/btrfs/defrag.c279
-rw-r--r--fs/btrfs/defrag.h4
-rw-r--r--fs/btrfs/delalloc-space.c51
-rw-r--r--fs/btrfs/delalloc-space.h4
-rw-r--r--fs/btrfs/delayed-inode.c285
-rw-r--r--fs/btrfs/delayed-inode.h9
-rw-r--r--fs/btrfs/delayed-ref.c19
-rw-r--r--fs/btrfs/delayed-ref.h9
-rw-r--r--fs/btrfs/dev-replace.c73
-rw-r--r--fs/btrfs/dev-replace.h2
-rw-r--r--fs/btrfs/dir-item.c28
-rw-r--r--fs/btrfs/dir-item.h3
-rw-r--r--fs/btrfs/direct-io.c94
-rw-r--r--fs/btrfs/direct-io.h2
-rw-r--r--fs/btrfs/discard.c53
-rw-r--r--fs/btrfs/discard.h1
-rw-r--r--fs/btrfs/disk-io.c366
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/export.c51
-rw-r--r--fs/btrfs/extent-io-tree.c530
-rw-r--r--fs/btrfs/extent-io-tree.h166
-rw-r--r--fs/btrfs/extent-tree.c349
-rw-r--r--fs/btrfs/extent-tree.h7
-rw-r--r--fs/btrfs/extent_io.c1694
-rw-r--r--fs/btrfs/extent_io.h26
-rw-r--r--fs/btrfs/extent_map.c264
-rw-r--r--fs/btrfs/extent_map.h47
-rw-r--r--fs/btrfs/fiemap.c11
-rw-r--r--fs/btrfs/file-item.c81
-rw-r--r--fs/btrfs/file-item.h8
-rw-r--r--fs/btrfs/file.c953
-rw-r--r--fs/btrfs/file.h2
-rw-r--r--fs/btrfs/free-space-cache.c117
-rw-r--r--fs/btrfs/free-space-tree.c526
-rw-r--r--fs/btrfs/free-space-tree.h52
-rw-r--r--fs/btrfs/fs.c1
-rw-r--r--fs/btrfs/fs.h46
-rw-r--r--fs/btrfs/inode-item.c61
-rw-r--r--fs/btrfs/inode-item.h11
-rw-r--r--fs/btrfs/inode.c1811
-rw-r--r--fs/btrfs/ioctl.c421
-rw-r--r--fs/btrfs/ioctl.h10
-rw-r--r--fs/btrfs/locking.c9
-rw-r--r--fs/btrfs/locking.h2
-rw-r--r--fs/btrfs/lzo.c5
-rw-r--r--fs/btrfs/messages.h188
-rw-r--r--fs/btrfs/misc.h38
-rw-r--r--fs/btrfs/ordered-data.c110
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/print-tree.c4
-rw-r--r--fs/btrfs/print-tree.h2
-rw-r--r--fs/btrfs/props.c66
-rw-r--r--fs/btrfs/props.h8
-rw-r--r--fs/btrfs/qgroup.c427
-rw-r--r--fs/btrfs/qgroup.h3
-rw-r--r--fs/btrfs/raid-stripe-tree.c7
-rw-r--r--fs/btrfs/raid-stripe-tree.h1
-rw-r--r--fs/btrfs/raid56.c219
-rw-r--r--fs/btrfs/rcu-string.h58
-rw-r--r--fs/btrfs/ref-verify.c146
-rw-r--r--fs/btrfs/ref-verify.h4
-rw-r--r--fs/btrfs/reflink.c131
-rw-r--r--fs/btrfs/relocation.c301
-rw-r--r--fs/btrfs/relocation.h3
-rw-r--r--fs/btrfs/scrub.c553
-rw-r--r--fs/btrfs/send.c673
-rw-r--r--fs/btrfs/send.h4
-rw-r--r--fs/btrfs/space-info.c188
-rw-r--r--fs/btrfs/space-info.h15
-rw-r--r--fs/btrfs/subpage.c480
-rw-r--r--fs/btrfs/subpage.h83
-rw-r--r--fs/btrfs/super.c341
-rw-r--r--fs/btrfs/sysfs.c123
-rw-r--r--fs/btrfs/sysfs.h1
-rw-r--r--fs/btrfs/tests/btrfs-tests.c32
-rw-r--r--fs/btrfs/tests/delayed-refs-tests.c1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c91
-rw-r--r--fs/btrfs/tests/extent-map-tests.c103
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c93
-rw-r--r--fs/btrfs/tests/inode-tests.c107
-rw-r--r--fs/btrfs/transaction.c163
-rw-r--r--fs/btrfs/tree-checker.c36
-rw-r--r--fs/btrfs/tree-log.c1124
-rw-r--r--fs/btrfs/tree-mod-log.c81
-rw-r--r--fs/btrfs/ulist.c59
-rw-r--r--fs/btrfs/verity.c4
-rw-r--r--fs/btrfs/volumes.c585
-rw-r--r--fs/btrfs/volumes.h53
-rw-r--r--fs/btrfs/xattr.c9
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c90
-rw-r--r--fs/btrfs/zoned.c368
-rw-r--r--fs/btrfs/zoned.h3
-rw-r--r--fs/btrfs/zstd.c77
-rw-r--r--fs/buffer.c220
-rw-r--r--fs/cachefiles/internal.h1
-rw-r--r--fs/cachefiles/io.c18
-rw-r--r--fs/cachefiles/key.c3
-rw-r--r--fs/cachefiles/namei.c31
-rw-r--r--fs/cachefiles/ondemand.c11
-rw-r--r--fs/ceph/Kconfig2
-rw-r--r--fs/ceph/addr.c1290
-rw-r--r--fs/ceph/caps.c18
-rw-r--r--fs/ceph/crypto.c95
-rw-r--r--fs/ceph/crypto.h28
-rw-r--r--fs/ceph/debugfs.c2
-rw-r--r--fs/ceph/dir.c52
-rw-r--r--fs/ceph/export.c21
-rw-r--r--fs/ceph/file.c29
-rw-r--r--fs/ceph/inode.c36
-rw-r--r--fs/ceph/mds_client.c38
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/super.c17
-rw-r--r--fs/ceph/super.h4
-rw-r--r--fs/coda/dir.c26
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/configfs/Kconfig1
-rw-r--r--fs/configfs/dir.c11
-rw-r--r--fs/configfs/item.c2
-rw-r--r--fs/configfs/mount.c3
-rw-r--r--fs/coredump.c1039
-rw-r--r--fs/cramfs/inode.c5
-rw-r--r--fs/crypto/Kconfig21
-rw-r--r--fs/crypto/bio.c9
-rw-r--r--fs/crypto/crypto.c74
-rw-r--r--fs/crypto/fname.c69
-rw-r--r--fs/crypto/fscrypt_private.h98
-rw-r--r--fs/crypto/hkdf.c91
-rw-r--r--fs/crypto/hooks.c2
-rw-r--r--fs/crypto/inline_crypt.c43
-rw-r--r--fs/crypto/keyring.c137
-rw-r--r--fs/crypto/keysetup.c86
-rw-r--r--fs/crypto/keysetup_v1.c59
-rw-r--r--fs/crypto/policy.c4
-rw-r--r--fs/d_path.c8
-rw-r--r--fs/dax.c536
-rw-r--r--fs/dcache.c249
-rw-r--r--fs/debugfs/file.c109
-rw-r--r--fs/debugfs/inode.c42
-rw-r--r--fs/debugfs/internal.h2
-rw-r--r--fs/devpts/inode.c253
-rw-r--r--fs/direct-io.c10
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c3
-rw-r--r--fs/dlm/config.h2
-rw-r--r--fs/dlm/lock.c4
-rw-r--r--fs/dlm/lockspace.c2
-rw-r--r--fs/dlm/lowcomms.c11
-rw-r--r--fs/drop_caches.c23
-rw-r--r--fs/ecryptfs/file.c2
-rw-r--r--fs/ecryptfs/inode.c44
-rw-r--r--fs/ecryptfs/main.c5
-rw-r--r--fs/ecryptfs/mmap.c10
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/efivarfs/inode.c4
-rw-r--r--fs/efivarfs/internal.h1
-rw-r--r--fs/efivarfs/super.c173
-rw-r--r--fs/erofs/Kconfig48
-rw-r--r--fs/erofs/Makefile1
-rw-r--r--fs/erofs/compress.h12
-rw-r--r--fs/erofs/data.c242
-rw-r--r--fs/erofs/decompressor.c109
-rw-r--r--fs/erofs/decompressor_crypto.c181
-rw-r--r--fs/erofs/decompressor_deflate.c28
-rw-r--r--fs/erofs/decompressor_lzma.c8
-rw-r--r--fs/erofs/decompressor_zstd.c8
-rw-r--r--fs/erofs/dir.c32
-rw-r--r--fs/erofs/erofs_fs.h202
-rw-r--r--fs/erofs/fileio.c30
-rw-r--r--fs/erofs/fscache.c9
-rw-r--r--fs/erofs/inode.c138
-rw-r--r--fs/erofs/internal.h94
-rw-r--r--fs/erofs/namei.c2
-rw-r--r--fs/erofs/super.c194
-rw-r--r--fs/erofs/sysfs.c73
-rw-r--r--fs/erofs/xattr.c70
-rw-r--r--fs/erofs/xattr.h3
-rw-r--r--fs/erofs/zdata.c252
-rw-r--r--fs/erofs/zmap.c368
-rw-r--r--fs/eventfd.c5
-rw-r--r--fs/eventpoll.c186
-rw-r--r--fs/exec.c164
-rw-r--r--fs/exfat/balloc.c22
-rw-r--r--fs/exfat/dir.c12
-rw-r--r--fs/exfat/exfat_fs.h4
-rw-r--r--fs/exfat/fatent.c52
-rw-r--r--fs/exfat/file.c55
-rw-r--r--fs/exfat/inode.c158
-rw-r--r--fs/exfat/namei.c20
-rw-r--r--fs/exfat/nls.c1
-rw-r--r--fs/exfat/super.c76
-rw-r--r--fs/exportfs/expfs.c11
-rw-r--r--fs/ext2/dir.c2
-rw-r--r--fs/ext2/ext2.h5
-rw-r--r--fs/ext2/file.c12
-rw-r--r--fs/ext2/inode.c23
-rw-r--r--fs/ext2/ioctl.c4
-rw-r--r--fs/ext2/namei.c9
-rw-r--r--fs/ext2/super.c596
-rw-r--r--fs/ext4/balloc.c6
-rw-r--r--fs/ext4/bitmap.c16
-rw-r--r--fs/ext4/block_validity.c5
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h259
-rw-r--r--fs/ext4/ext4_extents.h7
-rw-r--r--fs/ext4/ext4_jbd2.c15
-rw-r--r--fs/ext4/ext4_jbd2.h117
-rw-r--r--fs/ext4/extents.c756
-rw-r--r--fs/ext4/extents_status.c36
-rw-r--r--fs/ext4/fast_commit.c460
-rw-r--r--fs/ext4/file.c62
-rw-r--r--fs/ext4/fsmap.c23
-rw-r--r--fs/ext4/fsync.c12
-rw-r--r--fs/ext4/hash.c2
-rw-r--r--fs/ext4/ialloc.c19
-rw-r--r--fs/ext4/indirect.c4
-rw-r--r--fs/ext4/inline.c299
-rw-r--r--fs/ext4/inode.c1242
-rw-r--r--fs/ext4/ioctl.c33
-rw-r--r--fs/ext4/mballoc-test.c7
-rw-r--r--fs/ext4/mballoc.c924
-rw-r--r--fs/ext4/mballoc.h9
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/move_extent.c16
-rw-r--r--fs/ext4/namei.c210
-rw-r--r--fs/ext4/orphan.c20
-rw-r--r--fs/ext4/page-io.c83
-rw-r--r--fs/ext4/readpage.c28
-rw-r--r--fs/ext4/resize.c6
-rw-r--r--fs/ext4/super.c367
-rw-r--r--fs/ext4/sysfs.c4
-rw-r--r--fs/ext4/xattr.c61
-rw-r--r--fs/ext4/xattr.h10
-rw-r--r--fs/f2fs/acl.c33
-rw-r--r--fs/f2fs/acl.h10
-rw-r--r--fs/f2fs/checkpoint.c313
-rw-r--r--fs/f2fs/compress.c277
-rw-r--r--fs/f2fs/data.c579
-rw-r--r--fs/f2fs/debug.c24
-rw-r--r--fs/f2fs/dir.c243
-rw-r--r--fs/f2fs/extent_cache.c16
-rw-r--r--fs/f2fs/f2fs.h551
-rw-r--r--fs/f2fs/file.c452
-rw-r--r--fs/f2fs/gc.c225
-rw-r--r--fs/f2fs/gc.h5
-rw-r--r--fs/f2fs/inline.c318
-rw-r--r--fs/f2fs/inode.c196
-rw-r--r--fs/f2fs/namei.c165
-rw-r--r--fs/f2fs/node.c1090
-rw-r--r--fs/f2fs/node.h90
-rw-r--r--fs/f2fs/recovery.c236
-rw-r--r--fs/f2fs/segment.c326
-rw-r--r--fs/f2fs/segment.h198
-rw-r--r--fs/f2fs/shrinker.c99
-rw-r--r--fs/f2fs/super.c2388
-rw-r--r--fs/f2fs/sysfs.c228
-rw-r--r--fs/f2fs/xattr.c116
-rw-r--r--fs/f2fs/xattr.h24
-rw-r--r--fs/fat/fatent.c2
-rw-r--r--fs/fat/file.c2
-rw-r--r--fs/fat/inode.c18
-rw-r--r--fs/fat/misc.c6
-rw-r--r--fs/fat/namei_msdos.c10
-rw-r--r--fs/fat/namei_vfat.c12
-rw-r--r--fs/fhandle.c64
-rw-r--r--fs/file.c152
-rw-r--r--fs/file_attr.c498
-rw-r--r--fs/file_table.c108
-rw-r--r--fs/filesystems.c14
-rw-r--r--fs/fs-writeback.c39
-rw-r--r--fs/fs_context.c6
-rw-r--r--fs/fs_parser.c55
-rw-r--r--fs/fs_struct.c36
-rw-r--r--fs/fsopen.c2
-rw-r--r--fs/fuse/Kconfig1
-rw-r--r--fs/fuse/control.c30
-rw-r--r--fs/fuse/dax.c33
-rw-r--r--fs/fuse/dev.c365
-rw-r--r--fs/fuse/dev_uring.c79
-rw-r--r--fs/fuse/dev_uring_i.h18
-rw-r--r--fs/fuse/dir.c113
-rw-r--r--fs/fuse/file.c812
-rw-r--r--fs/fuse/fuse_dev_i.h11
-rw-r--r--fs/fuse/fuse_i.h68
-rw-r--r--fs/fuse/inode.c77
-rw-r--r--fs/fuse/ioctl.c8
-rw-r--r--fs/fuse/readdir.c40
-rw-r--r--fs/fuse/sysctl.c24
-rw-r--r--fs/fuse/virtio_fs.c15
-rw-r--r--fs/gfs2/Kconfig1
-rw-r--r--fs/gfs2/aops.c94
-rw-r--r--fs/gfs2/aops.h3
-rw-r--r--fs/gfs2/bmap.c58
-rw-r--r--fs/gfs2/bmap.h1
-rw-r--r--fs/gfs2/dir.c6
-rw-r--r--fs/gfs2/file.c15
-rw-r--r--fs/gfs2/glock.c158
-rw-r--r--fs/gfs2/glock.h10
-rw-r--r--fs/gfs2/glops.c15
-rw-r--r--fs/gfs2/incore.h14
-rw-r--r--fs/gfs2/inode.c105
-rw-r--r--fs/gfs2/inode.h11
-rw-r--r--fs/gfs2/lock_dlm.c20
-rw-r--r--fs/gfs2/log.c7
-rw-r--r--fs/gfs2/log.h11
-rw-r--r--fs/gfs2/lops.c95
-rw-r--r--fs/gfs2/lops.h2
-rw-r--r--fs/gfs2/main.c1
-rw-r--r--fs/gfs2/meta_io.c27
-rw-r--r--fs/gfs2/meta_io.h4
-rw-r--r--fs/gfs2/ops_fstype.c79
-rw-r--r--fs/gfs2/quota.c4
-rw-r--r--fs/gfs2/recovery.c28
-rw-r--r--fs/gfs2/recovery.h2
-rw-r--r--fs/gfs2/super.c148
-rw-r--r--fs/gfs2/sys.c5
-rw-r--r--fs/gfs2/trace_gfs2.h10
-rw-r--r--fs/gfs2/trans.c25
-rw-r--r--fs/gfs2/trans.h2
-rw-r--r--fs/gfs2/util.c33
-rw-r--r--fs/gfs2/xattr.c11
-rw-r--r--fs/gfs2/xattr.h2
-rw-r--r--fs/hfs/bfind.c3
-rw-r--r--fs/hfs/bnode.c99
-rw-r--r--fs/hfs/btree.c57
-rw-r--r--fs/hfs/dir.c10
-rw-r--r--fs/hfs/extent.c2
-rw-r--r--fs/hfs/hfs_fs.h3
-rw-r--r--fs/hfs/inode.c7
-rw-r--r--fs/hfs/super.c2
-rw-r--r--fs/hfsplus/bnode.c98
-rw-r--r--fs/hfsplus/dir.c6
-rw-r--r--fs/hfsplus/extents.c3
-rw-r--r--fs/hfsplus/hfsplus_fs.h10
-rw-r--r--fs/hfsplus/inode.c15
-rw-r--r--fs/hfsplus/super.c8
-rw-r--r--fs/hfsplus/unicode.c7
-rw-r--r--fs/hfsplus/wrapper.c46
-rw-r--r--fs/hfsplus/xattr.c6
-rw-r--r--fs/hostfs/hostfs.h2
-rw-r--r--fs/hostfs/hostfs_kern.c116
-rw-r--r--fs/hostfs/hostfs_user.c59
-rw-r--r--fs/hpfs/file.c20
-rw-r--r--fs/hpfs/namei.c10
-rw-r--r--fs/hpfs/super.c2
-rw-r--r--fs/hugetlbfs/inode.c62
-rw-r--r--fs/init.c7
-rw-r--r--fs/inode.c140
-rw-r--r--fs/internal.h23
-rw-r--r--fs/ioctl.c334
-rw-r--r--fs/iomap/Makefile7
-rw-r--r--fs/iomap/buffered-io.c887
-rw-r--r--fs/iomap/direct-io.c292
-rw-r--r--fs/iomap/fiemap.c24
-rw-r--r--fs/iomap/internal.h9
-rw-r--r--fs/iomap/ioend.c434
-rw-r--r--fs/iomap/iter.c98
-rw-r--r--fs/iomap/seek.c20
-rw-r--r--fs/iomap/swapfile.c10
-rw-r--r--fs/iomap/trace.c1
-rw-r--r--fs/iomap/trace.h39
-rw-r--r--fs/isofs/dir.c3
-rw-r--r--fs/isofs/export.c2
-rw-r--r--fs/isofs/inode.c18
-rw-r--r--fs/isofs/isofs.h4
-rw-r--r--fs/isofs/rock.c40
-rw-r--r--fs/isofs/rock.h6
-rw-r--r--fs/isofs/util.c49
-rw-r--r--fs/jbd2/checkpoint.c1
-rw-r--r--fs/jbd2/commit.c16
-rw-r--r--fs/jbd2/journal.c63
-rw-r--r--fs/jbd2/recovery.c90
-rw-r--r--fs/jbd2/revoke.c36
-rw-r--r--fs/jbd2/transaction.c26
-rw-r--r--fs/jffs2/dir.c18
-rw-r--r--fs/jffs2/erase.c4
-rw-r--r--fs/jffs2/file.c30
-rw-r--r--fs/jffs2/scan.c4
-rw-r--r--fs/jffs2/summary.c7
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/jfs/file.c5
-rw-r--r--fs/jfs/inode.c20
-rw-r--r--fs/jfs/ioctl.c4
-rw-r--r--fs/jfs/jfs_discard.c3
-rw-r--r--fs/jfs/jfs_dmap.c53
-rw-r--r--fs/jfs/jfs_dtree.c21
-rw-r--r--fs/jfs/jfs_extent.c10
-rw-r--r--fs/jfs/jfs_imap.c17
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/jfs_metapage.c114
-rw-r--r--fs/jfs/jfs_xtree.c142
-rw-r--r--fs/jfs/namei.c8
-rw-r--r--fs/jfs/super.c8
-rw-r--r--fs/jfs/xattr.c15
-rw-r--r--fs/kernfs/dir.c246
-rw-r--r--fs/kernfs/file.c9
-rw-r--r--fs/kernfs/inode.c70
-rw-r--r--fs/kernfs/kernfs-internal.h45
-rw-r--r--fs/kernfs/mount.c61
-rw-r--r--fs/kernfs/symlink.c30
-rw-r--r--fs/libfs.c157
-rw-r--r--fs/lockd/Makefile2
-rw-r--r--fs/lockd/netlink.c44
-rw-r--r--fs/lockd/netlink.h19
-rw-r--r--fs/lockd/netns.h3
-rw-r--r--fs/lockd/svc.c123
-rw-r--r--fs/locks.c4
-rw-r--r--fs/minix/dir.c2
-rw-r--r--fs/minix/file.c2
-rw-r--r--fs/minix/inode.c7
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/mnt_idmapping.c51
-rw-r--r--fs/mount.h72
-rw-r--r--fs/mpage.c62
-rw-r--r--fs/namei.c502
-rw-r--r--fs/namespace.c1832
-rw-r--r--fs/netfs/buffered_read.c75
-rw-r--r--fs/netfs/buffered_write.c43
-rw-r--r--fs/netfs/direct_read.c22
-rw-r--r--fs/netfs/direct_write.c28
-rw-r--r--fs/netfs/fscache_cache.c2
-rw-r--r--fs/netfs/fscache_cookie.c2
-rw-r--r--fs/netfs/fscache_io.c10
-rw-r--r--fs/netfs/internal.h70
-rw-r--r--fs/netfs/main.c11
-rw-r--r--fs/netfs/misc.c231
-rw-r--r--fs/netfs/objects.c48
-rw-r--r--fs/netfs/read_collect.c213
-rw-r--r--fs/netfs/read_pgpriv2.c9
-rw-r--r--fs/netfs/read_retry.c31
-rw-r--r--fs/netfs/read_single.c6
-rw-r--r--fs/netfs/rolling_buffer.c4
-rw-r--r--fs/netfs/stats.c9
-rw-r--r--fs/netfs/write_collect.c108
-rw-r--r--fs/netfs/write_issue.c43
-rw-r--r--fs/netfs/write_retry.c24
-rw-r--r--fs/nfs/Kconfig2
-rw-r--r--fs/nfs/blocklayout/blocklayout.c4
-rw-r--r--fs/nfs/blocklayout/dev.c5
-rw-r--r--fs/nfs/blocklayout/extent_tree.c104
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c53
-rw-r--r--fs/nfs/client.c67
-rw-r--r--fs/nfs/delegation.c242
-rw-r--r--fs/nfs/delegation.h4
-rw-r--r--fs/nfs/dir.c41
-rw-r--r--fs/nfs/direct.c25
-rw-r--r--fs/nfs/export.c14
-rw-r--r--fs/nfs/file.c24
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c6
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c174
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c14
-rw-r--r--fs/nfs/fs_context.c113
-rw-r--r--fs/nfs/fscache.c1
-rw-r--r--fs/nfs/inode.c139
-rw-r--r--fs/nfs/internal.h27
-rw-r--r--fs/nfs/localio.c58
-rw-r--r--fs/nfs/mount_clnt.c68
-rw-r--r--fs/nfs/namespace.c1
-rw-r--r--fs/nfs/netns.h6
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3client.c2
-rw-r--r--fs/nfs/nfs3proc.c31
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c201
-rw-r--r--fs/nfs/nfs42xdr.c150
-rw-r--r--fs/nfs/nfs4_fs.h8
-rw-r--r--fs/nfs/nfs4client.c190
-rw-r--r--fs/nfs/nfs4file.c37
-rw-r--r--fs/nfs/nfs4getroot.c14
-rw-r--r--fs/nfs/nfs4idmap.c14
-rw-r--r--fs/nfs/nfs4proc.c278
-rw-r--r--fs/nfs/nfs4session.h4
-rw-r--r--fs/nfs/nfs4state.c14
-rw-r--r--fs/nfs/nfs4trace.c2
-rw-r--r--fs/nfs/nfs4trace.h213
-rw-r--r--fs/nfs/nfs4xdr.c44
-rw-r--r--fs/nfs/nfstrace.h11
-rw-r--r--fs/nfs/pagelist.c9
-rw-r--r--fs/nfs/pnfs.c94
-rw-r--r--fs/nfs/pnfs.h4
-rw-r--r--fs/nfs/pnfs_nfs.c57
-rw-r--r--fs/nfs/proc.c12
-rw-r--r--fs/nfs/read.c3
-rw-r--r--fs/nfs/super.c25
-rw-r--r--fs/nfs/symlink.c20
-rw-r--r--fs/nfs/sysfs.c110
-rw-r--r--fs/nfs/unlink.c11
-rw-r--r--fs/nfs/write.c97
-rw-r--r--fs/nfs_common/nfsacl.c8
-rw-r--r--fs/nfs_common/nfslocalio.c113
-rw-r--r--fs/nfsd/Kconfig15
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/blocklayout.c20
-rw-r--r--fs/nfsd/blocklayoutxdr.c111
-rw-r--r--fs/nfsd/blocklayoutxdr.h8
-rw-r--r--fs/nfsd/debugfs.c47
-rw-r--r--fs/nfsd/export.c11
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/filecache.c167
-rw-r--r--fs/nfsd/filecache.h10
-rw-r--r--fs/nfsd/localio.c71
-rw-r--r--fs/nfsd/nfs2acl.c2
-rw-r--r--fs/nfsd/nfs3acl.c2
-rw-r--r--fs/nfsd/nfs3proc.c74
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4callback.c286
-rw-r--r--fs/nfsd/nfs4layouts.c11
-rw-r--r--fs/nfsd/nfs4proc.c62
-rw-r--r--fs/nfsd/nfs4recover.c130
-rw-r--r--fs/nfsd/nfs4state.c276
-rw-r--r--fs/nfsd/nfs4xdr.c29
-rw-r--r--fs/nfsd/nfsctl.c148
-rw-r--r--fs/nfsd/nfsd.h40
-rw-r--r--fs/nfsd/nfsfh.c21
-rw-r--r--fs/nfsd/nfsfh.h33
-rw-r--r--fs/nfsd/nfsproc.c55
-rw-r--r--fs/nfsd/nfssvc.c8
-rw-r--r--fs/nfsd/nfsxdr.c4
-rw-r--r--fs/nfsd/state.h44
-rw-r--r--fs/nfsd/stats.c4
-rw-r--r--fs/nfsd/stats.h2
-rw-r--r--fs/nfsd/trace.h351
-rw-r--r--fs/nfsd/vfs.c281
-rw-r--r--fs/nfsd/vfs.h10
-rw-r--r--fs/nfsd/xdr4.h5
-rw-r--r--fs/nfsd/xdr4cb.h5
-rw-r--r--fs/nilfs2/btree.c4
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/direct.c3
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/inode.c23
-rw-r--r--fs/nilfs2/ioctl.c4
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/namei.c8
-rw-r--r--fs/nilfs2/nilfs.h4
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/nilfs2/segment.c20
-rw-r--r--fs/nilfs2/segment.h1
-rw-r--r--fs/nilfs2/the_nilfs.c3
-rw-r--r--fs/notify/dnotify/dnotify.c8
-rw-r--r--fs/notify/fanotify/fanotify.c49
-rw-r--r--fs/notify/fanotify/fanotify.h27
-rw-r--r--fs/notify/fanotify/fanotify_user.c144
-rw-r--r--fs/notify/fdinfo.c5
-rw-r--r--fs/notify/fsnotify.c132
-rw-r--r--fs/notify/fsnotify.h11
-rw-r--r--fs/notify/mark.c14
-rw-r--r--fs/nsfs.c33
-rw-r--r--fs/ntfs3/attrib.c71
-rw-r--r--fs/ntfs3/dir.c6
-rw-r--r--fs/ntfs3/file.c174
-rw-r--r--fs/ntfs3/frecord.c160
-rw-r--r--fs/ntfs3/fslog.c32
-rw-r--r--fs/ntfs3/fsntfs.c34
-rw-r--r--fs/ntfs3/index.c12
-rw-r--r--fs/ntfs3/inode.c143
-rw-r--r--fs/ntfs3/namei.c36
-rw-r--r--fs/ntfs3/ntfs.h5
-rw-r--r--fs/ntfs3/ntfs_fs.h38
-rw-r--r--fs/ntfs3/super.c92
-rw-r--r--fs/ntfs3/xattr.c22
-rw-r--r--fs/ocfs2/alloc.c9
-rw-r--r--fs/ocfs2/aops.c28
-rw-r--r--fs/ocfs2/cluster/tcp.c7
-rw-r--r--fs/ocfs2/dir.c8
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c10
-rw-r--r--fs/ocfs2/file.c6
-rw-r--r--fs/ocfs2/filecheck.c2
-rw-r--r--fs/ocfs2/inode.c70
-rw-r--r--fs/ocfs2/ioctl.c4
-rw-r--r--fs/ocfs2/ioctl.h4
-rw-r--r--fs/ocfs2/journal.c82
-rw-r--r--fs/ocfs2/journal.h1
-rw-r--r--fs/ocfs2/mmap.c5
-rw-r--r--fs/ocfs2/mmap.h2
-rw-r--r--fs/ocfs2/move_extents.c19
-rw-r--r--fs/ocfs2/namei.c21
-rw-r--r--fs/ocfs2/ocfs2.h17
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/quota_local.c11
-rw-r--r--fs/ocfs2/stack_user.c15
-rw-r--r--fs/ocfs2/stackglue.c3
-rw-r--r--fs/ocfs2/suballoc.c38
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c7
-rw-r--r--fs/omfs/dir.c6
-rw-r--r--fs/omfs/file.c9
-rw-r--r--fs/omfs/inode.c176
-rw-r--r--fs/open.c79
-rw-r--r--fs/orangefs/file.c14
-rw-r--r--fs/orangefs/inode.c178
-rw-r--r--fs/orangefs/namei.c8
-rw-r--r--fs/orangefs/orangefs-bufmap.c25
-rw-r--r--fs/orangefs/orangefs-bufmap.h3
-rw-r--r--fs/orangefs/orangefs-debug.h43
-rw-r--r--fs/orangefs/orangefs-debugfs.c55
-rw-r--r--fs/orangefs/orangefs-kernel.h8
-rw-r--r--fs/orangefs/orangefs-mod.c3
-rw-r--r--fs/orangefs/orangefs-sysfs.c28
-rw-r--r--fs/orangefs/super.c191
-rw-r--r--fs/overlayfs/copy_up.c60
-rw-r--r--fs/overlayfs/dir.c306
-rw-r--r--fs/overlayfs/export.c6
-rw-r--r--fs/overlayfs/file.c4
-rw-r--r--fs/overlayfs/inode.c17
-rw-r--r--fs/overlayfs/namei.c149
-rw-r--r--fs/overlayfs/overlayfs.h76
-rw-r--r--fs/overlayfs/ovl_entry.h3
-rw-r--r--fs/overlayfs/params.c77
-rw-r--r--fs/overlayfs/readdir.c69
-rw-r--r--fs/overlayfs/super.c80
-rw-r--r--fs/overlayfs/util.c58
-rw-r--r--fs/pidfs.c485
-rw-r--r--fs/pipe.c230
-rw-r--r--fs/pnode.c704
-rw-r--r--fs/pnode.h31
-rw-r--r--fs/proc/base.c86
-rw-r--r--fs/proc/fd.c11
-rw-r--r--fs/proc/generic.c22
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/internal.h73
-rw-r--r--fs/proc/kcore.c12
-rw-r--r--fs/proc/meminfo.c8
-rw-r--r--fs/proc/namespaces.c3
-rw-r--r--fs/proc/page.c202
-rw-r--r--fs/proc/proc_sysctl.c25
-rw-r--r--fs/proc/root.c10
-rw-r--r--fs/proc/task_mmu.c275
-rw-r--r--fs/proc/task_nommu.c4
-rw-r--r--fs/proc/vmcore.c26
-rw-r--r--fs/proc_namespace.c12
-rw-r--r--fs/pstore/inode.c116
-rw-r--r--fs/pstore/internal.h4
-rw-r--r--fs/pstore/platform.c13
-rw-r--r--fs/quota/dquot.c2
-rw-r--r--fs/ramfs/file-mmu.c2
-rw-r--r--fs/ramfs/file-nommu.c12
-rw-r--r--fs/ramfs/inode.c7
-rw-r--r--fs/read_write.c21
-rw-r--r--fs/readdir.c47
-rw-r--r--fs/resctrl/Kconfig39
-rw-r--r--fs/resctrl/Makefile6
-rw-r--r--fs/resctrl/ctrlmondata.c666
-rw-r--r--fs/resctrl/internal.h426
-rw-r--r--fs/resctrl/monitor.c931
-rw-r--r--fs/resctrl/monitor_trace.h33
-rw-r--r--fs/resctrl/pseudo_lock.c1101
-rw-r--r--fs/resctrl/rdtgroup.c4357
-rw-r--r--fs/romfs/mmap-nommu.c6
-rw-r--r--fs/select.c8
-rw-r--r--fs/signalfd.c7
-rw-r--r--fs/smb/client/Makefile2
-rw-r--r--fs/smb/client/asn1.c2
-rw-r--r--fs/smb/client/cached_dir.c61
-rw-r--r--fs/smb/client/cached_dir.h14
-rw-r--r--fs/smb/client/cifs_debug.c100
-rw-r--r--fs/smb/client/cifs_fs_sb.h1
-rw-r--r--fs/smb/client/cifs_ioctl.h2
-rw-r--r--fs/smb/client/cifs_spnego.c49
-rw-r--r--fs/smb/client/cifsacl.c80
-rw-r--r--fs/smb/client/cifsencrypt.c99
-rw-r--r--fs/smb/client/cifsfs.c36
-rw-r--r--fs/smb/client/cifsfs.h13
-rw-r--r--fs/smb/client/cifsglob.h177
-rw-r--r--fs/smb/client/cifspdu.h115
-rw-r--r--fs/smb/client/cifsproto.h41
-rw-r--r--fs/smb/client/cifssmb.c338
-rw-r--r--fs/smb/client/cifstransport.c565
-rw-r--r--fs/smb/client/compress.c71
-rw-r--r--fs/smb/client/connect.c582
-rw-r--r--fs/smb/client/dfs.c30
-rw-r--r--fs/smb/client/dfs.h7
-rw-r--r--fs/smb/client/dfs_cache.c27
-rw-r--r--fs/smb/client/dir.c25
-rw-r--r--fs/smb/client/file.c94
-rw-r--r--fs/smb/client/fs_context.c202
-rw-r--r--fs/smb/client/fs_context.h91
-rw-r--r--fs/smb/client/inode.c112
-rw-r--r--fs/smb/client/ioctl.c2
-rw-r--r--fs/smb/client/link.c63
-rw-r--r--fs/smb/client/misc.c16
-rw-r--r--fs/smb/client/namespace.c4
-rw-r--r--fs/smb/client/netmisc.c10
-rw-r--r--fs/smb/client/nterr.c1
-rw-r--r--fs/smb/client/nterr.h1
-rw-r--r--fs/smb/client/readdir.c40
-rw-r--r--fs/smb/client/reparse.c582
-rw-r--r--fs/smb/client/reparse.h39
-rw-r--r--fs/smb/client/sess.c110
-rw-r--r--fs/smb/client/smb1ops.c410
-rw-r--r--fs/smb/client/smb2file.c73
-rw-r--r--fs/smb/client/smb2glob.h1
-rw-r--r--fs/smb/client/smb2inode.c95
-rw-r--r--fs/smb/client/smb2maperror.c4
-rw-r--r--fs/smb/client/smb2misc.c9
-rw-r--r--fs/smb/client/smb2ops.c215
-rw-r--r--fs/smb/client/smb2pdu.c303
-rw-r--r--fs/smb/client/smb2proto.h13
-rw-r--r--fs/smb/client/smb2transport.c5
-rw-r--r--fs/smb/client/smbdirect.c996
-rw-r--r--fs/smb/client/smbdirect.h161
-rw-r--r--fs/smb/client/trace.h24
-rw-r--r--fs/smb/client/transport.c624
-rw-r--r--fs/smb/client/xattr.c49
-rw-r--r--fs/smb/common/smb2pdu.h53
-rw-r--r--fs/smb/common/smbacl.h3
-rw-r--r--fs/smb/common/smbdirect/smbdirect.h37
-rw-r--r--fs/smb/common/smbdirect/smbdirect_pdu.h55
-rw-r--r--fs/smb/common/smbdirect/smbdirect_socket.h161
-rw-r--r--fs/smb/common/smbfsctl.h3
-rw-r--r--fs/smb/server/Kconfig3
-rw-r--r--fs/smb/server/auth.c54
-rw-r--r--fs/smb/server/auth.h2
-rw-r--r--fs/smb/server/connection.c27
-rw-r--r--fs/smb/server/connection.h21
-rw-r--r--fs/smb/server/crypto_ctx.c8
-rw-r--r--fs/smb/server/crypto_ctx.h4
-rw-r--r--fs/smb/server/ksmbd_work.c3
-rw-r--r--fs/smb/server/ksmbd_work.h1
-rw-r--r--fs/smb/server/mgmt/user_session.c57
-rw-r--r--fs/smb/server/mgmt/user_session.h3
-rw-r--r--fs/smb/server/oplock.c101
-rw-r--r--fs/smb/server/oplock.h2
-rw-r--r--fs/smb/server/server.c15
-rw-r--r--fs/smb/server/smb2pdu.c342
-rw-r--r--fs/smb/server/smb2pdu.h3
-rw-r--r--fs/smb/server/smb_common.c2
-rw-r--r--fs/smb/server/smb_common.h2
-rw-r--r--fs/smb/server/smbacl.c71
-rw-r--r--fs/smb/server/smbacl.h2
-rw-r--r--fs/smb/server/transport_ipc.c8
-rw-r--r--fs/smb/server/transport_rdma.c158
-rw-r--r--fs/smb/server/transport_rdma.h4
-rw-r--r--fs/smb/server/transport_tcp.c58
-rw-r--r--fs/smb/server/transport_tcp.h1
-rw-r--r--fs/smb/server/vfs.c332
-rw-r--r--fs/smb/server/vfs.h7
-rw-r--r--fs/smb/server/vfs_cache.c41
-rw-r--r--fs/smb/server/vfs_cache.h1
-rw-r--r--fs/splice.c65
-rw-r--r--fs/squashfs/Kconfig21
-rw-r--r--fs/squashfs/block.c55
-rw-r--r--fs/squashfs/cache.c2
-rw-r--r--fs/squashfs/file.c7
-rw-r--r--fs/squashfs/super.c9
-rw-r--r--fs/stack.c4
-rw-r--r--fs/stat.c77
-rw-r--r--fs/super.c391
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysfs/file.c34
-rw-r--r--fs/sysfs/group.c6
-rw-r--r--fs/sysv/Kconfig38
-rw-r--r--fs/sysv/Makefile9
-rw-r--r--fs/sysv/balloc.c240
-rw-r--r--fs/sysv/dir.c378
-rw-r--r--fs/sysv/file.c59
-rw-r--r--fs/sysv/ialloc.c235
-rw-r--r--fs/sysv/inode.c354
-rw-r--r--fs/sysv/itree.c511
-rw-r--r--fs/sysv/namei.c280
-rw-r--r--fs/sysv/super.c595
-rw-r--r--fs/sysv/sysv.h245
-rw-r--r--fs/timerfd.c11
-rw-r--r--fs/tracefs/inode.c38
-rw-r--r--fs/ubifs/compress.c243
-rw-r--r--fs/ubifs/crypto.c2
-rw-r--r--fs/ubifs/dir.c10
-rw-r--r--fs/ubifs/file.c102
-rw-r--r--fs/ubifs/io.c3
-rw-r--r--fs/ubifs/ioctl.c4
-rw-r--r--fs/ubifs/journal.c13
-rw-r--r--fs/ubifs/ubifs.h30
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c40
-rw-r--r--fs/udf/namei.c12
-rw-r--r--fs/udf/super.c13
-rw-r--r--fs/udf/truncate.c2
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/file.c2
-rw-r--r--fs/ufs/inode.c16
-rw-r--r--fs/ufs/namei.c8
-rw-r--r--fs/ufs/super.c307
-rw-r--r--fs/ufs/ufs.h9
-rw-r--r--fs/unicode/Kconfig5
-rw-r--r--fs/unicode/Makefile2
-rw-r--r--fs/unicode/tests/.kunitconfig3
-rw-r--r--fs/unicode/tests/utf8_kunit.c (renamed from fs/unicode/utf8-selftest.c)153
-rw-r--r--fs/unicode/utf8-norm.c2
-rw-r--r--fs/userfaultfd.c176
-rw-r--r--fs/vboxsf/dir.c8
-rw-r--r--fs/vboxsf/file.c60
-rw-r--r--fs/vboxsf/super.c4
-rw-r--r--fs/verity/Kconfig10
-rw-r--r--fs/verity/enable.c9
-rw-r--r--fs/verity/fsverity_private.h24
-rw-r--r--fs/verity/hash_algs.c194
-rw-r--r--fs/verity/measure.c1
-rw-r--r--fs/verity/open.c37
-rw-r--r--fs/verity/read_metadata.c1
-rw-r--r--fs/verity/verify.c8
-rw-r--r--fs/xattr.c31
-rw-r--r--fs/xfs/Kconfig2
-rw-r--r--fs/xfs/Makefile7
-rw-r--r--fs/xfs/libxfs/xfs_ag.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c49
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c52
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c334
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h13
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c32
-rw-r--r--fs/xfs/libxfs/xfs_btree.c33
-rw-r--r--fs/xfs/libxfs/xfs_btree.h41
-rw-r--r--fs/xfs/libxfs/xfs_format.h22
-rw-r--r--fs/xfs/libxfs/xfs_fs.h14
-rw-r--r--fs/xfs/libxfs/xfs_group.c17
-rw-r--r--fs/xfs/libxfs/xfs_group.h31
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c35
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c24
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c23
-rw-r--r--fs/xfs/libxfs/xfs_inode_util.c1
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h7
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h4
-rw-r--r--fs/xfs/libxfs/xfs_log_rlimit.c4
-rw-r--r--fs/xfs/libxfs/xfs_metafile.c167
-rw-r--r--fs/xfs/libxfs/xfs_metafile.h6
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h6
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c4
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c67
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c11
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.c39
-rw-r--r--fs/xfs/libxfs/xfs_rtgroup.h50
-rw-r--r--fs/xfs/libxfs/xfs_rtrefcount_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.c86
-rw-r--r--fs/xfs/libxfs/xfs_rtrmap_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_sb.c81
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c343
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h25
-rw-r--r--fs/xfs/libxfs/xfs_types.h28
-rw-r--r--fs/xfs/libxfs/xfs_zones.c186
-rw-r--r--fs/xfs/libxfs/xfs_zones.h35
-rw-r--r--fs/xfs/scrub/agheader.c2
-rw-r--r--fs/xfs/scrub/bmap.c4
-rw-r--r--fs/xfs/scrub/btree.c2
-rw-r--r--fs/xfs/scrub/common.c7
-rw-r--r--fs/xfs/scrub/common.h7
-rw-r--r--fs/xfs/scrub/dir_repair.c8
-rw-r--r--fs/xfs/scrub/fscounters.c29
-rw-r--r--fs/xfs/scrub/fscounters_repair.c12
-rw-r--r--fs/xfs/scrub/inode.c7
-rw-r--r--fs/xfs/scrub/inode_repair.c19
-rw-r--r--fs/xfs/scrub/metapath.c4
-rw-r--r--fs/xfs/scrub/newbt.c2
-rw-r--r--fs/xfs/scrub/nlinks.c8
-rw-r--r--fs/xfs/scrub/nlinks_repair.c4
-rw-r--r--fs/xfs/scrub/orphanage.c16
-rw-r--r--fs/xfs/scrub/parent_repair.c12
-rw-r--r--fs/xfs/scrub/quotacheck.c4
-rw-r--r--fs/xfs/scrub/rcbag_btree.c38
-rw-r--r--fs/xfs/scrub/reap.c9
-rw-r--r--fs/xfs/scrub/repair.c73
-rw-r--r--fs/xfs/scrub/repair.h15
-rw-r--r--fs/xfs/scrub/rmap_repair.c14
-rw-r--r--fs/xfs/scrub/rtbitmap.c11
-rw-r--r--fs/xfs/scrub/rtrefcount_repair.c34
-rw-r--r--fs/xfs/scrub/rtrmap_repair.c43
-rw-r--r--fs/xfs/scrub/scrub.c21
-rw-r--r--fs/xfs/scrub/trace.h4
-rw-r--r--fs/xfs/xfs_aops.c389
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_attr_item.c148
-rw-r--r--fs/xfs/xfs_attr_item.h8
-rw-r--r--fs/xfs/xfs_bio_io.c30
-rw-r--r--fs/xfs/xfs_bmap_item.c28
-rw-r--r--fs/xfs/xfs_bmap_item.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c32
-rw-r--r--fs/xfs/xfs_bmap_util.h12
-rw-r--r--fs/xfs/xfs_buf.c842
-rw-r--r--fs/xfs/xfs_buf.h48
-rw-r--r--fs/xfs/xfs_buf_item.c434
-rw-r--r--fs/xfs/xfs_buf_item.h8
-rw-r--r--fs/xfs/xfs_buf_item_recover.c46
-rw-r--r--fs/xfs/xfs_buf_mem.c45
-rw-r--r--fs/xfs/xfs_buf_mem.h6
-rw-r--r--fs/xfs/xfs_discard.c61
-rw-r--r--fs/xfs/xfs_dquot.c7
-rw-r--r--fs/xfs/xfs_dquot_item_recover.c20
-rw-r--r--fs/xfs/xfs_exchmaps_item.c8
-rw-r--r--fs/xfs/xfs_exchrange.c71
-rw-r--r--fs/xfs/xfs_extent_busy.c2
-rw-r--r--fs/xfs/xfs_extent_busy.h8
-rw-r--r--fs/xfs/xfs_extfree_item.c104
-rw-r--r--fs/xfs/xfs_extfree_item.h3
-rw-r--r--fs/xfs/xfs_file.c481
-rw-r--r--fs/xfs/xfs_filestream.c15
-rw-r--r--fs/xfs/xfs_fsmap.c115
-rw-r--r--fs/xfs/xfs_fsops.c67
-rw-r--r--fs/xfs/xfs_fsops.h3
-rw-r--r--fs/xfs/xfs_globals.c2
-rw-r--r--fs/xfs/xfs_icache.c21
-rw-r--r--fs/xfs/xfs_icreate_item.c2
-rw-r--r--fs/xfs/xfs_inode.c53
-rw-r--r--fs/xfs/xfs_inode.h49
-rw-r--r--fs/xfs/xfs_inode_item.c18
-rw-r--r--fs/xfs/xfs_inode_item.h4
-rw-r--r--fs/xfs/xfs_inode_item_recover.c27
-rw-r--r--fs/xfs/xfs_ioctl.c35
-rw-r--r--fs/xfs/xfs_ioctl.h4
-rw-r--r--fs/xfs/xfs_iomap.c736
-rw-r--r--fs/xfs/xfs_iomap.h9
-rw-r--r--fs/xfs/xfs_iops.c112
-rw-r--r--fs/xfs/xfs_iops.h3
-rw-r--r--fs/xfs/xfs_itable.c26
-rw-r--r--fs/xfs/xfs_itable.h10
-rw-r--r--fs/xfs/xfs_iwalk.c11
-rw-r--r--fs/xfs/xfs_log.c54
-rw-r--r--fs/xfs/xfs_log.h16
-rw-r--r--fs/xfs/xfs_log_cil.c79
-rw-r--r--fs/xfs/xfs_log_priv.h17
-rw-r--r--fs/xfs/xfs_log_recover.c18
-rw-r--r--fs/xfs/xfs_message.c20
-rw-r--r--fs/xfs/xfs_message.h5
-rw-r--r--fs/xfs/xfs_mount.c391
-rw-r--r--fs/xfs/xfs_mount.h176
-rw-r--r--fs/xfs/xfs_mru_cache.c32
-rw-r--r--fs/xfs/xfs_notify_failure.c15
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c99
-rw-r--r--fs/xfs/xfs_qm_bhv.c55
-rw-r--r--fs/xfs/xfs_refcount_item.c44
-rw-r--r--fs/xfs/xfs_refcount_item.h3
-rw-r--r--fs/xfs/xfs_reflink.c167
-rw-r--r--fs/xfs/xfs_reflink.h6
-rw-r--r--fs/xfs/xfs_rmap_item.c44
-rw-r--r--fs/xfs/xfs_rmap_item.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c259
-rw-r--r--fs/xfs/xfs_rtalloc.h5
-rw-r--r--fs/xfs/xfs_super.c319
-rw-r--r--fs/xfs/xfs_sysctl.h2
-rw-r--r--fs/xfs/xfs_sysfs.c115
-rw-r--r--fs/xfs/xfs_sysfs.h5
-rw-r--r--fs/xfs/xfs_trace.c2
-rw-r--r--fs/xfs/xfs_trace.h414
-rw-r--r--fs/xfs/xfs_trans.c211
-rw-r--r--fs/xfs/xfs_trans.h4
-rw-r--r--fs/xfs/xfs_trans_ail.c39
-rw-r--r--fs/xfs/xfs_trans_priv.h28
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/xfs/xfs_zone_alloc.c1349
-rw-r--r--fs/xfs/xfs_zone_alloc.h70
-rw-r--r--fs/xfs/xfs_zone_gc.c1178
-rw-r--r--fs/xfs/xfs_zone_info.c105
-rw-r--r--fs/xfs/xfs_zone_priv.h119
-rw-r--r--fs/xfs/xfs_zone_space_resv.c258
-rw-r--r--fs/zonefs/file.c52
-rw-r--r--fs/zonefs/super.c35
1220 files changed, 78251 insertions, 47954 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 32619d146cbc..862164181bac 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -59,7 +59,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq)
len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
if (len > 0)
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
- netfs_write_subrequest_terminated(subreq, len ?: err, false);
+ netfs_write_subrequest_terminated(subreq, len ?: err);
}
/**
@@ -77,7 +77,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
- if (subreq->rreq->origin != NETFS_DIO_READ)
+ if (subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+ subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (pos + total >= i_size_read(rreq->inode))
__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
@@ -164,4 +165,5 @@ const struct address_space_operations v9fs_addr_operations = {
.invalidate_folio = netfs_invalidate_folio,
.direct_IO = noop_direct_IO,
.writepages = netfs_writepages,
+ .migrate_folio = filemap_migrate_folio,
};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 5061f192eafd..04795508a795 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -127,7 +127,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_delete = always_delete_dentry,
.d_release = v9fs_dentry_release,
.d_unalias_trylock = v9fs_dentry_unalias_trylock,
.d_unalias_unlock = v9fs_dentry_unalias_unlock,
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 348cc90bf9c5..eb0b083da269 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -454,9 +454,10 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
}
static int
-v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+v9fs_file_mmap_prepare(struct vm_area_desc *desc)
{
int retval;
+ struct file *filp = desc->file;
struct inode *inode = file_inode(filp);
struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
@@ -464,12 +465,12 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
if (!(v9ses->cache & CACHE_WRITEBACK)) {
p9_debug(P9_DEBUG_CACHE, "(read-only mmap mode)");
- return generic_file_readonly_mmap(filp, vma);
+ return generic_file_readonly_mmap_prepare(desc);
}
- retval = generic_file_mmap(filp, vma);
+ retval = generic_file_mmap_prepare(desc);
if (!retval)
- vma->vm_ops = &v9fs_mmap_file_vm_ops;
+ desc->vm_ops = &v9fs_mmap_file_vm_ops;
return retval;
}
@@ -516,7 +517,7 @@ const struct file_operations v9fs_file_operations = {
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync,
@@ -531,7 +532,7 @@ const struct file_operations v9fs_file_operations_dotl = {
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
.flock = v9fs_file_flock_dotl,
- .mmap = v9fs_file_mmap,
+ .mmap_prepare = v9fs_file_mmap_prepare,
.splice_read = v9fs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = v9fs_file_fsync_dotl,
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3e68521f4e2f..399d455d50d6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -669,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir,
*
*/
-static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int err;
u32 perm;
@@ -692,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (fid)
p9_fid_put(fid);
-
- return err;
+ return ERR_PTR(err);
}
/**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 143ac03b7425..5b5fda617b80 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -350,9 +350,9 @@ out:
*
*/
-static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t omode)
+static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t omode)
{
int err;
struct v9fs_session_info *v9ses;
@@ -407,8 +407,8 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
err);
goto error;
}
- v9fs_fid_add(dentry, &fid);
v9fs_set_create_acl(inode, fid, dacl, pacl);
+ v9fs_fid_add(dentry, &fid);
d_instantiate(dentry, inode);
err = 0;
inc_nlink(dir);
@@ -417,7 +417,7 @@ error:
p9_fid_put(fid);
v9fs_put_acl(dacl, pacl);
p9_fid_put(dfid);
- return err;
+ return ERR_PTR(err);
}
static int
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 489db161abc9..795c6388744c 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -134,10 +134,12 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
if (retval)
goto release_sb;
- if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- sb->s_d_op = &v9fs_cached_dentry_operations;
- else
- sb->s_d_op = &v9fs_dentry_operations;
+ if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
+ set_default_d_op(sb, &v9fs_cached_dentry_operations);
+ } else {
+ set_default_d_op(sb, &v9fs_dentry_operations);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
+ }
inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d420e3c475..c654a3642897 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -59,7 +59,7 @@ endif # BLOCK
config FS_DAX
bool "File system based Direct Access (DAX) support"
depends on MMU
- depends on ZONE_DEVICE || FS_DAX_LIMITED
+ depends on ZONE_DEVICE
select FS_IOMAP
select DAX
help
@@ -95,13 +95,6 @@ config FS_DAX_PMD
depends on ZONE_DEVICE
depends on TRANSPARENT_HUGEPAGE
-# Selected by DAX drivers that do not expect filesystem DAX to support
-# get_user_pages() of DAX mappings. I.e. "limited" indicates no support
-# for fork() of processes with MAP_SHARED mappings or support for
-# direct-I/O to a DAX mapping.
-config FS_DAX_LIMITED
- bool
-
# Posix ACL utility routines
#
# Note: Posix ACLs can be implemented without these helpers. Never use
@@ -256,7 +249,7 @@ config ARCH_SUPPORTS_HUGETLBFS
menuconfig HUGETLBFS
bool "HugeTLB file system support"
- depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
+ depends on ARCH_SUPPORTS_HUGETLBFS
depends on (SYSFS || SYSCTL)
select MEMFD_CREATE
select PADATA if SMP
@@ -286,6 +279,7 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
+ select SPARSEMEM_VMEMMAP_PREINIT if ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
config HUGETLB_PMD_PAGE_TABLE_SHARING
def_bool HUGETLB_PAGE
@@ -334,9 +328,9 @@ source "fs/omfs/Kconfig"
source "fs/hpfs/Kconfig"
source "fs/qnx4/Kconfig"
source "fs/qnx6/Kconfig"
+source "fs/resctrl/Kconfig"
source "fs/romfs/Kconfig"
source "fs/pstore/Kconfig"
-source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/erofs/Kconfig"
source "fs/vboxsf/Kconfig"
@@ -368,6 +362,7 @@ config GRACE_PERIOD
config LOCKD
tristate
depends on FILE_LOCKING
+ select CRC32
select GRACE_PERIOD
config LOCKD_V4
diff --git a/fs/Makefile b/fs/Makefile
index 15df0a923d3a..334654f9584b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,8 @@ obj-y := open.o read_write.o file_table.o super.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
- kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
+ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \
+ file_attr.o
obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o
@@ -87,7 +88,6 @@ obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
obj-y += unicode/
-obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_SMBFS) += smb/
obj-$(CONFIG_HPFS_FS) += hpfs/
obj-$(CONFIG_NTFS3_FS) += ntfs3/
@@ -129,3 +129,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_VBOXSF_FS) += vboxsf/
obj-$(CONFIG_ZONEFS_FS) += zonefs/
obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o
+obj-$(CONFIG_RESCTRL_FS) += resctrl/
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index ee80718aaeec..cd13165fd904 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -25,7 +25,7 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.write_iter = generic_file_write_iter,
.splice_read = filemap_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 21527189e430..6830f8bc8d4e 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -53,13 +53,14 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, inode->i_size);
}
-static int adfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int adfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
adfs_get_block,
&ADFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 017c48a80203..fdccdbbfc213 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -397,7 +397,7 @@ static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (asb->s_ftsuffix)
asb->s_namelen += 4;
- sb->s_d_op = &adfs_dentry_operations;
+ set_default_d_op(sb, &adfs_dentry_operations);
root = adfs_iget(sb, &root_obj);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e8c2c4535cb3..ac4e9a02910b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -168,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi
extern int affs_unlink(struct inode *dir, struct dentry *dentry);
extern int affs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool);
-extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode);
extern int affs_rmdir(struct inode *dir, struct dentry *dentry);
extern int affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a5a861dd5223..765c3443663e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -415,13 +415,14 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return ret;
}
-static int affs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int affs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
affs_get_block,
&AFFS_I(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -430,14 +431,15 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int affs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
+static int affs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned int len, unsigned int copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
/* Clear Archived bit on file writes, as AmigaOS would do */
if (AFFS_I(inode)->i_protect & FIBF_ARCHIVED) {
@@ -596,7 +598,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
BUG_ON(tmp > bsize);
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
- AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+ AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
affs_fix_checksum(sb, bh);
bh->b_state &= ~(1UL << BH_New);
@@ -645,7 +647,8 @@ static int affs_read_folio_ofs(struct file *file, struct folio *folio)
return err;
}
-static int affs_write_begin_ofs(struct file *file, struct address_space *mapping,
+static int affs_write_begin_ofs(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -684,9 +687,10 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return err;
}
-static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int affs_write_end_ofs(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
@@ -724,7 +728,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
tmp = min(bsize - boff, to - from);
BUG_ON(boff + tmp > bsize || tmp > bsize);
memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
- be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
+ AFFS_DATA_HEAD(bh)->size = cpu_to_be32(
+ max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size)));
affs_fix_checksum(sb, bh);
mark_buffer_dirty_inode(bh, inode);
written += tmp;
@@ -746,7 +751,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
if (buffer_new(bh)) {
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
- AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+ AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize);
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
@@ -780,7 +785,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
if (buffer_new(bh)) {
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
- AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+ AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
AFFS_DATA_HEAD(bh)->next = 0;
bh->b_state &= ~(1UL << BH_New);
@@ -998,7 +1003,7 @@ const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = affs_file_open,
.release = affs_file_release,
.fsync = affs_file_fsync,
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 8c154490a2d6..f883be50db12 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir,
return 0;
}
-int
+struct dentry *
affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
{
@@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode = affs_new_inode(dir);
if (!inode)
- return -ENOSPC;
+ return ERR_PTR(-ENOSPC);
inode->i_mode = S_IFDIR | mode;
affs_mode_to_prot(inode);
@@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
clear_nlink(inode);
mark_inode_dirty(inode);
iput(inode);
- return error;
+ return ERR_PTR(error);
}
- return 0;
+ return NULL;
}
int
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 2fa40337776d..44f8aa883100 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -500,9 +500,9 @@ got_root:
return PTR_ERR(root_inode);
if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL))
- sb->s_d_op = &affs_intl_dentry_operations;
+ set_default_d_op(sb, &affs_intl_dentry_operations);
else
- sb->s_d_op = &affs_dentry_operations;
+ set_default_d_op(sb, &affs_dentry_operations);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index fc8ba9142f2f..682bd8ec2c10 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -5,6 +5,7 @@ config AFS_FS
select AF_RXRPC
select DNS_RESOLVER
select NETFS_SUPPORT
+ select CRYPTO_KRB5
help
If you say Y here, you will get an experimental Andrew File System
driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 5efd7e13b304..b49b8fe682f3 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-y := \
addr_prefs.o \
callback.o \
cell.o \
+ cm_security.o \
cmservice.o \
dir.o \
dir_edit.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 6d42f85c6be5..e941da5b6dd9 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -362,3 +362,53 @@ int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
alist->nr_addrs++;
return 0;
}
+
+/*
+ * Set the app data on the rxrpc peers an address list points to
+ */
+void afs_set_peer_appdata(struct afs_server *server,
+ struct afs_addr_list *old_alist,
+ struct afs_addr_list *new_alist)
+{
+ unsigned long data = (unsigned long)server;
+ int n = 0, o = 0;
+
+ if (!old_alist) {
+ /* New server. Just set all. */
+ for (; n < new_alist->nr_addrs; n++)
+ rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+ return;
+ }
+ if (!new_alist) {
+ /* Dead server. Just remove all. */
+ for (; o < old_alist->nr_addrs; o++)
+ rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+ return;
+ }
+
+ /* Walk through the two lists simultaneously, setting new peers and
+ * clearing old ones. The two lists are ordered by pointer to peer
+ * record.
+ */
+ while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) {
+ struct rxrpc_peer *pn = new_alist->addrs[n].peer;
+ struct rxrpc_peer *po = old_alist->addrs[o].peer;
+
+ if (pn == po)
+ continue;
+ if (pn < po) {
+ rxrpc_kernel_set_peer_data(pn, data);
+ n++;
+ } else {
+ rxrpc_kernel_set_peer_data(po, 0);
+ o++;
+ }
+ }
+
+ if (n < new_alist->nr_addrs)
+ for (; n < new_alist->nr_addrs; n++)
+ rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+ if (o < old_alist->nr_addrs)
+ for (; o < old_alist->nr_addrs; o++)
+ rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+}
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
index c0384201b8fe..133736412c3d 100644
--- a/fs/afs/addr_prefs.c
+++ b/fs/afs/addr_prefs.c
@@ -48,7 +48,7 @@ static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
strv[count++] = p;
/* Skip over word */
- while (!isspace(*p))
+ while (!isspace(*p) && *p)
p++;
if (!*p)
break;
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index cee42646736c..f31359922e98 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
static atomic_t cell_debug_id;
-static void afs_queue_cell_manager(struct afs_net *);
-static void afs_manage_cell_work(struct work_struct *);
+static void afs_cell_timer(struct timer_list *timer);
+static void afs_destroy_cell_work(struct work_struct *work);
+static void afs_manage_cell_work(struct work_struct *work);
static void afs_dec_cells_outstanding(struct afs_net *net)
{
@@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net)
wake_up_var(&net->cells_outstanding);
}
-/*
- * Set the cell timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
+static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state)
{
- if (net->live) {
- atomic_inc(&net->cells_outstanding);
- if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
- afs_dec_cells_outstanding(net);
- } else {
- afs_queue_cell_manager(net);
- }
+ smp_store_release(&cell->state, state); /* Commit cell changes before state */
+ smp_wmb(); /* Set cell state before task state */
+ wake_up_var(&cell->state);
}
/*
@@ -64,7 +57,8 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net,
return ERR_PTR(-ENAMETOOLONG);
if (!name) {
- cell = net->ws_cell;
+ cell = rcu_dereference_protected(net->ws_cell,
+ lockdep_is_held(&net->cells_lock));
if (!cell)
return ERR_PTR(-EDESTADDRREQ);
goto found;
@@ -115,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
const char *name, unsigned int namelen,
const char *addresses)
{
- struct afs_vlserver_list *vllist;
+ struct afs_vlserver_list *vllist = NULL;
struct afs_cell *cell;
int i, ret;
@@ -162,13 +156,15 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->net = net;
refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
+ INIT_WORK(&cell->destroyer, afs_destroy_cell_work);
INIT_WORK(&cell->manager, afs_manage_cell_work);
+ timer_setup(&cell->management_timer, afs_cell_timer, 0);
init_rwsem(&cell->vs_lock);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
- seqlock_init(&cell->fs_lock);
+ init_rwsem(&cell->fs_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
@@ -181,6 +177,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
VL_SERVICE, AFS_VL_PORT);
if (IS_ERR(vllist)) {
ret = PTR_ERR(vllist);
+ vllist = NULL;
goto parse_failed;
}
@@ -203,7 +200,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
cell->dns_status = vllist->status;
smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
atomic_inc(&net->cells_outstanding);
+ ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell,
+ 2, INT_MAX / 2, GFP_KERNEL);
+ if (ret < 0)
+ goto error;
+ cell->dynroot_ino = ret;
cell->debug_id = atomic_inc_return(&cell_debug_id);
+
trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
_leave(" = %p", cell);
@@ -213,6 +216,7 @@ parse_failed:
if (ret == -EINVAL)
printk(KERN_ERR "kAFS: bad VL server IP address\n");
error:
+ afs_put_vlserverlist(cell->net, vllist);
kfree(cell->name - 1);
kfree(cell);
_leave(" = %d", ret);
@@ -226,6 +230,7 @@ error:
* @namesz: The strlen of the cell name.
* @vllist: A colon/comma separated list of numeric IP addresses or NULL.
* @excl: T if an error should be given if the cell name already exists.
+ * @trace: The reason to be logged if the lookup is successful.
*
* Look up a cell record by name and query the DNS for VL server addresses if
* needed. Note that that actual DNS query is punted off to the manager thread
@@ -234,7 +239,8 @@ error:
*/
struct afs_cell *afs_lookup_cell(struct afs_net *net,
const char *name, unsigned int namesz,
- const char *vllist, bool excl)
+ const char *vllist, bool excl,
+ enum afs_cell_trace trace)
{
struct afs_cell *cell, *candidate, *cursor;
struct rb_node *parent, **pp;
@@ -244,7 +250,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
_enter("%s,%s", name, vllist);
if (!excl) {
- cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
+ cell = afs_find_cell(net, name, namesz, trace);
if (!IS_ERR(cell))
goto wait_for_cell;
}
@@ -287,26 +293,28 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
cell = candidate;
candidate = NULL;
- atomic_set(&cell->active, 2);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
+ afs_use_cell(cell, trace);
rb_link_node_rcu(&cell->net_node, parent, pp);
rb_insert_color(&cell->net_node, &net->cells);
up_write(&net->cells_lock);
- afs_queue_cell(cell, afs_cell_trace_get_queue_new);
+ afs_queue_cell(cell, afs_cell_trace_queue_new);
wait_for_cell:
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
- afs_cell_trace_wait);
_debug("wait_for_cell");
- wait_var_event(&cell->state,
- ({
- state = smp_load_acquire(&cell->state); /* vs error */
- state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
- }));
+ state = smp_load_acquire(&cell->state); /* vs error */
+ if (state != AFS_CELL_ACTIVE &&
+ state != AFS_CELL_DEAD) {
+ afs_see_cell(cell, afs_cell_trace_wait);
+ wait_var_event(&cell->state,
+ ({
+ state = smp_load_acquire(&cell->state); /* vs error */
+ state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
+ }));
+ }
/* Check the state obtained from the wait check. */
- if (state == AFS_CELL_REMOVED) {
+ if (state == AFS_CELL_DEAD) {
ret = cell->error;
goto error;
}
@@ -320,7 +328,7 @@ cell_already_exists:
if (excl) {
ret = -EEXIST;
} else {
- afs_use_cell(cursor, afs_cell_trace_use_lookup);
+ afs_use_cell(cursor, trace);
ret = 0;
}
up_write(&net->cells_lock);
@@ -330,7 +338,7 @@ cell_already_exists:
goto wait_for_cell;
goto error_noput;
error:
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error);
error_noput:
_leave(" = %d [error]", ret);
return ERR_PTR(ret);
@@ -375,8 +383,9 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
if (cp && cp < rootcell + len)
return -EINVAL;
- /* allocate a cell record for the root cell */
- new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
+ /* allocate a cell record for the root/workstation cell */
+ new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+ afs_cell_trace_use_lookup_ws);
if (IS_ERR(new_root)) {
_leave(" = %ld", PTR_ERR(new_root));
return PTR_ERR(new_root);
@@ -387,12 +396,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
/* install the new cell */
down_write(&net->cells_lock);
- afs_see_cell(new_root, afs_cell_trace_see_ws);
- old_root = net->ws_cell;
- net->ws_cell = new_root;
+ old_root = rcu_replace_pointer(net->ws_cell, new_root,
+ lockdep_is_held(&net->cells_lock));
up_write(&net->cells_lock);
- afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
+ afs_unuse_cell(old_root, afs_cell_trace_unuse_ws);
_leave(" = 0");
return 0;
}
@@ -510,8 +518,9 @@ static void afs_cell_destroy(struct rcu_head *rcu)
trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
- afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
+ afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias);
key_put(cell->anonymous_key);
+ idr_remove(&net->cells_dyn_ino, cell->dynroot_ino);
kfree(cell->name - 1);
kfree(cell);
@@ -519,30 +528,14 @@ static void afs_cell_destroy(struct rcu_head *rcu)
_leave(" [destroyed]");
}
-/*
- * Queue the cell manager.
- */
-static void afs_queue_cell_manager(struct afs_net *net)
-{
- int outstanding = atomic_inc_return(&net->cells_outstanding);
-
- _enter("%d", outstanding);
-
- if (!queue_work(afs_wq, &net->cells_manager))
- afs_dec_cells_outstanding(net);
-}
-
-/*
- * Cell management timer. We have an increment on cells_outstanding that we
- * need to pass along to the work item.
- */
-void afs_cells_timer(struct timer_list *timer)
+static void afs_destroy_cell_work(struct work_struct *work)
{
- struct afs_net *net = container_of(timer, struct afs_net, cells_timer);
+ struct afs_cell *cell = container_of(work, struct afs_cell, destroyer);
- _enter("");
- if (!queue_work(afs_wq, &net->cells_manager))
- afs_dec_cells_outstanding(net);
+ afs_see_cell(cell, afs_cell_trace_destroy);
+ timer_delete_sync(&cell->management_timer);
+ cancel_work_sync(&cell->manager);
+ call_rcu(&cell->rcu, afs_cell_destroy);
}
/*
@@ -574,7 +567,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
if (zero) {
a = atomic_read(&cell->active);
WARN(a != 0, "Cell active count %u > 0\n", a);
- call_rcu(&cell->rcu, afs_cell_destroy);
+ WARN_ON(!queue_work(afs_wq, &cell->destroyer));
}
}
}
@@ -586,10 +579,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
int r, a;
- r = refcount_read(&cell->ref);
- WARN_ON(r == 0);
+ __refcount_inc(&cell->ref, &r);
a = atomic_inc_return(&cell->active);
- trace_afs_cell(cell->debug_id, r, a, reason);
+ trace_afs_cell(cell->debug_id, r + 1, a, reason);
return cell;
}
@@ -597,10 +589,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
* Record a cell becoming less active. When the active counter reaches 1, it
* is scheduled for destruction, but may get reactivated.
*/
-void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
unsigned int debug_id;
time64_t now, expire_delay;
+ bool zero;
int r, a;
if (!cell)
@@ -615,13 +608,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
expire_delay = afs_cell_gc_delay;
debug_id = cell->debug_id;
- r = refcount_read(&cell->ref);
a = atomic_dec_return(&cell->active);
- trace_afs_cell(debug_id, r, a, reason);
- WARN_ON(a == 0);
- if (a == 1)
+ if (!a)
/* 'cell' may now be garbage collected. */
- afs_set_cell_timer(net, expire_delay);
+ afs_set_cell_timer(cell, expire_delay);
+
+ zero = __refcount_dec_and_test(&cell->ref, &r);
+ trace_afs_cell(debug_id, r - 1, a, reason);
+ if (zero)
+ WARN_ON(!queue_work(afs_wq, &cell->destroyer));
}
/*
@@ -641,9 +636,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
*/
void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
{
- afs_get_cell(cell, reason);
- if (!queue_work(afs_wq, &cell->manager))
- afs_put_cell(cell, afs_cell_trace_put_queue_fail);
+ queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Cell-specific management timer.
+ */
+static void afs_cell_timer(struct timer_list *timer)
+{
+ struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer);
+
+ afs_see_cell(cell, afs_cell_trace_see_mgmt_timer);
+ if (refcount_read(&cell->ref) > 0 && cell->net->live)
+ queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Set/reduce the cell timer.
+ */
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
+{
+ timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ);
}
/*
@@ -705,7 +718,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
if (cell->proc_link.next)
cell->proc_link.next->pprev = &cell->proc_link.next;
- afs_dynroot_mkdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
return 0;
}
@@ -722,241 +734,162 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
mutex_lock(&net->proc_cells_lock);
if (!hlist_unhashed(&cell->proc_link))
hlist_del_rcu(&cell->proc_link);
- afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
_leave("");
}
+static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage)
+{
+ const struct afs_vlserver_list *vllist;
+ time64_t expire_at = cell->last_inactive;
+ time64_t now = ktime_get_real_seconds();
+
+ if (atomic_read(&cell->active))
+ return false;
+ if (!cell->net->live)
+ return true;
+
+ vllist = rcu_dereference_protected(cell->vl_servers, true);
+ if (vllist && vllist->nr_servers > 0)
+ expire_at += afs_cell_gc_delay;
+
+ if (expire_at <= now)
+ return true;
+ if (expire_at < *_next_manage)
+ *_next_manage = expire_at;
+ return false;
+}
+
/*
* Manage a cell record, initialising and destroying it, maintaining its DNS
* records.
*/
-static void afs_manage_cell(struct afs_cell *cell)
+static bool afs_manage_cell(struct afs_cell *cell)
{
struct afs_net *net = cell->net;
- int ret, active;
+ time64_t next_manage = TIME64_MAX;
+ int ret;
_enter("%s", cell->name);
-again:
_debug("state %u", cell->state);
switch (cell->state) {
- case AFS_CELL_INACTIVE:
- case AFS_CELL_FAILED:
- down_write(&net->cells_lock);
- active = 1;
- if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
- rb_erase(&cell->net_node, &net->cells);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
- afs_cell_trace_unuse_delete);
- smp_store_release(&cell->state, AFS_CELL_REMOVED);
- }
- up_write(&net->cells_lock);
- if (cell->state == AFS_CELL_REMOVED) {
- wake_up_var(&cell->state);
- goto final_destruction;
- }
- if (cell->state == AFS_CELL_FAILED)
- goto done;
- smp_store_release(&cell->state, AFS_CELL_UNSET);
- wake_up_var(&cell->state);
- goto again;
-
- case AFS_CELL_UNSET:
- smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
- wake_up_var(&cell->state);
- goto again;
-
- case AFS_CELL_ACTIVATING:
- ret = afs_activate_cell(net, cell);
- if (ret < 0)
- goto activation_failed;
+ case AFS_CELL_SETTING_UP:
+ goto set_up_cell;
+ case AFS_CELL_ACTIVE:
+ goto cell_is_active;
+ case AFS_CELL_REMOVING:
+ WARN_ON_ONCE(1);
+ return false;
+ case AFS_CELL_DEAD:
+ return false;
+ default:
+ _debug("bad state %u", cell->state);
+ WARN_ON_ONCE(1); /* Unhandled state */
+ return false;
+ }
- smp_store_release(&cell->state, AFS_CELL_ACTIVE);
- wake_up_var(&cell->state);
- goto again;
+set_up_cell:
+ ret = afs_activate_cell(net, cell);
+ if (ret < 0) {
+ cell->error = ret;
+ goto remove_cell;
+ }
- case AFS_CELL_ACTIVE:
- if (atomic_read(&cell->active) > 1) {
- if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
- ret = afs_update_cell(cell);
- if (ret < 0)
- cell->error = ret;
- }
- goto done;
- }
- smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
- wake_up_var(&cell->state);
- goto again;
+ afs_set_cell_state(cell, AFS_CELL_ACTIVE);
+
+cell_is_active:
+ if (afs_has_cell_expired(cell, &next_manage))
+ goto remove_cell;
- case AFS_CELL_DEACTIVATING:
- if (atomic_read(&cell->active) > 1)
- goto reverse_deactivation;
- afs_deactivate_cell(net, cell);
- smp_store_release(&cell->state, AFS_CELL_INACTIVE);
- wake_up_var(&cell->state);
- goto again;
+ if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+ ret = afs_update_cell(cell);
+ if (ret < 0)
+ cell->error = ret;
+ }
- case AFS_CELL_REMOVED:
- goto done;
+ if (next_manage < TIME64_MAX && cell->net->live) {
+ time64_t now = ktime_get_real_seconds();
- default:
- break;
+ if (next_manage - now <= 0)
+ afs_queue_cell(cell, afs_cell_trace_queue_again);
+ else
+ afs_set_cell_timer(cell, next_manage - now);
}
- _debug("bad state %u", cell->state);
- BUG(); /* Unhandled state */
+ _leave(" [done %u]", cell->state);
+ return false;
-activation_failed:
- cell->error = ret;
- afs_deactivate_cell(net, cell);
+remove_cell:
+ down_write(&net->cells_lock);
- smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
- wake_up_var(&cell->state);
- goto again;
+ if (atomic_read(&cell->active)) {
+ up_write(&net->cells_lock);
+ goto cell_is_active;
+ }
-reverse_deactivation:
- smp_store_release(&cell->state, AFS_CELL_ACTIVE);
- wake_up_var(&cell->state);
- _leave(" [deact->act]");
- return;
+ /* Make sure that the expiring server records are going to see the fact
+ * that the cell is caput.
+ */
+ afs_set_cell_state(cell, AFS_CELL_REMOVING);
-done:
- _leave(" [done %u]", cell->state);
- return;
+ afs_deactivate_cell(net, cell);
+ afs_purge_servers(cell);
+
+ rb_erase(&cell->net_node, &net->cells);
+ afs_see_cell(cell, afs_cell_trace_unuse_delete);
+ up_write(&net->cells_lock);
-final_destruction:
/* The root volume is pinning the cell */
afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
cell->root_volume = NULL;
- afs_put_cell(cell, afs_cell_trace_put_destroy);
+
+ afs_set_cell_state(cell, AFS_CELL_DEAD);
+ return true;
}
static void afs_manage_cell_work(struct work_struct *work)
{
struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+ bool final_put;
- afs_manage_cell(cell);
- afs_put_cell(cell, afs_cell_trace_put_queue_work);
+ afs_see_cell(cell, afs_cell_trace_manage);
+ final_put = afs_manage_cell(cell);
+ afs_see_cell(cell, afs_cell_trace_managed);
+ if (final_put)
+ afs_put_cell(cell, afs_cell_trace_put_final);
}
/*
- * Manage the records of cells known to a network namespace. This includes
- * updating the DNS records and garbage collecting unused cells that were
- * automatically added.
- *
- * Note that constructed cell records may only be removed from net->cells by
- * this work item, so it is safe for this work item to stash a cursor pointing
- * into the tree and then return to caller (provided it skips cells that are
- * still under construction).
- *
- * Note also that we were given an increment on net->cells_outstanding by
- * whoever queued us that we need to deal with before returning.
+ * Purge in-memory cell database.
*/
-void afs_manage_cells(struct work_struct *work)
+void afs_cell_purge(struct afs_net *net)
{
- struct afs_net *net = container_of(work, struct afs_net, cells_manager);
+ struct afs_cell *ws;
struct rb_node *cursor;
- time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
- bool purging = !net->live;
_enter("");
- /* Trawl the cell database looking for cells that have expired from
- * lack of use and cells whose DNS results have expired and dispatch
- * their managers.
- */
- down_read(&net->cells_lock);
+ down_write(&net->cells_lock);
+ ws = rcu_replace_pointer(net->ws_cell, NULL,
+ lockdep_is_held(&net->cells_lock));
+ up_write(&net->cells_lock);
+ afs_unuse_cell(ws, afs_cell_trace_unuse_ws);
+ _debug("kick cells");
+ down_read(&net->cells_lock);
for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
- struct afs_cell *cell =
- rb_entry(cursor, struct afs_cell, net_node);
- unsigned active;
- bool sched_cell = false;
-
- active = atomic_read(&cell->active);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
- active, afs_cell_trace_manage);
-
- ASSERTCMP(active, >=, 1);
-
- if (purging) {
- if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
- active = atomic_dec_return(&cell->active);
- trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
- active, afs_cell_trace_unuse_pin);
- }
- }
+ struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node);
- if (active == 1) {
- struct afs_vlserver_list *vllist;
- time64_t expire_at = cell->last_inactive;
-
- read_lock(&cell->vl_servers_lock);
- vllist = rcu_dereference_protected(
- cell->vl_servers,
- lockdep_is_held(&cell->vl_servers_lock));
- if (vllist->nr_servers > 0)
- expire_at += afs_cell_gc_delay;
- read_unlock(&cell->vl_servers_lock);
- if (purging || expire_at <= now)
- sched_cell = true;
- else if (expire_at < next_manage)
- next_manage = expire_at;
- }
+ afs_see_cell(cell, afs_cell_trace_purge);
- if (!purging) {
- if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
- sched_cell = true;
- }
+ if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+ afs_unuse_cell(cell, afs_cell_trace_unuse_pin);
- if (sched_cell)
- afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
+ afs_queue_cell(cell, afs_cell_trace_queue_purge);
}
-
up_read(&net->cells_lock);
- /* Update the timer on the way out. We have to pass an increment on
- * cells_outstanding in the namespace that we are in to the timer or
- * the work scheduler.
- */
- if (!purging && next_manage < TIME64_MAX) {
- now = ktime_get_real_seconds();
-
- if (next_manage - now <= 0) {
- if (queue_work(afs_wq, &net->cells_manager))
- atomic_inc(&net->cells_outstanding);
- } else {
- afs_set_cell_timer(net, next_manage - now);
- }
- }
-
- afs_dec_cells_outstanding(net);
- _leave(" [%d]", atomic_read(&net->cells_outstanding));
-}
-
-/*
- * Purge in-memory cell database.
- */
-void afs_cell_purge(struct afs_net *net)
-{
- struct afs_cell *ws;
-
- _enter("");
-
- down_write(&net->cells_lock);
- ws = net->ws_cell;
- net->ws_cell = NULL;
- up_write(&net->cells_lock);
- afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
-
- _debug("del timer");
- if (del_timer_sync(&net->cells_timer))
- atomic_dec(&net->cells_outstanding);
-
- _debug("kick mgr");
- afs_queue_cell_manager(net);
-
_debug("wait");
wait_var_event(&net->cells_outstanding,
!atomic_read(&net->cells_outstanding));
diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c
new file mode 100644
index 000000000000..edcbd249d202
--- /dev/null
+++ b/fs/afs/cm_security.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache manager security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include <crypto/krb5.h>
+#include "internal.h"
+#include "afs_cm.h"
+#include "afs_fs.h"
+#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
+
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32)))
+
+#ifdef CONFIG_RXGK
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+ struct afs_server *server);
+#endif
+
+/*
+ * Respond to an RxGK challenge, adding appdata.
+ */
+static int afs_respond_to_challenge(struct sk_buff *challenge)
+{
+#ifdef CONFIG_RXGK
+ struct krb5_buffer appdata = {};
+ struct afs_server *server;
+#endif
+ struct rxrpc_peer *peer;
+ unsigned long peer_data;
+ u16 service_id;
+ u8 security_index;
+
+ rxrpc_kernel_query_challenge(challenge, &peer, &peer_data,
+ &service_id, &security_index);
+
+ _enter("%u,%u", service_id, security_index);
+
+ switch (service_id) {
+ /* We don't send CM_SERVICE RPCs, so don't expect a challenge
+ * therefrom.
+ */
+ case FS_SERVICE:
+ case VL_SERVICE:
+ case YFS_FS_SERVICE:
+ case YFS_VL_SERVICE:
+ break;
+ default:
+ pr_warn("Can't respond to unknown challenge %u:%u",
+ service_id, security_index);
+ return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+ afs_abort_unsupported_sec_class);
+ }
+
+ switch (security_index) {
+#ifdef CONFIG_RXKAD
+ case RXRPC_SECURITY_RXKAD:
+ return rxkad_kernel_respond_to_challenge(challenge);
+#endif
+
+#ifdef CONFIG_RXGK
+ case RXRPC_SECURITY_RXGK:
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+
+ case RXRPC_SECURITY_YFS_RXGK:
+ switch (service_id) {
+ case FS_SERVICE:
+ case YFS_FS_SERVICE:
+ server = (struct afs_server *)peer_data;
+ if (!server->cm_rxgk_appdata.data) {
+ mutex_lock(&server->cm_token_lock);
+ if (!server->cm_rxgk_appdata.data)
+ afs_create_yfs_cm_token(challenge, server);
+ mutex_unlock(&server->cm_token_lock);
+ }
+ if (server->cm_rxgk_appdata.data)
+ appdata = server->cm_rxgk_appdata;
+ break;
+ }
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+#endif
+
+ default:
+ return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+ afs_abort_unsupported_sec_class);
+ }
+}
+
+/*
+ * Process the OOB message queue, processing challenge packets.
+ */
+void afs_process_oob_queue(struct work_struct *work)
+{
+ struct afs_net *net = container_of(work, struct afs_net, rx_oob_work);
+ struct sk_buff *oob;
+ enum rxrpc_oob_type type;
+
+ while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) {
+ switch (type) {
+ case RXRPC_OOB_CHALLENGE:
+ afs_respond_to_challenge(oob);
+ break;
+ }
+ rxrpc_kernel_free_oob(oob);
+ }
+}
+
+#ifdef CONFIG_RXGK
+/*
+ * Create a securities keyring for the cache manager and attach a key to it for
+ * the RxGK tokens we want to use to secure the callback connection back from
+ * the fileserver.
+ */
+int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+ const struct krb5_enctype *krb5;
+ struct key *ring;
+ key_ref_t key;
+ char K0[32], *desc;
+ int ret;
+
+ ring = keyring_alloc("kafs",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+ KEY_POS_SEARCH | KEY_POS_WRITE |
+ KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH,
+ KEY_ALLOC_NOT_IN_QUOTA,
+ NULL, NULL);
+ if (IS_ERR(ring))
+ return PTR_ERR(ring);
+
+ ret = rxrpc_sock_set_security_keyring(socket->sk, ring);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOPKG;
+ krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+ if (!krb5)
+ goto out;
+
+ if (WARN_ON_ONCE(krb5->key_len > sizeof(K0)))
+ goto out;
+
+ ret = -ENOMEM;
+ desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u",
+ YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype);
+ if (!desc)
+ goto out;
+
+ wait_for_random_bytes();
+ get_random_bytes(K0, krb5->key_len);
+
+ key = key_create(make_key_ref(ring, true),
+ "rxrpc_s", desc,
+ K0, krb5->key_len,
+ KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ kfree(desc);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+
+ net->fs_cm_token_key = key_ref_to_ptr(key);
+ ret = 0;
+out:
+ key_put(ring);
+ return ret;
+}
+
+/*
+ * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver.
+ */
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+ struct afs_server *server)
+{
+ const struct krb5_enctype *conn_krb5, *token_krb5;
+ const struct krb5_buffer *token_key;
+ struct crypto_aead *aead;
+ struct scatterlist sg;
+ struct afs_net *net = server->cell->net;
+ const struct key *key = net->fs_cm_token_key;
+ size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset;
+ __be32 caps[1] = {
+ [0] = htonl(AFS_CAP_ERROR_TRANSLATION),
+ };
+ __be32 *xdr;
+ void *appdata, *K0, *encbase;
+ u32 enctype;
+ int ret;
+
+ if (!key)
+ return -ENOKEY;
+
+ /* Assume that the fileserver is happy to use the same encoding type as
+ * we were told to use by the token obtained by the user.
+ */
+ enctype = rxgk_kernel_query_challenge(challenge);
+
+ conn_krb5 = crypto_krb5_find_enctype(enctype);
+ if (!conn_krb5)
+ return -ENOPKG;
+ token_krb5 = key->payload.data[0];
+ token_key = (const struct krb5_buffer *)&key->payload.data[2];
+
+ /* struct rxgk_key {
+ * afs_uint32 enctype;
+ * opaque key<>;
+ * };
+ */
+ keysize = 4 + xdr_len_object(conn_krb5->key_len);
+
+ /* struct RXGK_AuthName {
+ * afs_int32 kind;
+ * opaque data<AUTHDATAMAX>;
+ * opaque display<AUTHPRINTABLEMAX>;
+ * };
+ */
+ uuidsize = sizeof(server->uuid);
+ authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0);
+
+ /* struct RXGK_Token {
+ * rxgk_key K0;
+ * RXGK_Level level;
+ * rxgkTime starttime;
+ * afs_int32 lifetime;
+ * afs_int32 bytelife;
+ * rxgkTime expirationtime;
+ * struct RXGK_AuthName identities<>;
+ * };
+ */
+ toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize);
+
+ offset = 0;
+ encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset);
+
+ /* struct RXGK_TokenContainer {
+ * afs_int32 kvno;
+ * afs_int32 enctype;
+ * opaque encrypted_token<>;
+ * };
+ */
+ contsize = 4 + 4 + xdr_len_object(encsize);
+
+ /* struct YFSAppData {
+ * opr_uuid initiatorUuid;
+ * opr_uuid acceptorUuid;
+ * Capabilities caps;
+ * afs_int32 enctype;
+ * opaque callbackKey<>;
+ * opaque callbackToken<>;
+ * };
+ */
+ adatasize = 16 + 16 +
+ xdr_len_object(sizeof(caps)) +
+ 4 +
+ xdr_len_object(conn_krb5->key_len) +
+ xdr_len_object(contsize);
+
+ ret = -ENOMEM;
+ appdata = kzalloc(adatasize, GFP_KERNEL);
+ if (!appdata)
+ goto out;
+ xdr = appdata;
+
+ memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */
+ xdr += 16 / 4;
+ memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */
+ xdr += 16 / 4;
+ *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */
+ memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */
+ xdr += ARRAY_SIZE(caps);
+ *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */
+
+ *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */
+ K0 = xdr;
+ get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */
+ xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+ *xdr++ = htonl(contsize); /* appdata.callbackToken.len */
+ *xdr++ = htonl(1); /* cont.kvno */
+ *xdr++ = htonl(token_krb5->etype); /* cont.enctype */
+ *xdr++ = htonl(encsize); /* cont.encrypted_token.len */
+
+ encbase = xdr;
+ xdr += offset / 4;
+ *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */
+ *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */
+ memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */
+ xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+ *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */
+ *xdr++ = htonl(0); /* token.starttime */
+ *xdr++ = htonl(0); /* " */
+ *xdr++ = htonl(0); /* token.lifetime */
+ *xdr++ = htonl(0); /* token.bytelife */
+ *xdr++ = htonl(0); /* token.expirationtime */
+ *xdr++ = htonl(0); /* " */
+ *xdr++ = htonl(1); /* token.identities.count */
+ *xdr++ = htonl(0); /* token.identities[0].kind */
+ *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */
+ memcpy(xdr, &server->uuid, uuidsize);
+ xdr += xdr_round_up(uuidsize) / 4;
+ *xdr++ = htonl(0); /* token.identities[0].display.len */
+
+ xdr = encbase + xdr_round_up(encsize);
+
+ if ((unsigned long)xdr - (unsigned long)appdata != adatasize)
+ pr_err("Appdata size incorrect %lx != %zx\n",
+ (unsigned long)xdr - (unsigned long)appdata, adatasize);
+
+ aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN,
+ GFP_KERNEL);
+ if (IS_ERR(aead)) {
+ ret = PTR_ERR(aead);
+ goto out_token;
+ }
+
+ sg_init_one(&sg, encbase, encsize);
+ ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false);
+ if (ret < 0)
+ goto out_aead;
+
+ server->cm_rxgk_appdata.len = adatasize;
+ server->cm_rxgk_appdata.data = appdata;
+ appdata = NULL;
+
+out_aead:
+ crypto_free_aead(aead);
+out_token:
+ kfree(appdata);
+out:
+ return ret;
+}
+#endif /* CONFIG_RXGK */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 99a3f20bc786..1a906805a9e3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call)
}
/*
- * Find the server record by peer address and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_peer(struct afs_call *call)
-{
- struct sockaddr_rxrpc srx;
- struct afs_server *server;
- struct rxrpc_peer *peer;
-
- peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
-
- server = afs_find_server(call->net, peer);
- if (!server) {
- trace_afs_cm_no_server(call, &srx);
- return 0;
- }
-
- call->server = server;
- return 0;
-}
-
-/*
- * Find the server record by server UUID and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_uuid(struct afs_call *call,
- struct afs_uuid *uuid)
-{
- struct afs_server *server;
-
- rcu_read_lock();
- server = afs_find_server_by_uuid(call->net, call->request);
- rcu_read_unlock();
- if (!server) {
- trace_afs_cm_no_server_u(call, call->request);
- return 0;
- }
-
- call->server = server;
- return 0;
-}
-
-/*
* Clean up a cache manager call.
*/
static void afs_cm_destructor(struct afs_call *call)
@@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
-
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
*/
static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
{
- int ret;
-
_enter("");
afs_extract_discard(call, 0);
- ret = afs_extract_data(call, false);
- if (ret < 0)
- return ret;
-
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_peer(call);
+ return afs_extract_data(call, false);
}
/*
@@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
__be32 *b;
int ret;
- _enter("");
-
_enter("{%u}", call->unmarshall);
switch (call->unmarshall) {
@@ -421,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- /* we'll need the file server record as that tells us which set of
- * vnodes to operate upon */
- return afs_find_cm_server_by_uuid(call, call->request);
+ if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) {
+ pr_notice("Callback UUID does not match fileserver UUID\n");
+ trace_afs_cm_no_server_u(call, call->request);
+ return 0;
+ }
+
+ return 0;
}
/*
@@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
/*
@@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
return afs_io_error(call, afs_io_error_cm_reply);
-
- /* We'll need the file server record as that tells us which set of
- * vnodes to operate upon.
- */
- return afs_find_cm_server_by_peer(call);
+ return 0;
}
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 02cbf38e1a77..bfb69e066672 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -33,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl
loff_t fpos, u64 ino, unsigned dtype);
static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl);
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode);
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode);
static int afs_rmdir(struct inode *dir, struct dentry *dentry);
static int afs_unlink(struct inode *dir, struct dentry *dentry);
static int afs_link(struct dentry *from, struct inode *dir,
@@ -943,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry)
}
strcpy(p, name);
- ret = lookup_one_len(buf, dentry->d_parent, len);
+ ret = lookup_noperm(&QSTR(buf), dentry->d_parent);
if (IS_ERR(ret) || d_is_positive(ret))
goto out_s;
dput(ret);
@@ -1004,9 +1004,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
afs_stat_v(dvnode, n_lookup);
inode = afs_do_lookup(dir, dentry);
if (inode == ERR_PTR(-ENOENT))
- inode = afs_try_auto_mntpt(dentry, dir);
-
- if (!IS_ERR_OR_NULL(inode))
+ inode = NULL;
+ else if (!IS_ERR_OR_NULL(inode))
fid = AFS_FS_I(inode)->fid;
_debug("splice %p", dentry->d_inode);
@@ -1315,8 +1314,8 @@ static const struct afs_operation_ops afs_mkdir_operation = {
/*
* create a directory on an AFS filesystem
*/
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct afs_operation *op;
struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1328,7 +1327,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op = afs_alloc_operation(NULL, dvnode->volume);
if (IS_ERR(op)) {
d_drop(dentry);
- return PTR_ERR(op);
+ return ERR_CAST(op);
}
fscache_use_cookie(afs_vnode_cache(dvnode), true);
@@ -1344,7 +1343,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
op->ops = &afs_mkdir_operation;
ret = afs_do_sync_operation(op);
afs_dir_unuse_cookie(dvnode, ret);
- return ret;
+ return ERR_PTR(ret);
}
/*
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index a1e581946b93..0b80eb93fa40 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -113,16 +113,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
sdentry = NULL;
do {
- int slen;
-
dput(sdentry);
sillycounter++;
/* Create a silly name. Note that the ".__afs" prefix is
* understood by the salvager and must not be changed.
*/
- slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
- sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
+ sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
/* N.B. Better to return EBUSY here ... it could be dangerous
* to delete the file while it's in use.
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index d8bf52f77d93..8c6130789fde 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,16 +10,19 @@
#include <linux/dns_resolver.h>
#include "internal.h"
-static atomic_t afs_autocell_ino;
+#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */
+#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX)
+
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino);
/*
* iget5() comparator for inode created by autocell operations
- *
- * These pseudo inodes don't match anything.
*/
static int afs_iget5_pseudo_test(struct inode *inode, void *opaque)
{
- return 0;
+ struct afs_fid *fid = opaque;
+
+ return inode->i_ino == fid->vnode;
}
/*
@@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque)
}
/*
- * Create an inode for a dynamic root directory or an autocell dynamic
- * automount dir.
+ * Create an inode for an autocell dynamic automount dir.
*/
-struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
+static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
{
- struct afs_super_info *as = AFS_FS_S(sb);
struct afs_vnode *vnode;
struct inode *inode;
- struct afs_fid fid = {};
+ struct afs_fid fid = { .vnode = ino, .unique = 1, };
_enter("");
- if (as->volume)
- fid.vid = as->volume->vid;
- if (root) {
- fid.vnode = 1;
- fid.unique = 1;
- } else {
- fid.vnode = atomic_inc_return(&afs_autocell_ino);
- fid.unique = 0;
- }
-
inode = iget5_locked(sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
if (!inode) {
@@ -73,115 +64,71 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
vnode = AFS_FS_I(inode);
- /* there shouldn't be an existing inode */
- BUG_ON(!(inode->i_state & I_NEW));
-
- netfs_inode_init(&vnode->netfs, NULL, false);
- inode->i_size = 0;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- if (root) {
- inode->i_op = &afs_dynroot_inode_operations;
- inode->i_fop = &simple_dir_operations;
- } else {
- inode->i_op = &afs_autocell_inode_operations;
- }
- set_nlink(inode, 2);
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- simple_inode_init_ts(inode);
- inode->i_blocks = 0;
- inode->i_generation = 0;
-
- set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
- if (!root) {
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 2);
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &afs_autocell_inode_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_AUTOMOUNT | S_NOATIME;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
- inode->i_flags |= S_AUTOMOUNT;
- }
- inode->i_flags |= S_NOATIME;
- unlock_new_inode(inode);
+ unlock_new_inode(inode);
+ }
_leave(" = %p", inode);
return inode;
}
/*
- * Probe to see if a cell may exist. This prevents positive dentries from
- * being created unnecessarily.
+ * Try to automount the mountpoint with pseudo directory, if the autocell
+ * option is set.
*/
-static int afs_probe_cell_name(struct dentry *dentry)
+static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry,
+ unsigned int flags)
{
- struct afs_cell *cell;
+ struct afs_cell *cell = NULL;
struct afs_net *net = afs_d2net(dentry);
+ struct inode *inode = NULL;
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
- char *result = NULL;
- int ret;
+ bool dotted = false;
+ int ret = -ENOENT;
/* Names prefixed with a dot are R/W mounts. */
if (name[0] == '.') {
- if (len == 1)
- return -EINVAL;
name++;
len--;
+ dotted = true;
}
- cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
- if (!IS_ERR(cell)) {
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
- return 0;
- }
-
- ret = dns_query(net->net, "afsdb", name, len, "srv=1",
- &result, NULL, false);
- if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
- ret = -ENOENT;
- if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
- struct dns_server_list_v1_header *v1 = (void *)result;
-
- if (v1->hdr.zero == 0 &&
- v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
- v1->hdr.version == 1 &&
- (v1->status != DNS_LOOKUP_GOOD &&
- v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
- return -ENOENT;
-
+ cell = afs_lookup_cell(net, name, len, NULL, false,
+ afs_cell_trace_use_lookup_dynroot);
+ if (IS_ERR(cell)) {
+ ret = PTR_ERR(cell);
+ goto out_no_cell;
}
- kfree(result);
- return ret;
-}
-
-/*
- * Try to auto mount the mountpoint with pseudo directory, if the autocell
- * operation is setted.
- */
-struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
-{
- struct afs_vnode *vnode = AFS_FS_I(dir);
- struct inode *inode;
- int ret = -ENOENT;
-
- _enter("%p{%pd}, {%llx:%llu}",
- dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
-
- if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
- goto out;
-
- ret = afs_probe_cell_name(dentry);
- if (ret < 0)
- goto out;
-
- inode = afs_iget_pseudo_dir(dir->i_sb, false);
+ inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto out;
}
- _leave("= %p", inode);
- return inode;
+ dentry->d_fsdata = cell;
+ return d_splice_alias(inode, dentry);
out:
- _leave("= %d", ret);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot);
+out_no_cell:
+ if (!inode)
+ return d_splice_alias(inode, dentry);
return ret == -ENOENT ? NULL : ERR_PTR(ret);
}
@@ -193,8 +140,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
{
_enter("%pd", dentry);
- ASSERTCMP(d_inode(dentry), ==, NULL);
-
if (flags & LOOKUP_CREATE)
return ERR_PTR(-EOPNOTSUPP);
@@ -203,98 +148,49 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
return ERR_PTR(-ENAMETOOLONG);
}
- return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
+ if (dentry->d_name.len == 5 &&
+ memcmp(dentry->d_name.name, "@cell", 5) == 0)
+ return afs_lookup_atcell(dir, dentry, 2);
+
+ if (dentry->d_name.len == 6 &&
+ memcmp(dentry->d_name.name, ".@cell", 6) == 0)
+ return afs_lookup_atcell(dir, dentry, 3);
+
+ return afs_dynroot_lookup_cell(dir, dentry, flags);
}
const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_delete = always_delete_dentry,
- .d_release = afs_d_release,
- .d_automount = afs_d_automount,
-};
-
-/*
- * Create a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
- */
-int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
-{
- struct super_block *sb = net->dynroot_sb;
- struct dentry *root, *subdir, *dsubdir;
- char *dotname = cell->name - 1;
- int ret;
-
- if (!sb || atomic_read(&sb->s_active) == 0)
- return 0;
-
- /* Let the ->lookup op do the creation */
- root = sb->s_root;
- inode_lock(root->d_inode);
- subdir = lookup_one_len(cell->name, root, cell->name_len);
- if (IS_ERR(subdir)) {
- ret = PTR_ERR(subdir);
- goto unlock;
- }
-
- dsubdir = lookup_one_len(dotname, root, cell->name_len + 1);
- if (IS_ERR(dsubdir)) {
- ret = PTR_ERR(dsubdir);
- dput(subdir);
- goto unlock;
- }
-
- /* Note that we're retaining extra refs on the dentries. */
- subdir->d_fsdata = (void *)1UL;
- dsubdir->d_fsdata = (void *)1UL;
- ret = 0;
-unlock:
- inode_unlock(root->d_inode);
- return ret;
-}
-
-static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len)
+static void afs_dynroot_d_release(struct dentry *dentry)
{
- struct dentry *subdir;
-
- /* Don't want to trigger a lookup call, which will re-add the cell */
- subdir = try_lookup_one_len(name, root, name_len);
- if (IS_ERR_OR_NULL(subdir)) {
- _debug("lookup %ld", PTR_ERR(subdir));
- return;
- }
-
- _debug("rmdir %pd %u", subdir, d_count(subdir));
+ struct afs_cell *cell = dentry->d_fsdata;
- if (subdir->d_fsdata) {
- _debug("unpin %u", d_count(subdir));
- subdir->d_fsdata = NULL;
- dput(subdir);
- }
- dput(subdir);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt);
}
/*
- * Remove a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
+ * Keep @cell symlink dentries around, but only keep cell autodirs when they're
+ * being used.
*/
-void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell)
+static int afs_dynroot_delete_dentry(const struct dentry *dentry)
{
- struct super_block *sb = net->dynroot_sb;
- char *dotname = cell->name - 1;
-
- if (!sb || atomic_read(&sb->s_active) == 0)
- return;
+ const struct qstr *name = &dentry->d_name;
- inode_lock(sb->s_root->d_inode);
- afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len);
- afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1);
- inode_unlock(sb->s_root->d_inode);
- _leave("");
+ if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0)
+ return 0;
+ if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0)
+ return 0;
+ return 1;
}
+const struct dentry_operations afs_dynroot_dentry_operations = {
+ .d_delete = afs_dynroot_delete_dentry,
+ .d_release = afs_dynroot_d_release,
+ .d_automount = afs_d_automount,
+};
+
static void afs_atcell_delayed_put_cell(void *arg)
{
struct afs_cell *cell = arg;
@@ -314,12 +210,23 @@ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inod
const char *name;
bool dotted = vnode->fid.vnode == 3;
- if (!net->ws_cell)
+ if (!rcu_access_pointer(net->ws_cell))
return ERR_PTR(-ENOENT);
+ if (!dentry) {
+ /* We're in RCU-pathwalk. */
+ cell = rcu_dereference(net->ws_cell);
+ if (dotted)
+ name = cell->name - 1;
+ else
+ name = cell->name;
+ /* Shouldn't need to set a delayed call. */
+ return name;
+ }
+
down_read(&net->cells_lock);
- cell = net->ws_cell;
+ cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock));
if (dotted)
name = cell->name - 1;
else
@@ -336,149 +243,163 @@ static const struct inode_operations afs_atcell_inode_operations = {
};
/*
- * Look up @cell or .@cell in a dynroot directory. This is a substitution for
- * the local cell name for the net namespace.
+ * Create an inode for the @cell or .@cell symlinks.
*/
-static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name)
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino)
{
struct afs_vnode *vnode;
- struct afs_fid fid = { .vnode = 2, .unique = 1, };
- struct dentry *dentry;
struct inode *inode;
+ struct afs_fid fid = { .vnode = ino, .unique = 1, };
- if (name[0] == '.')
- fid.vnode = 3;
-
- dentry = d_alloc_name(root, name);
- if (!dentry)
- return ERR_PTR(-ENOMEM);
-
- inode = iget5_locked(dentry->d_sb, fid.vnode,
+ inode = iget5_locked(dir->i_sb, fid.vnode,
afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
- if (!inode) {
- dput(dentry);
+ if (!inode)
return ERR_PTR(-ENOMEM);
- }
vnode = AFS_FS_I(inode);
- /* there shouldn't be an existing inode */
- if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) {
- iput(inode);
- dput(dentry);
- return ERR_PTR(-EIO);
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 1);
+ inode->i_size = 0;
+ inode->i_mode = S_IFLNK | 0555;
+ inode->i_op = &afs_atcell_inode_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_NOATIME;
+
+ unlock_new_inode(inode);
}
-
- netfs_inode_init(&vnode->netfs, NULL, false);
- simple_inode_init_ts(inode);
- set_nlink(inode, 1);
- inode->i_size = 0;
- inode->i_mode = S_IFLNK | 0555;
- inode->i_op = &afs_atcell_inode_operations;
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- inode->i_blocks = 0;
- inode->i_generation = 0;
- inode->i_flags |= S_NOATIME;
-
- unlock_new_inode(inode);
- d_splice_alias(inode, dentry);
- return dentry;
+ return d_splice_alias(inode, dentry);
}
/*
- * Create @cell and .@cell symlinks.
+ * Transcribe the cell database into readdir content under the RCU read lock.
+ * Each cell produces two entries, one prefixed with a dot and one not.
*/
-static int afs_dynroot_symlink(struct afs_net *net)
+static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx)
{
- struct super_block *sb = net->dynroot_sb;
- struct dentry *root, *symlink, *dsymlink;
- int ret;
-
- /* Let the ->lookup op do the creation */
- root = sb->s_root;
- inode_lock(root->d_inode);
- symlink = afs_dynroot_create_symlink(root, "@cell");
- if (IS_ERR(symlink)) {
- ret = PTR_ERR(symlink);
- goto unlock;
- }
+ const struct afs_cell *cell;
+ loff_t newpos;
+
+ _enter("%llu", ctx->pos);
+
+ for (;;) {
+ unsigned int ix = ctx->pos >> 1;
+
+ cell = idr_get_next(&net->cells_dyn_ino, &ix);
+ if (!cell)
+ return 0;
+ if (READ_ONCE(cell->state) == AFS_CELL_REMOVING ||
+ READ_ONCE(cell->state) == AFS_CELL_DEAD) {
+ ctx->pos += 2;
+ ctx->pos &= ~1;
+ continue;
+ }
- dsymlink = afs_dynroot_create_symlink(root, ".@cell");
- if (IS_ERR(dsymlink)) {
- ret = PTR_ERR(dsymlink);
- dput(symlink);
- goto unlock;
- }
+ newpos = ix << 1;
+ if (newpos > ctx->pos)
+ ctx->pos = newpos;
- /* Note that we're retaining extra refs on the dentries. */
- symlink->d_fsdata = (void *)1UL;
- dsymlink->d_fsdata = (void *)1UL;
- ret = 0;
-unlock:
- inode_unlock(root->d_inode);
- return ret;
+ _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino);
+
+ if ((ctx->pos & 1) == 0) {
+ if (!dir_emit(ctx, cell->name, cell->name_len,
+ cell->dynroot_ino, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
+ if ((ctx->pos & 1) == 1) {
+ if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1,
+ cell->dynroot_ino + 1, DT_DIR))
+ return 0;
+ ctx->pos++;
+ }
+ }
+ return 0;
}
/*
- * Populate a newly created dynamic root with cell names.
+ * Read the AFS dynamic root directory. This produces a list of cellnames,
+ * dotted and undotted, along with @cell and .@cell links if configured.
*/
-int afs_dynroot_populate(struct super_block *sb)
+static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx)
{
- struct afs_cell *cell;
- struct afs_net *net = afs_sb2net(sb);
- int ret;
-
- mutex_lock(&net->proc_cells_lock);
+ struct afs_net *net = afs_d2net(file->f_path.dentry);
+ int ret = 0;
- net->dynroot_sb = sb;
- ret = afs_dynroot_symlink(net);
- if (ret < 0)
- goto error;
+ if (!dir_emit_dots(file, ctx))
+ return 0;
- hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
- ret = afs_dynroot_mkdir(net, cell);
- if (ret < 0)
- goto error;
+ if (ctx->pos == 2) {
+ if (rcu_access_pointer(net->ws_cell) &&
+ !dir_emit(ctx, "@cell", 5, 2, DT_LNK))
+ return 0;
+ ctx->pos = 3;
+ }
+ if (ctx->pos == 3) {
+ if (rcu_access_pointer(net->ws_cell) &&
+ !dir_emit(ctx, ".@cell", 6, 3, DT_LNK))
+ return 0;
+ ctx->pos = 4;
}
- ret = 0;
-out:
- mutex_unlock(&net->proc_cells_lock);
+ if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) {
+ down_read(&net->cells_lock);
+ ret = afs_dynroot_readdir_cells(net, ctx);
+ up_read(&net->cells_lock);
+ }
return ret;
-
-error:
- net->dynroot_sb = NULL;
- goto out;
}
+static const struct file_operations afs_dynroot_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = afs_dynroot_readdir,
+ .fsync = noop_fsync,
+};
+
/*
- * When a dynamic root that's in the process of being destroyed, depopulate it
- * of pinned directories.
+ * Create an inode for a dynamic root directory.
*/
-void afs_dynroot_depopulate(struct super_block *sb)
+struct inode *afs_dynroot_iget_root(struct super_block *sb)
{
- struct afs_net *net = afs_sb2net(sb);
- struct dentry *root = sb->s_root, *subdir;
-
- /* Prevent more subdirs from being created */
- mutex_lock(&net->proc_cells_lock);
- if (net->dynroot_sb == sb)
- net->dynroot_sb = NULL;
- mutex_unlock(&net->proc_cells_lock);
-
- if (root) {
- struct hlist_node *n;
- inode_lock(root->d_inode);
-
- /* Remove all the pins for dirs created for manually added cells */
- hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
- if (subdir->d_fsdata) {
- subdir->d_fsdata = NULL;
- dput(subdir);
- }
- }
+ struct afs_super_info *as = AFS_FS_S(sb);
+ struct afs_vnode *vnode;
+ struct inode *inode;
+ struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,};
+
+ if (as->volume)
+ fid.vid = as->volume->vid;
- inode_unlock(root->d_inode);
+ inode = iget5_locked(sb, fid.vnode,
+ afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ vnode = AFS_FS_I(inode);
+
+ /* there shouldn't be an existing inode */
+ if (inode->i_state & I_NEW) {
+ netfs_inode_init(&vnode->netfs, NULL, false);
+ simple_inode_init_ts(inode);
+ set_nlink(inode, 2);
+ inode->i_size = 0;
+ inode->i_mode = S_IFDIR | 0555;
+ inode->i_op = &afs_dynroot_inode_operations;
+ inode->i_fop = &afs_dynroot_file_operations;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_generation = 0;
+ inode->i_flags |= S_NOATIME;
+
+ set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+ unlock_new_inode(inode);
}
+ _leave(" = %p", inode);
+ return inode;
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index fc15497608c6..f66a92294284 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -19,7 +19,7 @@
#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
+static int afs_file_mmap_prepare(struct vm_area_desc *desc);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -35,7 +35,7 @@ const struct file_operations afs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
.write_iter = netfs_file_write_iter,
- .mmap = afs_file_mmap,
+ .mmap_prepare = afs_file_mmap_prepare,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
.fsync = afs_fsync,
@@ -492,16 +492,16 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
/*
* Handle setting up a memory mapping on an AFS file.
*/
-static int afs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int afs_file_mmap_prepare(struct vm_area_desc *desc)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(desc->file));
int ret;
afs_add_open_mmap(vnode);
- ret = generic_file_mmap(file, vma);
+ ret = generic_file_mmap_prepare(desc);
if (ret == 0)
- vma->vm_ops = &afs_vm_ops;
+ desc->vm_ops = &afs_vm_ops;
else
afs_drop_open_mmap(vnode);
return ret;
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index b516d05b0fef..e0030ac74ea0 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -235,20 +235,20 @@ out:
* Probe all of a fileserver's addresses to find out the best route and to
* query its capabilities.
*/
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct afs_addr_list *new_alist, struct key *key)
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_alist, struct key *key)
{
struct afs_endpoint_state *estate, *old;
- struct afs_addr_list *alist;
+ struct afs_addr_list *old_alist = NULL, *alist;
unsigned long unprobed;
_enter("%pU", &server->uuid);
estate = kzalloc(sizeof(*estate), GFP_KERNEL);
if (!estate)
- return;
+ return -ENOMEM;
- refcount_set(&estate->ref, 1);
+ refcount_set(&estate->ref, 2);
estate->server_id = server->debug_id;
estate->rtt = UINT_MAX;
@@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
old = rcu_dereference_protected(server->endpoint_state,
lockdep_is_held(&server->fs_lock));
- estate->responsive_set = old->responsive_set;
- estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
- afs_alist_trace_get_estate);
+ if (old) {
+ estate->responsive_set = old->responsive_set;
+ if (!new_alist)
+ new_alist = old->addresses;
+ }
+
+ if (old_alist != new_alist)
+ afs_set_peer_appdata(server, old_alist, new_alist);
+
+ estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate);
alist = estate->addresses;
estate->probe_seq = ++server->probe_counter;
atomic_set(&estate->nr_probing, alist->nr_addrs);
+ if (new_alist)
+ server->addr_version = new_alist->version;
rcu_assign_pointer(server->endpoint_state, estate);
- set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
write_unlock(&server->fs_lock);
+ if (old)
+ set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
afs_estate_trace_alloc_probe);
- afs_get_address_preferences(net, alist);
+ afs_get_address_preferences(net, new_alist);
server->probed_at = jiffies;
unprobed = (1UL << alist->nr_addrs) - 1;
@@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
}
afs_put_endpoint_state(old, afs_estate_trace_put_probe);
+ afs_put_endpoint_state(estate, afs_estate_trace_put_probe);
+ return 0;
}
/*
@@ -522,6 +534,6 @@ dont_wait:
*/
void afs_fs_probe_cleanup(struct afs_net *net)
{
- if (del_timer_sync(&net->fs_probe_timer))
+ if (timer_delete_sync(&net->fs_probe_timer))
afs_dec_servers_outstanding(net);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1d9ecd5418d8..bc9556991d7c 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
bp = call->request;
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
- call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb);
afs_make_call(call, GFP_NOFS);
afs_wait_for_call_to_complete(call);
ret = call->error;
@@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
return false;
call->key = key;
- call->server = afs_use_server(server, afs_server_trace_get_caps);
+ call->server = afs_use_server(server, false, afs_server_trace_use_get_caps);
call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
call->probe_index = addr_index;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 90f407774a9a..1124ea4000cb 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -20,6 +20,7 @@
#include <linux/uuid.h>
#include <linux/mm_types.h>
#include <linux/dns_resolver.h>
+#include <crypto/krb5.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -176,8 +177,10 @@ struct afs_call {
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
bool responded; /* Got a response from the call (may be abort) */
+ u8 security_ix; /* Security class */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
+ u32 enctype; /* Security encoding type */
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
union { /* place to extract temporary data */
@@ -281,15 +284,15 @@ struct afs_net {
struct socket *socket;
struct afs_call *spare_incoming_call;
struct work_struct charge_preallocation_work;
+ struct work_struct rx_oob_work;
struct mutex socket_mutex;
atomic_t nr_outstanding_calls;
atomic_t nr_superblocks;
/* Cell database */
struct rb_root cells;
- struct afs_cell *ws_cell;
- struct work_struct cells_manager;
- struct timer_list cells_timer;
+ struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */
+ struct afs_cell __rcu *ws_cell;
atomic_t cells_outstanding;
struct rw_semaphore cells_lock;
struct mutex cells_alias_lock;
@@ -301,18 +304,12 @@ struct afs_net {
* cell, but in practice, people create aliases and subsets and there's
* no easy way to distinguish them.
*/
- seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */
- struct rb_root fs_servers; /* afs_server (by server UUID or address) */
+ seqlock_t fs_lock; /* For fs_probe_*, fs_proc */
struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
- struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */
- seqlock_t fs_addr_lock; /* For fs_addresses[46] */
-
- struct work_struct fs_manager;
- struct timer_list fs_timer;
-
+ struct key *fs_cm_token_key; /* Key for creating CM tokens */
struct work_struct fs_prober;
struct timer_list fs_probe_timer;
atomic_t servers_outstanding;
@@ -345,13 +342,10 @@ struct afs_net {
extern const char afs_init_sysname[];
enum afs_cell_state {
- AFS_CELL_UNSET,
- AFS_CELL_ACTIVATING,
+ AFS_CELL_SETTING_UP,
AFS_CELL_ACTIVE,
- AFS_CELL_DEACTIVATING,
- AFS_CELL_INACTIVE,
- AFS_CELL_FAILED,
- AFS_CELL_REMOVED,
+ AFS_CELL_REMOVING,
+ AFS_CELL_DEAD,
};
/*
@@ -382,7 +376,9 @@ struct afs_cell {
struct afs_cell *alias_of; /* The cell this is an alias of */
struct afs_volume *root_volume; /* The root.cell volume if there is one */
struct key *anonymous_key; /* anonymous user key for this cell */
+ struct work_struct destroyer; /* Destroyer for cell */
struct work_struct manager; /* Manager for init/deinit/dns */
+ struct timer_list management_timer; /* General management timer */
struct hlist_node proc_link; /* /proc cell list link */
time64_t dns_expiry; /* Time AFSDB/SRV record expires */
time64_t last_inactive; /* Time of last drop of usage count */
@@ -398,6 +394,7 @@ struct afs_cell {
enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */
unsigned int dns_lookup_count; /* Counter of DNS lookups */
unsigned int debug_id;
+ unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */
/* The volumes belonging to this cell */
struct rw_semaphore vs_lock; /* Lock for server->volumes */
@@ -407,7 +404,7 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
- seqlock_t fs_lock; /* For fs_servers */
+ struct rw_semaphore fs_lock; /* For fs_servers */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -542,22 +539,24 @@ struct afs_server {
};
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
- struct rb_node uuid_rb; /* Link in net->fs_servers */
- struct afs_server __rcu *uuid_next; /* Next server with same UUID */
- struct afs_server *uuid_prev; /* Previous server with same UUID */
- struct list_head probe_link; /* Link in net->fs_probe_list */
- struct hlist_node addr_link; /* Link in net->fs_addresses6 */
+ struct rb_node uuid_rb; /* Link in cell->fs_servers */
+ struct list_head probe_link; /* Link in net->fs_probe_* */
struct hlist_node proc_link; /* Link in net->fs_proc */
struct list_head volumes; /* RCU list of afs_server_entry objects */
- struct afs_server *gc_next; /* Next server in manager's list */
+ struct work_struct destroyer; /* Work item to try and destroy a server */
+ struct timer_list timer; /* Management timer */
+ struct mutex cm_token_lock; /* Lock governing creation of appdata */
+ struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */
#define AFS_SERVER_FL_UPDATING 1
#define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */
-#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */
-#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */
-#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */
+#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */
+#define AFS_SERVER_FL_CREATING 4 /* The record is being created */
+#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */
+#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */
+#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */
#define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */
#define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */
#define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */
@@ -567,6 +566,7 @@ struct afs_server {
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
u16 service_id; /* Service ID we're using. */
+ short create_error; /* Creation error */
unsigned int rtt; /* Server's current RTT in uS */
unsigned int debug_id; /* Debugging ID for traces */
@@ -621,6 +621,7 @@ struct afs_volume {
afs_volid_t vid; /* The volume ID of this volume */
afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */
refcount_t ref;
+ unsigned int debug_id; /* Debugging ID for traces */
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
@@ -700,7 +701,6 @@ struct afs_vnode {
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
#define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */
#define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */
-#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
@@ -1008,6 +1008,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
__be32 xdr, u16 port);
extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
__be32 *xdr, u16 port);
+void afs_set_peer_appdata(struct afs_server *server,
+ struct afs_addr_list *old_alist,
+ struct afs_addr_list *new_alist);
/*
* addr_prefs.c
@@ -1044,16 +1047,17 @@ static inline bool afs_cb_is_broken(unsigned int cb_break,
extern int afs_cell_init(struct afs_net *, const char *);
extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned,
enum afs_cell_trace);
-extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned,
- const char *, bool);
+struct afs_cell *afs_lookup_cell(struct afs_net *net,
+ const char *name, unsigned int namesz,
+ const char *vllist, bool excl,
+ enum afs_cell_trace trace);
extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace);
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason);
extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace);
extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace);
-extern void afs_manage_cells(struct work_struct *);
-extern void afs_cells_timer(struct timer_list *);
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs);
extern void __net_exit afs_cell_purge(struct afs_net *);
/*
@@ -1062,6 +1066,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *);
extern bool afs_cm_incoming_call(struct afs_call *);
/*
+ * cm_security.c
+ */
+void afs_process_oob_queue(struct work_struct *work);
+#ifdef CONFIG_RXGK
+int afs_create_token_key(struct afs_net *net, struct socket *socket);
+#else
+static inline int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+ return 0;
+}
+#endif
+
+/*
* dir.c
*/
extern const struct file_operations afs_dir_file_operations;
@@ -1111,11 +1128,7 @@ extern int afs_silly_iput(struct dentry *, struct inode *);
extern const struct inode_operations afs_dynroot_inode_operations;
extern const struct dentry_operations afs_dynroot_dentry_operations;
-extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *);
-extern int afs_dynroot_mkdir(struct afs_net *, struct afs_cell *);
-extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *);
-extern int afs_dynroot_populate(struct super_block *);
-extern void afs_dynroot_depopulate(struct super_block *);
+struct inode *afs_dynroot_iget_root(struct super_block *sb);
/*
* file.c
@@ -1207,8 +1220,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est
enum afs_estate_trace where);
void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
-void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct afs_addr_list *new_addrs, struct key *key);
+int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_alist, struct key *key);
int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
@@ -1228,7 +1241,6 @@ int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen);
extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *);
extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *);
extern int afs_ilookup5_test_by_fid(struct inode *, void *);
-extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
@@ -1510,20 +1522,30 @@ extern void __exit afs_clean_up_permit_cache(void);
*/
extern spinlock_t afs_server_peer_lock;
-extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
-extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer);
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
-extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace);
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+ enum afs_server_trace reason);
+void afs_unuse_server(struct afs_net *net, struct afs_server *server,
+ enum afs_server_trace reason);
+void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
+ enum afs_server_trace reason);
extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace);
-extern void afs_manage_servers(struct work_struct *);
-extern void afs_servers_timer(struct timer_list *);
+void afs_purge_servers(struct afs_cell *cell);
extern void afs_fs_probe_timer(struct timer_list *);
-extern void __net_exit afs_purge_servers(struct afs_net *);
+void __net_exit afs_wait_for_servers(struct afs_net *net);
bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
+static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace)
+{
+ int r = refcount_read(&server->ref);
+ int a = atomic_read(&server->active);
+
+ trace_afs_server(server->debug_id, r, a, trace);
+
+}
+
static inline void afs_inc_servers_outstanding(struct afs_net *net)
{
atomic_inc(&net->servers_outstanding);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 1ae0067f772d..02475d415d88 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -73,28 +73,21 @@ static int __net_init afs_net_init(struct net *net_ns)
generate_random_uuid((unsigned char *)&net->uuid);
INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation);
+ INIT_WORK(&net->rx_oob_work, afs_process_oob_queue);
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
+ idr_init(&net->cells_dyn_ino);
init_rwsem(&net->cells_lock);
- INIT_WORK(&net->cells_manager, afs_manage_cells);
- timer_setup(&net->cells_timer, afs_cells_timer, 0);
-
mutex_init(&net->cells_alias_lock);
mutex_init(&net->proc_cells_lock);
INIT_HLIST_HEAD(&net->proc_cells);
seqlock_init(&net->fs_lock);
- net->fs_servers = RB_ROOT;
INIT_LIST_HEAD(&net->fs_probe_fast);
INIT_LIST_HEAD(&net->fs_probe_slow);
INIT_HLIST_HEAD(&net->fs_proc);
- INIT_HLIST_HEAD(&net->fs_addresses);
- seqlock_init(&net->fs_addr_lock);
-
- INIT_WORK(&net->fs_manager, afs_manage_servers);
- timer_setup(&net->fs_timer, afs_servers_timer, 0);
INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher);
timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0);
atomic_set(&net->servers_outstanding, 1);
@@ -130,13 +123,14 @@ error_open_socket:
net->live = false;
afs_fs_probe_cleanup(net);
afs_cell_purge(net);
- afs_purge_servers(net);
+ afs_wait_for_servers(net);
error_cell_init:
net->live = false;
afs_proc_cleanup(net);
error_proc:
afs_put_sysnames(net->sysnames);
error_sysnames:
+ idr_destroy(&net->cells_dyn_ino);
net->live = false;
return ret;
}
@@ -151,10 +145,11 @@ static void __net_exit afs_net_exit(struct net *net_ns)
net->live = false;
afs_fs_probe_cleanup(net);
afs_cell_purge(net);
- afs_purge_servers(net);
+ afs_wait_for_servers(net);
afs_close_socket(net);
afs_proc_cleanup(net);
afs_put_sysnames(net->sysnames);
+ idr_destroy(&net->cells_dyn_ino);
kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
}
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index b8180bf2281f..8f2b3a177690 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
+#include <crypto/krb5.h>
#include "internal.h"
#include "afs_fs.h"
#include "protocol_uae.h"
@@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code)
case RXKADDATALEN: return -EKEYREJECTED;
case RXKADILLEGALLEVEL: return -EKEYREJECTED;
+ case RXGK_INCONSISTENCY: return -EPROTO;
+ case RXGK_PACKETSHORT: return -EPROTO;
+ case RXGK_BADCHALLENGE: return -EPROTO;
+ case RXGK_SEALEDINCON: return -EKEYREJECTED;
+ case RXGK_NOTAUTH: return -EKEYREJECTED;
+ case RXGK_EXPIRED: return -EKEYEXPIRED;
+ case RXGK_BADLEVEL: return -EKEYREJECTED;
+ case RXGK_BADKEYNO: return -EKEYREJECTED;
+ case RXGK_NOTRXGK: return -EKEYREJECTED;
+ case RXGK_UNSUPPORTED: return -EKEYREJECTED;
+ case RXGK_GSSERROR: return -EKEYREJECTED;
+#ifdef RXGK_BADETYPE
+ case RXGK_BADETYPE: return -ENOPKG;
+#endif
+#ifdef RXGK_BADTOKEN
+ case RXGK_BADTOKEN: return -EKEYREJECTED;
+#endif
+#ifdef RXGK_BADETYPE
+ case RXGK_DATALEN: return -EPROTO;
+#endif
+#ifdef RXGK_BADQOP
+ case RXGK_BADQOP: return -EKEYREJECTED;
+#endif
+
+ case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG;
+
case RXGEN_OPCODE: return -ENOTSUPP;
default: return -EREMOTEIO;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 507c25a5b2cb..9434a5399f2b 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
ctx->force = true;
}
if (ctx->cell) {
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt);
ctx->cell = NULL;
}
if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
@@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (size > AFS_MAXCELLNAME)
return -ENAMETOOLONG;
- cell = afs_lookup_cell(ctx->net, p, size, NULL, false);
+ cell = afs_lookup_cell(ctx->net, p, size, NULL, false,
+ afs_cell_trace_use_lookup_mntpt);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt);
return PTR_ERR(cell);
@@ -188,7 +189,6 @@ struct vfsmount *afs_d_automount(struct path *path)
if (IS_ERR(newmnt))
return newmnt;
- mntget(newmnt); /* prevent immediate expiration */
mnt_set_expiry(newmnt, &afs_vfsmounts);
queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
afs_mntpt_expiry_timeout * HZ);
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index e7614f4f30c2..40e879c8ca77 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -122,14 +122,15 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (strcmp(buf, "add") == 0) {
struct afs_cell *cell;
- cell = afs_lookup_cell(net, name, strlen(name), args, true);
+ cell = afs_lookup_cell(net, name, strlen(name), args, true,
+ afs_cell_trace_use_lookup_add);
if (IS_ERR(cell)) {
ret = PTR_ERR(cell);
goto done;
}
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
- afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin);
+ afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin);
} else {
goto inval;
}
@@ -206,7 +207,7 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v)
net = afs_seq2net_single(m);
down_read(&net->cells_lock);
- cell = net->ws_cell;
+ cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock));
if (cell)
seq_printf(m, "%s\n", cell->name);
up_read(&net->cells_lock);
@@ -242,7 +243,7 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size)
ret = -EEXIST;
inode_lock(file_inode(file));
- if (!net->ws_cell)
+ if (!rcu_access_pointer(net->ws_cell))
ret = afs_cell_init(net, buf);
else
printk("busy\n");
@@ -443,8 +444,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
}
server = list_entry(v, struct afs_server, proc_link);
- estate = rcu_dereference(server->endpoint_state);
- alist = estate->addresses;
seq_printf(m, "%pU %3d %3d %s\n",
&server->uuid,
refcount_read(&server->ref),
@@ -454,10 +453,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
server->flags, server->rtt);
seq_printf(m, " - probe: last=%d\n",
(int)(jiffies - server->probed_at) / HZ);
+
+ estate = rcu_dereference(server->endpoint_state);
+ if (!estate)
+ goto out;
failed = estate->failed_set;
seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
estate->probe_seq, atomic_read(&estate->nr_probing),
estate->responsive_set, estate->failed_set);
+
+ alist = estate->addresses;
seq_printf(m, " - ALIST v=%u ap=%u\n",
alist->version, alist->addr_pref_version);
for (i = 0; i < alist->nr_addrs; i++) {
@@ -470,6 +475,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
rxrpc_kernel_get_srtt(addr->peer),
addr->last_error, addr->prio);
}
+
+out:
return 0;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 886416ea1d96..c1cadf8fb346 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
+static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID);
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob);
static int afs_deliver_cm_op_id(struct afs_call *);
+static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = {
+ .notify_new_call = afs_rx_new_call,
+ .discard_new_call = afs_rx_discard_new_call,
+ .user_attach_call = afs_rx_attach,
+ .notify_oob = afs_rx_notify_oob,
+};
+
/* asynchronous incoming call initial processing */
static const struct afs_call_type afs_RXCMxxxx = {
.name = "CB.xxxx",
@@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net)
goto error_1;
socket->sk->sk_allocation = GFP_NOFS;
+ socket->sk->sk_user_data = net;
/* bind the callback manager's address to make this a server socket */
memset(&srx, 0, sizeof(srx));
@@ -64,6 +74,14 @@ int afs_open_socket(struct afs_net *net)
if (ret < 0)
goto error_2;
+ ret = rxrpc_sock_set_manage_response(socket->sk, true);
+ if (ret < 0)
+ goto error_2;
+
+ ret = afs_create_token_key(net, socket);
+ if (ret < 0)
+ pr_err("Couldn't create RxGK CM key: %d\n", ret);
+
ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
if (ret == -EADDRINUSE) {
srx.transport.sin6.sin6_port = 0;
@@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net)
* it sends back to us.
*/
- rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
- afs_rx_discard_new_call);
+ rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops);
ret = kernel_listen(socket, INT_MAX);
if (ret < 0)
@@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net)
kernel_sock_shutdown(net->socket, SHUT_RDWR);
flush_workqueue(afs_async_calls);
+ net->socket->sk->sk_user_data = NULL;
sock_release(net->socket);
+ key_put(net->fs_cm_token_key);
_debug("dework");
_leave("");
@@ -179,7 +198,7 @@ static void afs_free_call(struct afs_call *call)
if (call->type->destructor)
call->type->destructor(call);
- afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
+ afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call);
kfree(call->request);
o = atomic_read(&net->nr_outstanding_calls);
@@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work)
if (rxrpc_kernel_charge_accept(net->socket,
afs_wake_up_async_call,
- afs_rx_attach,
(unsigned long)call,
GFP_KERNEL,
call->debug_id) < 0)
@@ -766,8 +784,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long user_call_ID)
{
+ struct afs_call *call = (struct afs_call *)user_call_ID;
struct afs_net *net = afs_sock2net(sk);
+ call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall);
+ call->server = afs_find_server(call->peer);
+ if (!call->server)
+ trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer));
+
queue_work(afs_wq, &net->charge_preallocation_work);
}
@@ -794,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
if (!afs_cm_incoming_call(call))
return -ENOTSUPP;
+ call->security_ix = rxrpc_kernel_query_call_security(call->rxcall,
+ &call->service_id,
+ &call->enctype);
+
trace_afs_cb_call(call);
call->work.func = call->type->work;
- /* pass responsibility for the remainer of this message off to the
+ /* pass responsibility for the remainder of this message off to the
* cache manager op */
return call->type->deliver(call);
}
@@ -946,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call,
call->unmarshalling_error = true;
return -EBADMSG;
}
+
+/*
+ * Wake up OOB notification processing.
+ */
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob)
+{
+ struct afs_net *net = sk->sk_user_data;
+
+ schedule_work(&net->rx_oob_work);
+}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 038f9d0ae3af..a97562f831eb 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -14,188 +14,104 @@
static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */
static atomic_t afs_server_debug_id;
-static struct afs_server *afs_maybe_use_server(struct afs_server *,
- enum afs_server_trace);
static void __afs_put_server(struct afs_net *, struct afs_server *);
+static void afs_server_timer(struct timer_list *timer);
+static void afs_server_destroyer(struct work_struct *work);
/*
* Find a server by one of its addresses.
*/
-struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
+struct afs_server *afs_find_server(const struct rxrpc_peer *peer)
{
- const struct afs_endpoint_state *estate;
- const struct afs_addr_list *alist;
- struct afs_server *server = NULL;
- unsigned int i;
- int seq = 1;
+ struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer);
- rcu_read_lock();
-
- do {
- if (server)
- afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
- server = NULL;
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
-
- hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
- estate = rcu_dereference(server->endpoint_state);
- alist = estate->addresses;
- for (i = 0; i < alist->nr_addrs; i++)
- if (alist->addrs[i].peer == peer)
- goto found;
- }
-
- server = NULL;
- continue;
- found:
- server = afs_maybe_use_server(server, afs_server_trace_get_by_addr);
-
- } while (need_seqretry(&net->fs_addr_lock, seq));
-
- done_seqretry(&net->fs_addr_lock, seq);
-
- rcu_read_unlock();
- return server;
+ if (!server)
+ return NULL;
+ return afs_use_server(server, false, afs_server_trace_use_cm_call);
}
/*
- * Look up a server by its UUID and mark it active.
+ * Look up a server by its UUID and mark it active. The caller must hold
+ * cell->fs_lock.
*/
-struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid)
+static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid)
{
- struct afs_server *server = NULL;
+ struct afs_server *server;
struct rb_node *p;
- int diff, seq = 1;
+ int diff;
_enter("%pU", uuid);
- do {
- /* Unfortunately, rbtree walking doesn't give reliable results
- * under just the RCU read lock, so we have to check for
- * changes.
- */
- if (server)
- afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
- server = NULL;
- seq++; /* 2 on the 1st/lockless path, otherwise odd */
- read_seqbegin_or_lock(&net->fs_lock, &seq);
-
- p = net->fs_servers.rb_node;
- while (p) {
- server = rb_entry(p, struct afs_server, uuid_rb);
-
- diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
- if (diff < 0) {
- p = p->rb_left;
- } else if (diff > 0) {
- p = p->rb_right;
- } else {
- afs_use_server(server, afs_server_trace_get_by_uuid);
- break;
- }
-
- server = NULL;
- }
- } while (need_seqretry(&net->fs_lock, seq));
+ p = cell->fs_servers.rb_node;
+ while (p) {
+ server = rb_entry(p, struct afs_server, uuid_rb);
- done_seqretry(&net->fs_lock, seq);
+ diff = memcmp(uuid, &server->uuid, sizeof(*uuid));
+ if (diff < 0) {
+ p = p->rb_left;
+ } else if (diff > 0) {
+ p = p->rb_right;
+ } else {
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags))
+ return NULL; /* Need a write lock */
+ afs_use_server(server, true, afs_server_trace_use_by_uuid);
+ return server;
+ }
+ }
- _leave(" = %p", server);
- return server;
+ return NULL;
}
/*
- * Install a server record in the namespace tree. If there's a clash, we stick
- * it into a list anchored on whichever afs_server struct is actually in the
- * tree.
+ * Install a server record in the cell tree. The caller must hold an exclusive
+ * lock on cell->fs_lock.
*/
static struct afs_server *afs_install_server(struct afs_cell *cell,
- struct afs_server *candidate)
+ struct afs_server **candidate)
{
- const struct afs_endpoint_state *estate;
- const struct afs_addr_list *alist;
- struct afs_server *server, *next;
+ struct afs_server *server;
struct afs_net *net = cell->net;
struct rb_node **pp, *p;
int diff;
_enter("%p", candidate);
- write_seqlock(&net->fs_lock);
-
/* Firstly install the server in the UUID lookup tree */
- pp = &net->fs_servers.rb_node;
+ pp = &cell->fs_servers.rb_node;
p = NULL;
while (*pp) {
p = *pp;
_debug("- consider %p", p);
server = rb_entry(p, struct afs_server, uuid_rb);
- diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t));
- if (diff < 0) {
+ diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t));
+ if (diff < 0)
pp = &(*pp)->rb_left;
- } else if (diff > 0) {
+ else if (diff > 0)
pp = &(*pp)->rb_right;
- } else {
- if (server->cell == cell)
- goto exists;
-
- /* We have the same UUID representing servers in
- * different cells. Append the new server to the list.
- */
- for (;;) {
- next = rcu_dereference_protected(
- server->uuid_next,
- lockdep_is_held(&net->fs_lock.lock));
- if (!next)
- break;
- server = next;
- }
- rcu_assign_pointer(server->uuid_next, candidate);
- candidate->uuid_prev = server;
- server = candidate;
- goto added_dup;
- }
+ else
+ goto exists;
}
- server = candidate;
+ server = *candidate;
+ *candidate = NULL;
rb_link_node(&server->uuid_rb, p, pp);
- rb_insert_color(&server->uuid_rb, &net->fs_servers);
+ rb_insert_color(&server->uuid_rb, &cell->fs_servers);
+ write_seqlock(&net->fs_lock);
hlist_add_head_rcu(&server->proc_link, &net->fs_proc);
+ write_sequnlock(&net->fs_lock);
-added_dup:
- write_seqlock(&net->fs_addr_lock);
- estate = rcu_dereference_protected(server->endpoint_state,
- lockdep_is_held(&net->fs_addr_lock.lock));
- alist = estate->addresses;
-
- /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
- * it in the IPv4 and/or IPv6 reverse-map lists.
- *
- * TODO: For speed we want to use something other than a flat list
- * here; even sorting the list in terms of lowest address would help a
- * bit, but anything we might want to do gets messy and memory
- * intensive.
- */
- if (alist->nr_addrs > 0)
- hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
-
- write_sequnlock(&net->fs_addr_lock);
+ afs_get_cell(cell, afs_cell_trace_get_server);
exists:
- afs_get_server(server, afs_server_trace_get_install);
- write_sequnlock(&net->fs_lock);
+ afs_use_server(server, true, afs_server_trace_use_install);
return server;
}
/*
- * Allocate a new server record and mark it active.
+ * Allocate a new server record and mark it as active but uncreated.
*/
-static struct afs_server *afs_alloc_server(struct afs_cell *cell,
- const uuid_t *uuid,
- struct afs_addr_list *alist)
+static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid)
{
- struct afs_endpoint_state *estate;
struct afs_server *server;
struct afs_net *net = cell->net;
@@ -203,65 +119,50 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
if (!server)
- goto enomem;
-
- estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
- if (!estate)
- goto enomem_server;
+ return NULL;
refcount_set(&server->ref, 1);
- atomic_set(&server->active, 1);
+ atomic_set(&server->active, 0);
+ __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
- server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
+ INIT_WORK(&server->destroyer, &afs_server_destroyer);
+ timer_setup(&server->timer, afs_server_timer, 0);
INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
+ mutex_init(&server->cm_token_lock);
INIT_LIST_HEAD(&server->probe_link);
+ INIT_HLIST_NODE(&server->proc_link);
spin_lock_init(&server->probe_lock);
server->cell = cell;
server->rtt = UINT_MAX;
server->service_id = FS_SERVICE;
-
server->probe_counter = 1;
server->probed_at = jiffies - LONG_MAX / 2;
- refcount_set(&estate->ref, 1);
- estate->addresses = alist;
- estate->server_id = server->debug_id;
- estate->probe_seq = 1;
- rcu_assign_pointer(server->endpoint_state, estate);
afs_inc_servers_outstanding(net);
- trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
- trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
- afs_estate_trace_alloc_server);
_leave(" = %p", server);
return server;
-
-enomem_server:
- kfree(server);
-enomem:
- _leave(" = NULL [nomem]");
- return NULL;
}
/*
* Look up an address record for a server
*/
-static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
- struct key *key, const uuid_t *uuid)
+static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server,
+ struct key *key)
{
struct afs_vl_cursor vc;
struct afs_addr_list *alist = NULL;
int ret;
ret = -ERESTARTSYS;
- if (afs_begin_vlserver_operation(&vc, cell, key)) {
+ if (afs_begin_vlserver_operation(&vc, server->cell, key)) {
while (afs_select_vlserver(&vc)) {
if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags))
- alist = afs_yfsvl_get_endpoints(&vc, uuid);
+ alist = afs_yfsvl_get_endpoints(&vc, &server->uuid);
else
- alist = afs_vl_get_addrs_u(&vc, uuid);
+ alist = afs_vl_get_addrs_u(&vc, &server->uuid);
}
ret = afs_end_vlserver_operation(&vc);
@@ -271,72 +172,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell,
}
/*
- * Get or create a fileserver record.
+ * Get or create a fileserver record and return it with an active-use count on
+ * it.
*/
struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
const uuid_t *uuid, u32 addr_version)
{
- struct afs_addr_list *alist;
- struct afs_server *server, *candidate;
+ struct afs_addr_list *alist = NULL;
+ struct afs_server *server, *candidate = NULL;
+ bool creating = false;
+ int ret;
_enter("%p,%pU", cell->net, uuid);
- server = afs_find_server_by_uuid(cell->net, uuid);
+ down_read(&cell->fs_lock);
+ server = afs_find_server_by_uuid(cell, uuid);
+ /* Won't see servers marked uncreated. */
+ up_read(&cell->fs_lock);
+
if (server) {
+ timer_delete_sync(&server->timer);
+ if (test_bit(AFS_SERVER_FL_CREATING, &server->flags))
+ goto wait_for_creation;
if (server->addr_version != addr_version)
set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
return server;
}
- alist = afs_vl_lookup_addrs(cell, key, uuid);
- if (IS_ERR(alist))
- return ERR_CAST(alist);
-
- candidate = afs_alloc_server(cell, uuid, alist);
+ candidate = afs_alloc_server(cell, uuid);
if (!candidate) {
afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
return ERR_PTR(-ENOMEM);
}
- server = afs_install_server(cell, candidate);
- if (server != candidate) {
- afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
+ down_write(&cell->fs_lock);
+ server = afs_install_server(cell, &candidate);
+ if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) {
+ /* We need to wait for creation to complete. */
+ up_write(&cell->fs_lock);
+ goto wait_for_creation;
+ }
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ set_bit(AFS_SERVER_FL_CREATING, &server->flags);
+ clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+ creating = true;
+ }
+ up_write(&cell->fs_lock);
+ timer_delete_sync(&server->timer);
+
+ /* If we get to create the server, we look up the addresses and then
+ * immediately dispatch an asynchronous probe to each interface on the
+ * fileserver. This will make sure the repeat-probing service is
+ * started.
+ */
+ if (creating) {
+ alist = afs_vl_lookup_addrs(server, key);
+ if (IS_ERR(alist)) {
+ ret = PTR_ERR(alist);
+ goto create_failed;
+ }
+
+ ret = afs_fs_probe_fileserver(cell->net, server, alist, key);
+ if (ret)
+ goto create_failed;
+
+ clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+ }
+
+out:
+ afs_put_addrlist(alist, afs_alist_trace_put_server_create);
+ if (candidate) {
+ kfree(rcu_access_pointer(server->endpoint_state));
kfree(candidate);
- } else {
- /* Immediately dispatch an asynchronous probe to each interface
- * on the fileserver. This will make sure the repeat-probing
- * service is started.
- */
- afs_fs_probe_fileserver(cell->net, server, alist, key);
+ afs_dec_servers_outstanding(cell->net);
+ }
+ return server ?: ERR_PTR(ret);
+
+wait_for_creation:
+ afs_see_server(server, afs_server_trace_wait_create);
+ wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE);
+ if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ /* Barrier: read flag before error */
+ ret = READ_ONCE(server->create_error);
+ afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail);
+ server = NULL;
+ goto out;
}
- return server;
-}
+ ret = 0;
+ goto out;
-/*
- * Set the server timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_server_timer(struct afs_net *net, time64_t delay)
-{
- if (net->live) {
- afs_inc_servers_outstanding(net);
- if (timer_reduce(&net->fs_timer, jiffies + delay * HZ))
- afs_dec_servers_outstanding(net);
+create_failed:
+ down_write(&cell->fs_lock);
+
+ WRITE_ONCE(server->create_error, ret);
+ smp_wmb(); /* Barrier: set error before flag. */
+ set_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+
+ clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags);
+
+ if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) {
+ clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags);
+ creating = true;
}
+ afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail);
+ server = NULL;
+
+ up_write(&cell->fs_lock);
+ goto out;
}
/*
- * Server management timer. We have an increment on fs_outstanding that we
- * need to pass along to the work item.
+ * Set/reduce a server's timer.
*/
-void afs_servers_timer(struct timer_list *timer)
+static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs)
{
- struct afs_net *net = container_of(timer, struct afs_net, fs_timer);
-
- _enter("");
- if (!queue_work(afs_wq, &net->fs_manager))
- afs_dec_servers_outstanding(net);
+ mod_timer(&server->timer, jiffies + delay_secs * HZ);
}
/*
@@ -355,32 +306,20 @@ struct afs_server *afs_get_server(struct afs_server *server,
}
/*
- * Try to get a reference on a server object.
+ * Get an active count on a server object and maybe remove from the inactive
+ * list.
*/
-static struct afs_server *afs_maybe_use_server(struct afs_server *server,
- enum afs_server_trace reason)
-{
- unsigned int a;
- int r;
-
- if (!__refcount_inc_not_zero(&server->ref, &r))
- return NULL;
-
- a = atomic_inc_return(&server->active);
- trace_afs_server(server->debug_id, r + 1, a, reason);
- return server;
-}
-
-/*
- * Get an active count on a server object.
- */
-struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason)
+struct afs_server *afs_use_server(struct afs_server *server, bool activate,
+ enum afs_server_trace reason)
{
unsigned int a;
int r;
__refcount_inc(&server->ref, &r);
a = atomic_inc_return(&server->active);
+ if (a == 1 && activate &&
+ !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ timer_delete(&server->timer);
trace_afs_server(server->debug_id, r + 1, a, reason);
return server;
@@ -413,13 +352,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server,
void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- if (server) {
- unsigned int active = atomic_dec_return(&server->active);
+ if (!server)
+ return;
- if (active == 0)
- afs_set_server_timer(net, afs_server_gc_delay);
- afs_put_server(net, server, reason);
+ if (atomic_dec_and_test(&server->active)) {
+ if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) ||
+ READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING)
+ schedule_work(&server->destroyer);
}
+
+ afs_put_server(net, server, reason);
}
/*
@@ -428,10 +370,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server,
void afs_unuse_server(struct afs_net *net, struct afs_server *server,
enum afs_server_trace reason)
{
- if (server) {
- server->unuse_time = ktime_get_real_seconds();
- afs_unuse_server_notime(net, server, reason);
+ if (!server)
+ return;
+
+ if (atomic_dec_and_test(&server->active)) {
+ if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) &&
+ READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) {
+ time64_t unuse_time = ktime_get_real_seconds();
+
+ server->unuse_time = unuse_time;
+ afs_set_server_timer(server, afs_server_gc_delay);
+ } else {
+ schedule_work(&server->destroyer);
+ }
}
+
+ afs_put_server(net, server, reason);
}
static void afs_server_rcu(struct rcu_head *rcu)
@@ -442,6 +396,8 @@ static void afs_server_rcu(struct rcu_head *rcu)
atomic_read(&server->active), afs_server_trace_free);
afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
afs_estate_trace_put_server);
+ afs_put_cell(server->cell, afs_cell_trace_put_server);
+ kfree(server->cm_rxgk_appdata.data);
kfree(server);
}
@@ -460,159 +416,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server
}
/*
- * destroy a dead server
+ * Check to see if the server record has expired.
*/
-static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
+static bool afs_has_server_expired(const struct afs_server *server)
{
- if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
- afs_give_up_callbacks(net, server);
+ time64_t expires_at;
- afs_put_server(net, server, afs_server_trace_destroy);
+ if (atomic_read(&server->active))
+ return false;
+
+ if (server->cell->net->live ||
+ server->cell->state >= AFS_CELL_REMOVING) {
+ trace_afs_server(server->debug_id, refcount_read(&server->ref),
+ 0, afs_server_trace_purging);
+ return true;
+ }
+
+ expires_at = server->unuse_time;
+ if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
+ !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
+ expires_at += afs_server_gc_delay;
+
+ return ktime_get_real_seconds() > expires_at;
}
/*
- * Garbage collect any expired servers.
+ * Remove a server record from it's parent cell's database.
*/
-static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
+static bool afs_remove_server_from_cell(struct afs_server *server)
{
- struct afs_server *server, *next, *prev;
- int active;
-
- while ((server = gc_list)) {
- gc_list = server->gc_next;
-
- write_seqlock(&net->fs_lock);
-
- active = atomic_read(&server->active);
- if (active == 0) {
- trace_afs_server(server->debug_id, refcount_read(&server->ref),
- active, afs_server_trace_gc);
- next = rcu_dereference_protected(
- server->uuid_next, lockdep_is_held(&net->fs_lock.lock));
- prev = server->uuid_prev;
- if (!prev) {
- /* The one at the front is in the tree */
- if (!next) {
- rb_erase(&server->uuid_rb, &net->fs_servers);
- } else {
- rb_replace_node_rcu(&server->uuid_rb,
- &next->uuid_rb,
- &net->fs_servers);
- next->uuid_prev = NULL;
- }
- } else {
- /* This server is not at the front */
- rcu_assign_pointer(prev->uuid_next, next);
- if (next)
- next->uuid_prev = prev;
- }
-
- list_del(&server->probe_link);
- hlist_del_rcu(&server->proc_link);
- if (!hlist_unhashed(&server->addr_link))
- hlist_del_rcu(&server->addr_link);
- }
- write_sequnlock(&net->fs_lock);
+ struct afs_cell *cell = server->cell;
+
+ down_write(&cell->fs_lock);
- if (active == 0)
- afs_destroy_server(net, server);
+ if (!afs_has_server_expired(server)) {
+ up_write(&cell->fs_lock);
+ return false;
}
+
+ set_bit(AFS_SERVER_FL_EXPIRED, &server->flags);
+ _debug("expire %pU %u", &server->uuid, atomic_read(&server->active));
+ afs_see_server(server, afs_server_trace_see_expired);
+ rb_erase(&server->uuid_rb, &cell->fs_servers);
+ up_write(&cell->fs_lock);
+ return true;
}
-/*
- * Manage the records of servers known to be within a network namespace. This
- * includes garbage collecting unused servers.
- *
- * Note also that we were given an increment on net->servers_outstanding by
- * whoever queued us that we need to deal with before returning.
- */
-void afs_manage_servers(struct work_struct *work)
+static void afs_server_destroyer(struct work_struct *work)
{
- struct afs_net *net = container_of(work, struct afs_net, fs_manager);
- struct afs_server *gc_list = NULL;
- struct rb_node *cursor;
- time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
- bool purging = !net->live;
-
- _enter("");
+ struct afs_endpoint_state *estate;
+ struct afs_server *server = container_of(work, struct afs_server, destroyer);
+ struct afs_net *net = server->cell->net;
- /* Trawl the server list looking for servers that have expired from
- * lack of use.
- */
- read_seqlock_excl(&net->fs_lock);
+ afs_see_server(server, afs_server_trace_see_destroyer);
- for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) {
- struct afs_server *server =
- rb_entry(cursor, struct afs_server, uuid_rb);
- int active = atomic_read(&server->active);
+ if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ return;
- _debug("manage %pU %u", &server->uuid, active);
+ if (!afs_remove_server_from_cell(server))
+ return;
- if (purging) {
- trace_afs_server(server->debug_id, refcount_read(&server->ref),
- active, afs_server_trace_purging);
- if (active != 0)
- pr_notice("Can't purge s=%08x\n", server->debug_id);
- }
+ timer_shutdown_sync(&server->timer);
+ cancel_work(&server->destroyer);
- if (active == 0) {
- time64_t expire_at = server->unuse_time;
-
- if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) &&
- !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags))
- expire_at += afs_server_gc_delay;
- if (purging || expire_at <= now) {
- server->gc_next = gc_list;
- gc_list = server;
- } else if (expire_at < next_manage) {
- next_manage = expire_at;
- }
- }
- }
+ if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
+ afs_give_up_callbacks(net, server);
- read_sequnlock_excl(&net->fs_lock);
+ /* Unbind the rxrpc_peer records from the server. */
+ estate = rcu_access_pointer(server->endpoint_state);
+ if (estate)
+ afs_set_peer_appdata(server, estate->addresses, NULL);
- /* Update the timer on the way out. We have to pass an increment on
- * servers_outstanding in the namespace that we are in to the timer or
- * the work scheduler.
- */
- if (!purging && next_manage < TIME64_MAX) {
- now = ktime_get_real_seconds();
+ write_seqlock(&net->fs_lock);
+ list_del_init(&server->probe_link);
+ if (!hlist_unhashed(&server->proc_link))
+ hlist_del_rcu(&server->proc_link);
+ write_sequnlock(&net->fs_lock);
- if (next_manage - now <= 0) {
- if (queue_work(afs_wq, &net->fs_manager))
- afs_inc_servers_outstanding(net);
- } else {
- afs_set_server_timer(net, next_manage - now);
- }
- }
+ afs_put_server(net, server, afs_server_trace_destroy);
+}
- afs_gc_servers(net, gc_list);
+static void afs_server_timer(struct timer_list *timer)
+{
+ struct afs_server *server = container_of(timer, struct afs_server, timer);
- afs_dec_servers_outstanding(net);
- _leave(" [%d]", atomic_read(&net->servers_outstanding));
+ afs_see_server(server, afs_server_trace_see_timer);
+ if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags))
+ schedule_work(&server->destroyer);
}
-static void afs_queue_server_manager(struct afs_net *net)
+/*
+ * Wake up all the servers in a cell so that they can purge themselves.
+ */
+void afs_purge_servers(struct afs_cell *cell)
{
- afs_inc_servers_outstanding(net);
- if (!queue_work(afs_wq, &net->fs_manager))
- afs_dec_servers_outstanding(net);
+ struct afs_server *server;
+ struct rb_node *rb;
+
+ down_read(&cell->fs_lock);
+ for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) {
+ server = rb_entry(rb, struct afs_server, uuid_rb);
+ afs_see_server(server, afs_server_trace_see_purge);
+ schedule_work(&server->destroyer);
+ }
+ up_read(&cell->fs_lock);
}
/*
- * Purge list of servers.
+ * Wait for outstanding servers.
*/
-void afs_purge_servers(struct afs_net *net)
+void afs_wait_for_servers(struct afs_net *net)
{
_enter("");
- if (del_timer_sync(&net->fs_timer))
- afs_dec_servers_outstanding(net);
-
- afs_queue_server_manager(net);
-
- _debug("wait");
atomic_dec(&net->servers_outstanding);
wait_var_event(&net->servers_outstanding,
!atomic_read(&net->servers_outstanding));
@@ -636,7 +552,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
atomic_read(&server->active),
afs_server_trace_update);
- alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
+ alist = afs_vl_lookup_addrs(server, op->key);
if (IS_ERR(alist)) {
rcu_read_lock();
estate = rcu_dereference(server->endpoint_state);
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index 7e7e567a7f8a..20d5474837df 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
if (slist && refcount_dec_and_test(&slist->usage)) {
for (i = 0; i < slist->nr_servers; i++)
afs_unuse_server(net, slist->servers[i].server,
- afs_server_trace_put_slist);
+ afs_server_trace_unuse_slist);
kfree_rcu(slist, rcu);
}
}
@@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
- afs_put_server(volume->cell->net, server,
- afs_server_trace_put_slist_isort);
+ afs_unuse_server_notime(volume->cell->net, server,
+ afs_server_trace_unuse_slist_isort);
continue;
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a9bee610674e..da407f2d6f0d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root)
if (as->dyn_root)
seq_puts(m, ",dyn");
- if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags))
- seq_puts(m, ",autocell");
switch (as->flock_mode) {
case afs_flock_mode_unset: break;
case afs_flock_mode_local: p = "local"; break;
@@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param)
/* lookup the cell record */
if (cellname) {
cell = afs_lookup_cell(ctx->net, cellname, cellnamesz,
- NULL, false);
+ NULL, false,
+ afs_cell_trace_use_lookup_mount);
if (IS_ERR(cell)) {
pr_err("kAFS: unable to lookup cell '%*.*s'\n",
cellnamesz, cellnamesz, cellname ?: "");
return PTR_ERR(cell);
}
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse);
afs_see_cell(cell, afs_cell_trace_see_source);
ctx->cell = cell;
}
@@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc)
ctx->key = NULL;
cell = afs_use_cell(ctx->cell->alias_of,
afs_cell_trace_use_fc_alias);
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
ctx->cell = cell;
goto reget_key;
}
@@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
/* allocate the root inode and dentry */
if (as->dyn_root) {
- inode = afs_iget_pseudo_dir(sb, true);
+ inode = afs_dynroot_iget_root(sb);
} else {
sprintf(sb->s_id, "%llu", as->volume->vid);
afs_activate_volume(as->volume);
@@ -478,21 +477,15 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (ctx->autocell || as->dyn_root)
- set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
-
ret = -ENOMEM;
sb->s_root = d_make_root(inode);
if (!sb->s_root)
goto error;
if (as->dyn_root) {
- sb->s_d_op = &afs_dynroot_dentry_operations;
- ret = afs_dynroot_populate(sb);
- if (ret < 0)
- goto error;
+ set_default_d_op(sb, &afs_dynroot_dentry_operations);
} else {
- sb->s_d_op = &afs_fs_dentry_operations;
+ set_default_d_op(sb, &afs_fs_dentry_operations);
rcu_assign_pointer(as->volume->sb, sb);
}
@@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc)
static void afs_destroy_sbi(struct afs_super_info *as)
{
if (as) {
- struct afs_net *net = afs_net(as->net_ns);
afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
- afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
+ afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
}
@@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb)
{
struct afs_super_info *as = AFS_FS_S(sb);
- if (as->dyn_root)
- afs_dynroot_depopulate(sb);
-
/* Clear the callback interests (which will do ilookup5) before
* deactivating the superblock.
*/
@@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc)
afs_destroy_sbi(fc->s_fs_info);
afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
- afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
+ afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index f9e76b604f31..709b4cdb723e 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key)
goto is_alias;
if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) {
- afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+ afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
return -ERESTARTSYS;
}
- afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias);
+ afs_unuse_cell(p, afs_cell_trace_unuse_check_alias);
}
mutex_unlock(&cell->net->proc_cells_lock);
@@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key)
if (!name_len || name_len > AFS_MAXCELLNAME)
master = ERR_PTR(-EOPNOTSUPP);
else
- master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false);
+ master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false,
+ afs_cell_trace_use_lookup_canonical);
kfree(cell_name);
if (IS_ERR(master))
return PTR_ERR(master);
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index d8f79f6ada3d..6ad9688d8f4b 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
cell->dns_expiry <= ktime_get_real_seconds()) {
dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count);
set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags);
- afs_queue_cell(cell, afs_cell_trace_get_queue_dns);
+ afs_queue_cell(cell, afs_cell_trace_queue_dns);
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
if (wait_var_event_interruptible(
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index af3a3f57c1b3..0efff3d25133 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -10,6 +10,7 @@
#include "internal.h"
static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static atomic_t afs_volume_debug_id;
static void afs_destroy_volume(struct work_struct *work);
@@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
struct afs_cell *cell = volume->cell;
if (!hlist_unhashed(&volume->proc_link)) {
- trace_afs_volume(volume->vid, refcount_read(&cell->ref),
+ trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
afs_volume_trace_remove);
write_seqlock(&cell->volume_lock);
hlist_del_rcu(&volume->proc_link);
@@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
if (!volume)
goto error_0;
+ volume->debug_id = atomic_inc_return(&afs_volume_debug_id);
volume->vid = vldb->vid[params->type];
volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol);
@@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
*_slist = slist;
rcu_assign_pointer(volume->servers, slist);
- trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
+ trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc);
return volume;
error_1:
@@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work)
afs_remove_volume_from_cell(volume);
afs_put_serverlist(volume->cell->net, slist);
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
- trace_afs_volume(volume->vid, refcount_read(&volume->ref),
+ trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
kfree_rcu(volume, rcu);
@@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason)
int r;
if (__refcount_inc_not_zero(&volume->ref, &r)) {
- trace_afs_volume(volume->vid, r + 1, reason);
+ trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
return true;
}
return false;
@@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
int r;
__refcount_inc(&volume->ref, &r);
- trace_afs_volume(volume->vid, r + 1, reason);
+ trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason);
}
return volume;
}
@@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
{
if (volume) {
+ unsigned int debug_id = volume->debug_id;
afs_volid_t vid = volume->vid;
bool zero;
int r;
zero = __refcount_dec_and_test(&volume->ref, &r);
- trace_afs_volume(vid, r - 1, reason);
+ trace_afs_volume(debug_id, vid, r - 1, reason);
if (zero)
schedule_work(&volume->destructor);
}
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 18b0a9f1615e..2e7526ea883a 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -120,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work)
#if 0 // Error injection
if (subreq->debug_index == 3)
- return netfs_write_subrequest_terminated(subreq, -ENOANO, false);
+ return netfs_write_subrequest_terminated(subreq, -ENOANO);
if (!subreq->retry_count) {
set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
- return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN);
}
#endif
op = afs_alloc_operation(wreq->netfs_priv, vnode->volume);
if (IS_ERR(op))
- return netfs_write_subrequest_terminated(subreq, -EAGAIN, false);
+ return netfs_write_subrequest_terminated(subreq, -EAGAIN);
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
@@ -166,7 +166,7 @@ static void afs_issue_write_worker(struct work_struct *work)
break;
}
- netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len);
}
void afs_issue_write(struct netfs_io_subrequest *subreq)
@@ -202,6 +202,7 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st
case NETFS_READ_GAPS:
case NETFS_READ_SINGLE:
case NETFS_READ_FOR_WRITE:
+ case NETFS_UNBUFFERED_READ:
case NETFS_DIO_READ:
return;
default:
diff --git a/fs/aio.c b/fs/aio.c
index 7b976b564cfc..7fc7b6221312 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -392,15 +392,15 @@ static const struct vm_operations_struct aio_ring_vm_ops = {
#endif
};
-static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_mmap_prepare(struct vm_area_desc *desc)
{
- vm_flags_set(vma, VM_DONTEXPAND);
- vma->vm_ops = &aio_ring_vm_ops;
+ desc->vm_flags |= VM_DONTEXPAND;
+ desc->vm_ops = &aio_ring_vm_ops;
return 0;
}
static const struct file_operations aio_ring_fops = {
- .mmap = aio_ring_mmap,
+ .mmap_prepare = aio_ring_mmap_prepare,
};
#if IS_ENABLED(CONFIG_MIGRATION)
@@ -1511,6 +1511,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type)
{
int ret;
+ req->ki_write_stream = 0;
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 42bd1cb7c9cd..180a458fc4f7 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,10 +24,51 @@
#include <linux/uaccess.h>
+#include "internal.h"
+
static struct vfsmount *anon_inode_mnt __ro_after_init;
static struct inode *anon_inode_inode __ro_after_init;
/*
+ * User space expects anonymous inodes to have no file type in st_mode.
+ *
+ * In particular, 'lsof' has this legacy logic:
+ *
+ * type = s->st_mode & S_IFMT;
+ * switch (type) {
+ * ...
+ * case 0:
+ * if (!strcmp(p, "anon_inode"))
+ * Lf->ntype = Ntype = N_ANON_INODE;
+ *
+ * to detect our old anon_inode logic.
+ *
+ * Rather than mess with our internal sane inode data, just fix it
+ * up here in getattr() by masking off the format bits.
+ */
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+
+ generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+ stat->mode &= ~S_IFMT;
+ return 0;
+}
+
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct inode_operations anon_inode_operations = {
+ .getattr = anon_inode_getattr,
+ .setattr = anon_inode_setattr,
+};
+
+/*
* anon_inodefs_dname() is called from d_path().
*/
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
@@ -45,6 +86,8 @@ static int anon_inodefs_init_fs_context(struct fs_context *fc)
struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->dops = &anon_inodefs_dentry_operations;
return 0;
}
@@ -55,25 +98,38 @@ static struct file_system_type anon_inode_fs_type = {
.kill_sb = kill_anon_super,
};
-static struct inode *anon_inode_make_secure_inode(
- const char *name,
- const struct inode *context_inode)
+/**
+ * anon_inode_make_secure_inode - allocate an anonymous inode with security context
+ * @sb: [in] Superblock to allocate from
+ * @name: [in] Name of the class of the newfile (e.g., "secretmem")
+ * @context_inode:
+ * [in] Optional parent inode for security inheritance
+ *
+ * The function ensures proper security initialization through the LSM hook
+ * security_inode_init_security_anon().
+ *
+ * Return: Pointer to new inode on success, ERR_PTR on failure.
+ */
+struct inode *anon_inode_make_secure_inode(struct super_block *sb, const char *name,
+ const struct inode *context_inode)
{
struct inode *inode;
- const struct qstr qname = QSTR_INIT(name, strlen(name));
int error;
- inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ inode = alloc_anon_inode(sb);
if (IS_ERR(inode))
return inode;
inode->i_flags &= ~S_PRIVATE;
- error = security_inode_init_security_anon(inode, &qname, context_inode);
+ inode->i_op = &anon_inode_operations;
+ error = security_inode_init_security_anon(inode, &QSTR(name),
+ context_inode);
if (error) {
iput(inode);
return ERR_PTR(error);
}
return inode;
}
+EXPORT_SYMBOL_FOR_MODULES(anon_inode_make_secure_inode, "kvm");
static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
@@ -88,7 +144,8 @@ static struct file *__anon_inode_getfile(const char *name,
return ERR_PTR(-ENOENT);
if (make_inode) {
- inode = anon_inode_make_secure_inode(name, context_inode);
+ inode = anon_inode_make_secure_inode(anon_inode_mnt->mnt_sb,
+ name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
goto err;
@@ -313,6 +370,7 @@ static int __init anon_inode_init(void)
anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
if (IS_ERR(anon_inode_inode))
panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+ anon_inode_inode->i_op = &anon_inode_operations;
return 0;
}
diff --git a/fs/attr.c b/fs/attr.c
index 9caf63d20d03..5425c1dbbff9 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -230,7 +230,7 @@ EXPORT_SYMBOL(setattr_prepare);
* @inode: the inode to be truncated
* @offset: the new size to assign to the inode
*
- * inode_newsize_ok must be called with i_mutex held.
+ * inode_newsize_ok must be called with i_rwsem held exclusively.
*
* inode_newsize_ok will check filesystem limits and ulimits to check that the
* new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
@@ -318,7 +318,7 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
* @inode: the inode to be updated
* @attr: the new attributes
*
- * setattr_copy must be called with i_mutex held.
+ * setattr_copy must be called with i_rwsem held exclusively.
*
* setattr_copy updates the inode's metadata with that specified
* in attr on idmapped mounts. Necessary permission checks to determine
@@ -403,13 +403,13 @@ EXPORT_SYMBOL(may_setattr);
* @attr: new attributes
* @delegated_inode: returns inode, if the inode is delegated
*
- * The caller must hold the i_mutex on the affected object.
+ * The caller must hold the i_rwsem exclusively on the affected object.
*
* If notify_change discovers a delegation in need of breaking,
* it will return -EWOULDBLOCK and return a reference to the inode in
* delegated_inode. The caller should then break the delegation and
* retry. Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -456,7 +456,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
- /* Flag setting protected by i_mutex */
+ /* Flag setting protected by i_rwsem */
if (is_sxid(attr->ia_mode))
inode->i_flags &= ~S_NOSEC;
}
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 77c7991d89aa..23cea74f9933 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *);
static inline int autofs_check_pipe(struct file *pipe)
{
+ if (pipe->f_mode & FMODE_PATH)
+ return -EINVAL;
if (!(pipe->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 6d57efbb8110..d8dd150cbd74 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
sbi->exp_timeout = timeout * HZ;
} else {
struct dentry *base = fp->f_path.dentry;
- struct inode *inode = base->d_inode;
int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
struct dentry *dentry;
struct autofs_info *ino;
@@ -460,9 +459,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
"the parent autofs mount timeout which could "
"prevent shutdown\n");
- inode_lock_shared(inode);
- dentry = try_lookup_one_len(param->path, base, path_len);
- inode_unlock_shared(inode);
+ dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len),
+ base);
if (IS_ERR_OR_NULL(dentry))
return dentry ? PTR_ERR(dentry) : -ENOENT;
ino = autofs_dentry_ino(dentry);
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index ee2edccaef70..f5c16ffba013 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -311,7 +311,7 @@ static int autofs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_blocksize_bits = 10;
s->s_magic = AUTOFS_SUPER_MAGIC;
s->s_op = &autofs_sops;
- s->s_d_op = &autofs_dentry_operations;
+ set_default_d_op(s, &autofs_dentry_operations);
s->s_time_gran = 1;
/*
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 530d18827e35..174c7205fee4 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *,
struct dentry *, const char *);
static int autofs_dir_unlink(struct inode *, struct dentry *);
static int autofs_dir_rmdir(struct inode *, struct dentry *);
-static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
- struct dentry *, umode_t);
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *,
+ struct dentry *, umode_t);
static long autofs_root_ioctl(struct file *, unsigned int, unsigned long);
#ifdef CONFIG_COMPAT
static long autofs_root_compat_ioctl(struct file *,
@@ -720,9 +720,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry)
return 0;
}
-static int autofs_dir_mkdir(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb);
struct autofs_info *ino = autofs_dentry_ino(dentry);
@@ -739,7 +739,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
d_add(dentry, inode);
if (sbi->version < 5)
@@ -751,7 +751,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap,
inc_nlink(dir);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- return 0;
+ return NULL;
}
/* Get/set timeout ioctl() operation */
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 763fbe9b72b2..15a7f8031084 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -41,7 +41,7 @@ struct file *backing_file_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_open(real_path, f);
if (error) {
fput(f);
@@ -65,7 +65,7 @@ struct file *backing_tmpfile_open(const struct path *user_path, int flags,
return f;
path_get(user_path);
- *backing_file_user_path(f) = *user_path;
+ backing_file_set_user_path(f, user_path);
error = vfs_tmpfile(real_idmap, real_parentpath, f, mode);
if (error) {
fput(f);
@@ -333,13 +333,13 @@ int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
return -EIO;
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return -ENODEV;
vma_set_file(vma, file);
old_cred = override_creds(ctx->cred);
- ret = call_mmap(vma->vm_file, vma);
+ ret = vfs_mmap(vma->vm_file, vma);
revert_creds(old_cred);
if (ctx->accessed)
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 316d88da2ce1..0ef9bcb744dd 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap,
return -EIO;
}
-static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return -EIO;
+ return ERR_PTR(-EIO);
}
static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry)
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index 85eea7a4dea3..8cb2b9d5da96 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -4,7 +4,7 @@ config BCACHEFS_FS
depends on BLOCK
select EXPORTFS
select CLOSURES
- select LIBCRC32C
+ select CRC32
select CRC64
select FS_POSIX_ACL
select LZ4_COMPRESS
@@ -15,10 +15,9 @@ config BCACHEFS_FS
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
- select CRYPTO
- select CRYPTO_SHA256
- select CRYPTO_CHACHA20
- select CRYPTO_POLY1305
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_CHACHA
+ select CRYPTO_LIB_POLY1305
select KEYS
select RAID6_PQ
select XOR_BLOCKS
@@ -26,6 +25,7 @@ config BCACHEFS_FS
select SRCU
select SYMBOLIC_ERRNAME
select MIN_HEAP
+ select XARRAY_MULTI
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
@@ -61,6 +61,13 @@ config BCACHEFS_DEBUG
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
+config BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ bool "Randomly inject transaction restarts"
+ depends on BCACHEFS_DEBUG
+ help
+ Randomly inject transaction restarts in a few core paths - may have a
+ significant performance penalty
+
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
@@ -96,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS
Enable extra tracepoints for debugging btree_path operations; we don't
normally want these enabled because they happen at very high rates.
+config BCACHEFS_TRANS_KMALLOC_TRACE
+ bool "Trace bch2_trans_kmalloc() calls"
+ depends on BCACHEFS_FS
+
+config BCACHEFS_ASYNC_OBJECT_LISTS
+ bool "Keep async objects on fast_lists for debugfs visibility"
+ depends on BCACHEFS_FS && DEBUG_FS
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index d2689388d5e8..93c8ee5425c8 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -35,13 +35,14 @@ bcachefs-y := \
disk_accounting.o \
disk_groups.o \
ec.o \
+ enumerated_ref.o \
errcode.o \
error.o \
extents.o \
extent_update.o \
eytzinger.o \
+ fast_list.o \
fs.o \
- fs-common.o \
fs-ioctl.o \
fs-io.o \
fs-io-buffered.o \
@@ -64,9 +65,11 @@ bcachefs-y := \
migrate.o \
move.o \
movinggc.o \
+ namei.o \
nocow_locking.o \
opts.o \
printbuf.o \
+ progress.o \
quota.o \
rebalance.o \
rcu_pending.o \
@@ -96,6 +99,8 @@ bcachefs-y := \
varint.o \
xattr.o
+bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o
+
obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o
# Silence "note: xyz changed in GCC X.X" messages
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 99487727ae64..d03adc36100e 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct posix_acl *acl = NULL;
if (rcu)
@@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode_u;
struct posix_acl *acl;
umode_t mode;
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fc2ef33b67b3..66de46318620 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -17,10 +17,10 @@
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
-#include "trace.h"
#include "varint.h"
#include <linux/kthread.h>
@@ -232,7 +232,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k,
int ret = 0;
bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k),
- c, alloc_v2_unpack_error,
+ c, alloc_v3_unpack_error,
"unpack error");
fsck_err:
return ret;
@@ -308,7 +308,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k,
"data type inconsistency");
bkey_fsck_err_on(!a.io_time[READ] &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+ !(c->recovery.passes_to_run &
+ BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)),
c, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
@@ -335,11 +336,10 @@ void bch2_alloc_v4_swab(struct bkey_s k)
a->stripe_sectors = swab32(a->stripe_sectors);
}
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c,
+ unsigned dev, const struct bch_alloc_v4 *a)
{
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
+ struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL;
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -367,6 +367,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
bch2_dev_put(ca);
}
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+
+ __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a);
+}
+
+void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+ __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v);
+}
+
void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
if (k.k->type == KEY_TYPE_alloc_v4) {
@@ -478,12 +491,27 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
struct btree_iter iter;
- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
- int ret = PTR_ERR_OR_ZERO(a);
- if (ret)
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos,
+ BTREE_ITER_with_updates|
+ BTREE_ITER_cached|
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (unlikely(ret))
return ERR_PTR(ret);
- ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
+ if ((void *) k.v >= trans->mem &&
+ (void *) k.v < trans->mem + trans->mem_top) {
+ bch2_trans_iter_exit(trans, &iter);
+ return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v);
+ }
+
+ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
+ if (IS_ERR(a)) {
+ bch2_trans_iter_exit(trans, &iter);
+ return a;
+ }
+
+ ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
bch2_trans_iter_exit(trans, &iter);
return unlikely(ret) ? ERR_PTR(ret) : a;
}
@@ -589,6 +617,8 @@ iter_err:
int bch2_alloc_read(struct bch_fs *c)
{
+ down_read(&c->state_lock);
+
struct btree_trans *trans = bch2_trans_get(c);
struct bch_dev *ca = NULL;
int ret;
@@ -608,7 +638,7 @@ int bch2_alloc_read(struct bch_fs *c)
* bch2_check_alloc_key() which runs later:
*/
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
@@ -629,17 +659,17 @@ int bch2_alloc_read(struct bch_fs *c)
* bch2_check_alloc_key() which runs later:
*/
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
if (k.k->p.offset < ca->mi.first_bucket) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket));
continue;
}
if (k.k->p.offset >= ca->mi.nbuckets) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
@@ -652,6 +682,7 @@ int bch2_alloc_read(struct bch_fs *c)
bch2_dev_put(ca);
bch2_trans_put(trans);
+ up_read(&c->state_lock);
bch_err_fn(c, ret);
return ret;
}
@@ -673,13 +704,12 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans,
bch2_bkey_val_to_text(&buf, c, alloc_k);
int ret = __bch2_fsck_err(NULL, trans, flags, err_id,
- "bucket incorrectly %sset in %s btree\n"
- " %s",
+ "bucket incorrectly %sset in %s btree\n%s",
set ? "" : "un",
bch2_btree_id_str(btree),
buf.buf);
- if (ret == -BCH_ERR_fsck_ignore ||
- ret == -BCH_ERR_fsck_errors_not_fixed)
+ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) ||
+ bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed))
ret = 0;
printbuf_exit(&buf);
@@ -777,14 +807,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s
s64 delta_sectors,
s64 delta_fragmented, unsigned flags)
{
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = data_type,
- };
s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented };
- return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc);
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ d, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = data_type);
}
int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca,
@@ -837,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p);
if (!ca)
- return -EIO;
+ return bch_err_throw(c, trigger_alloc);
struct bch_alloc_v4 old_a_convert;
const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
@@ -871,6 +899,9 @@ int bch2_trigger_alloc(struct btree_trans *trans,
if (data_type_is_empty(new_a->data_type) &&
BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
!bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+ if (new_a->oldest_gen == new_a->gen &&
+ !bch2_bucket_sectors_total(*new_a))
+ new_a->oldest_gen++;
new_a->gen++;
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
alloc_data_type_set(new_a, new_a->data_type);
@@ -889,26 +920,20 @@ int bch2_trigger_alloc(struct btree_trans *trans,
!new_a->io_time[READ])
new_a->io_time[READ] = bch2_current_io_time(c, READ);
- u64 old_lru = alloc_lru_idx_read(*old_a);
- u64 new_lru = alloc_lru_idx_read(*new_a);
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans, new.k->p.inode,
- bucket_to_u64(new.k->p),
- old_lru, new_lru);
- if (ret)
- goto err;
- }
+ ret = bch2_lru_change(trans, new.k->p.inode,
+ bucket_to_u64(new.k->p),
+ alloc_lru_idx_read(*old_a),
+ alloc_lru_idx_read(*new_a));
+ if (ret)
+ goto err;
- old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
- new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans,
- BCH_LRU_FRAGMENTATION_START,
- bucket_to_u64(new.k->p),
- old_lru, new_lru);
- if (ret)
- goto err;
- }
+ ret = bch2_lru_change(trans,
+ BCH_LRU_BUCKET_FRAGMENTATION,
+ bucket_to_u64(new.k->p),
+ alloc_lru_idx_fragmentation(*old_a, ca),
+ alloc_lru_idx_fragmentation(*new_a, ca));
+ if (ret)
+ goto err;
if (old_a->gen != new_a->gen) {
ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
@@ -916,15 +941,6 @@ int bch2_trigger_alloc(struct btree_trans *trans,
goto err;
}
- if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
- old_a->cached_sectors) {
- ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- flags & BTREE_TRIGGER_gc);
- if (ret)
- goto err;
- }
-
ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags);
if (ret)
goto err;
@@ -983,14 +999,11 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if (new_a->gen != old_a->gen) {
- rcu_read_lock();
+ guard(rcu)();
u8 *gen = bucket_gen(ca, new.k->p.offset);
- if (unlikely(!gen)) {
- rcu_read_unlock();
+ if (unlikely(!gen))
goto invalid_bucket;
- }
*gen = new_a->gen;
- rcu_read_unlock();
}
#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; })
@@ -1016,15 +1029,12 @@ int bch2_trigger_alloc(struct btree_trans *trans,
}
if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) {
- rcu_read_lock();
+ guard(rcu)();
struct bucket *g = gc_bucket(ca, new.k->p.offset);
- if (unlikely(!g)) {
- rcu_read_unlock();
+ if (unlikely(!g))
goto invalid_bucket;
- }
g->gen_valid = 1;
g->gen = new_a->gen;
- rcu_read_unlock();
}
err:
fsck_err:
@@ -1032,9 +1042,9 @@ fsck_err:
bch2_dev_put(ca);
return ret;
invalid_bucket:
- bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
+ bch2_fs_inconsistent(c, "reference to invalid bucket\n%s",
(bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
- ret = -EIO;
+ ret = bch_err_throw(c, trigger_alloc);
goto err;
}
@@ -1042,9 +1052,10 @@ invalid_bucket:
* This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
* extents style btrees, but works on non-extents btrees:
*/
-static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, struct bkey *hole)
{
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k))
return k;
@@ -1055,9 +1066,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
struct btree_iter iter2;
struct bpos next;
- bch2_trans_copy_iter(&iter2, iter);
+ bch2_trans_copy_iter(trans, &iter2, iter);
- struct btree_path *path = btree_iter_path(iter->trans, iter);
+ struct btree_path *path = btree_iter_path(trans, iter);
if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
@@ -1067,9 +1078,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
* btree node min/max is a closed interval, upto takes a half
* open interval:
*/
- k = bch2_btree_iter_peek_max(&iter2, end);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
next = iter2.pos;
- bch2_trans_iter_exit(iter->trans, &iter2);
+ bch2_trans_iter_exit(trans, &iter2);
BUG_ON(next.offset >= iter->pos.offset + U32_MAX);
@@ -1099,24 +1110,24 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck
bucket->offset = 0;
}
- rcu_read_lock();
+ guard(rcu)();
*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
if (*ca) {
*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
bch2_dev_get(*ca);
}
- rcu_read_unlock();
return *ca != NULL;
}
-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
- struct bch_dev **ca, struct bkey *hole)
+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bch_dev **ca, struct bkey *hole)
{
- struct bch_fs *c = iter->trans->c;
+ struct bch_fs *c = trans->c;
struct bkey_s_c k;
again:
- k = bch2_get_key_or_hole(iter, POS_MAX, hole);
+ k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole);
if (bkey_err(k))
return k;
@@ -1129,7 +1140,7 @@ again:
if (!next_bucket(c, ca, &hole_start))
return bkey_s_c_null;
- bch2_btree_iter_set_pos(iter, hole_start);
+ bch2_btree_iter_set_pos(trans, iter, hole_start);
goto again;
}
@@ -1170,8 +1181,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
a = bch2_alloc_to_v4(alloc_k, &a_convert);
- bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
- k = bch2_btree_iter_peek_slot(discard_iter);
+ bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p);
+ k = bch2_btree_iter_peek_slot(trans, discard_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1184,8 +1195,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
}
- bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
- k = bch2_btree_iter_peek_slot(freespace_iter);
+ bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1198,16 +1209,15 @@ int bch2_check_alloc_key(struct btree_trans *trans,
goto err;
}
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
ret = bkey_err(k);
if (ret)
goto err;
if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
trans, bucket_gens_key_wrong,
- "incorrect gen in bucket_gens btree (got %u should be %u)\n"
- " %s",
+ "incorrect gen in bucket_gens btree (got %u should be %u)\n%s",
alloc_gen(k, gens_offset), a->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
@@ -1253,9 +1263,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
if (!ca->mi.freespace_initialized)
return 0;
- bch2_btree_iter_set_pos(freespace_iter, start);
+ bch2_btree_iter_set_pos(trans, freespace_iter, start);
- k = bch2_btree_iter_peek_slot(freespace_iter);
+ k = bch2_btree_iter_peek_slot(trans, freespace_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1265,7 +1275,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
if (fsck_err_on(k.k->type != KEY_TYPE_set,
trans, freespace_hole_missing,
"hole in alloc btree missing in freespace btree\n"
- " device %llu buckets %llu-%llu",
+ "device %llu buckets %llu-%llu",
freespace_iter->pos.inode,
freespace_iter->pos.offset,
end->offset)) {
@@ -1304,9 +1314,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
unsigned i, gens_offset, gens_end_offset;
int ret;
- bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
+ bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
- k = bch2_btree_iter_peek_slot(bucket_gens_iter);
+ k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1383,7 +1393,7 @@ static void check_discard_freespace_key_work(struct work_struct *work)
container_of(work, struct check_discard_freespace_key_async, work);
bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos));
- bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key);
+ enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key);
kfree(w);
}
@@ -1396,6 +1406,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
: BCH_DATA_free;
struct printbuf buf = PRINTBUF;
+ unsigned fsck_flags = (async_repair ? FSCK_ERR_NO_LOG : 0)|
+ FSCK_CAN_FIX|FSCK_CAN_IGNORE;
+
struct bpos bucket = iter->pos;
bucket.offset &= ~(~0ULL << 56);
u64 genbits = iter->pos.offset & (~0ULL << 56);
@@ -1409,9 +1422,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
return ret;
if (!bch2_dev_bucket_exists(c, bucket)) {
- if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket,
- "entry in %s btree for nonexistant dev:bucket %llu:%llu",
- bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
+ if (__fsck_err(trans, fsck_flags,
+ need_discard_freespace_key_to_invalid_dev_bucket,
+ "entry in %s btree for nonexistant dev:bucket %llu:%llu",
+ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset))
goto delete;
ret = 1;
goto out;
@@ -1423,8 +1437,9 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
if (a->data_type != state ||
(state == BCH_DATA_free &&
genbits != alloc_freespace_genbits(*a))) {
- if (fsck_err(trans, need_discard_freespace_key_bad,
- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
+ if (__fsck_err(trans, fsck_flags,
+ need_discard_freespace_key_bad,
+ "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
bch2_btree_id_str(iter->btree_id),
iter->pos.inode,
@@ -1439,7 +1454,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite
*gen = a->gen;
out:
fsck_err:
- bch2_set_btree_iter_dontneed(&alloc_iter);
+ bch2_set_btree_iter_dontneed(trans, &alloc_iter);
bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
@@ -1448,7 +1463,7 @@ delete:
ret = bch2_btree_bit_mod_iter(trans, iter, false) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_commit;
+ bch_err_throw(c, transaction_restart_commit);
goto out;
} else {
/*
@@ -1460,7 +1475,7 @@ delete:
if (!w)
goto out;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) {
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) {
kfree(w);
goto out;
}
@@ -1469,6 +1484,8 @@ delete:
w->c = c;
w->pos = BBPOS(iter->btree_id, iter->pos);
queue_work(c->write_ref_wq, &w->work);
+
+ ret = 1; /* don't allocate from this bucket */
goto out;
}
}
@@ -1505,7 +1522,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
if (!ca) {
if (fsck_err(trans, bucket_gens_to_invalid_dev,
- "bucket_gens key for invalid device:\n %s",
+ "bucket_gens key for invalid device:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter, 0);
goto out;
@@ -1514,7 +1531,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
if (fsck_err_on(end <= ca->mi.first_bucket ||
start >= ca->mi.nbuckets,
trans, bucket_gens_to_invalid_buckets,
- "bucket_gens key for invalid buckets:\n %s",
+ "bucket_gens key for invalid buckets:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = bch2_btree_delete_at(trans, iter, 0);
goto out;
@@ -1576,7 +1593,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
bch2_trans_begin(trans);
- k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
+ k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -1614,7 +1631,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(&iter, next);
+ bch2_btree_iter_set_pos(trans, &iter, next);
bkey_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
@@ -1642,7 +1659,7 @@ bkey_err:
BTREE_ITER_prefetch);
while (1) {
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
if (!k.k)
break;
@@ -1661,7 +1678,7 @@ bkey_err:
break;
}
- bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos));
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
@@ -1689,7 +1706,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret;
- alloc_k = bch2_btree_iter_peek(alloc_iter);
+ alloc_k = bch2_btree_iter_peek(trans, alloc_iter);
if (!alloc_k.k)
return 0;
@@ -1705,7 +1722,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
if (lru_idx) {
- ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
+ ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION,
+ bucket_to_u64(alloc_k.k->p),
lru_idx, alloc_k, last_flushed);
if (ret)
goto err;
@@ -1716,8 +1734,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (fsck_err_on(!a->io_time[READ],
trans, alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n"
- " %s",
+ "cached bucket with read_time 0\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
struct bkey_i_alloc_v4 *a_mut =
@@ -1735,7 +1752,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
a = &a_mut->v;
}
- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ],
+ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode,
+ bucket_to_u64(alloc_k.k->p),
+ a->io_time[READ],
alloc_k, last_flushed);
if (ret)
goto err;
@@ -1757,7 +1776,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed)));
+ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?:
+ bch2_check_stripe_to_lru_refs(c);
bch2_bkey_buf_exit(&last_flushed, c);
bch_err_fn(c, ret);
@@ -1766,14 +1786,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress)
{
+ struct bch_fs *c = ca->fs;
int ret;
mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- ret = -BCH_ERR_EEXIST_discard_in_flight_add;
- goto out;
- }
+ struct discard_in_flight *i =
+ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+ if (i) {
+ ret = bch_err_throw(c, EEXIST_discard_in_flight_add);
+ goto out;
+ }
ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) {
.in_progress = in_progress,
@@ -1787,14 +1809,11 @@ out:
static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket)
{
mutex_lock(&ca->discard_buckets_in_flight_lock);
- darray_for_each(ca->discard_buckets_in_flight, i)
- if (i->bucket == bucket) {
- BUG_ON(!i->in_progress);
- darray_remove_item(&ca->discard_buckets_in_flight, i);
- goto found;
- }
- BUG();
-found:
+ struct discard_in_flight *i =
+ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket);
+ BUG_ON(!i || !i->in_progress);
+
+ darray_remove_item(&ca->discard_buckets_in_flight, i);
mutex_unlock(&ca->discard_buckets_in_flight_lock);
}
@@ -1803,7 +1822,6 @@ struct discard_buckets_state {
u64 open;
u64 need_journal_commit;
u64 discarded;
- u64 need_journal_commit_this_dev;
};
static int bch2_discard_one_bucket(struct btree_trans *trans,
@@ -1815,7 +1833,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_s_c k;
struct bkey_i_alloc_v4 *a;
struct printbuf buf = PRINTBUF;
@@ -1827,11 +1845,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
goto out;
}
- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- pos.inode, pos.offset)) {
- s->need_journal_commit++;
- s->need_journal_commit_this_dev++;
+ u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+ pos.inode, pos.offset);
+ if (seq_ready > c->journal.flushed_seq_ondisk) {
+ if (seq_ready > c->journal.flushing_seq)
+ s->need_journal_commit++;
goto out;
}
@@ -1865,23 +1883,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
discard_locked = true;
}
- if (!bkey_eq(*discard_pos_done, iter.pos) &&
- ca->mi.discard && !c->opts.nochanges) {
- /*
- * This works without any other locks because this is the only
- * thread that removes items from the need_discard tree
- */
- bch2_trans_unlock_long(trans);
- blkdev_issue_discard(ca->disk_sb.bdev,
- k.k->p.offset * ca->mi.bucket_size,
- ca->mi.bucket_size,
- GFP_KERNEL);
- *discard_pos_done = iter.pos;
+ if (!bkey_eq(*discard_pos_done, iter.pos)) {
s->discarded++;
+ *discard_pos_done = iter.pos;
- ret = bch2_trans_relock_notrace(trans);
- if (ret)
- goto out;
+ if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) {
+ /*
+ * This works without any other locks because this is the only
+ * thread that removes items from the need_discard tree
+ */
+ bch2_trans_unlock_long(trans);
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ k.k->p.offset * ca->mi.bucket_size,
+ ca->mi.bucket_size,
+ GFP_KERNEL);
+ ret = bch2_trans_relock_notrace(trans);
+ if (ret)
+ goto out;
+ }
}
SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
@@ -1897,7 +1916,10 @@ commit:
if (ret)
goto out;
- count_event(c, bucket_discard);
+ if (!fastpath)
+ count_event(c, bucket_discard);
+ else
+ count_event(c, bucket_discard_fast);
out:
fsck_err:
if (discard_locked)
@@ -1929,29 +1951,32 @@ static void bch2_do_discards_work(struct work_struct *work)
POS(ca->dev_idx, U64_MAX), 0, k,
bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false)));
+ if (s.need_journal_commit > dev_buckets_available(ca, BCH_WATERMARK_normal))
+ bch2_journal_flush_async(&c->journal, NULL);
+
trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
- percpu_ref_put(&ca->io_ref);
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
}
void bch2_dev_do_discards(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards))
goto put_write_ref;
if (queue_work(c->write_ref_wq, &ca->discard_work))
return;
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards);
put_write_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -2024,11 +2049,11 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
break;
}
- trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
+ trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret));
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
}
static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
@@ -2038,30 +2063,88 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket)
if (discard_in_flight_add(ca, bucket, false))
return;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast))
goto put_ref;
if (queue_work(c->write_ref_wq, &ca->discard_fast_work))
return;
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast);
put_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast);
+}
+
+static int invalidate_one_bp(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bkey_s_c_backpointer bp,
+ struct bkey_buf *last_flushed)
+{
+ struct btree_iter extent_iter;
+ struct bkey_s_c extent_k =
+ bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed);
+ int ret = bkey_err(extent_k);
+ if (ret)
+ return ret;
+
+ if (!extent_k.k)
+ return 0;
+
+ struct bkey_i *n =
+ bch2_bkey_make_mut(trans, &extent_iter, &extent_k,
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx);
+err:
+ bch2_trans_iter_exit(trans, &extent_iter);
+ return ret;
+}
+
+static int invalidate_one_bucket_by_bps(struct btree_trans *trans,
+ struct bch_dev *ca,
+ struct bpos bucket,
+ u8 gen,
+ struct bkey_buf *last_flushed)
+{
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket);
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket);
+
+ return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
+ bp_start, bp_end, 0, k,
+ NULL, NULL,
+ BCH_WATERMARK_btree|
+ BCH_TRANS_COMMIT_no_enospc, ({
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+ if (bp.v->bucket_gen != gen)
+ continue;
+
+ /* filter out bps with gens that don't match */
+
+ invalidate_one_bp(trans, ca, bp, last_flushed);
+ }));
}
+noinline_for_stack
static int invalidate_one_bucket(struct btree_trans *trans,
+ struct bch_dev *ca,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
+ struct bkey_buf *last_flushed,
s64 *nr_to_invalidate)
{
struct bch_fs *c = trans->c;
- struct bkey_i_alloc_v4 *a = NULL;
struct printbuf buf = PRINTBUF;
struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
- unsigned cached_sectors;
+ struct btree_iter alloc_iter = {};
int ret = 0;
if (*nr_to_invalidate <= 0)
@@ -2078,35 +2161,40 @@ static int invalidate_one_bucket(struct btree_trans *trans,
if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
return 0;
- a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate);
- ret = PTR_ERR_OR_ZERO(a);
+ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket,
+ BTREE_ITER_cached);
+ ret = bkey_err(alloc_k);
if (ret)
- goto out;
+ return ret;
+
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
/* We expect harmless races here due to the btree write buffer: */
- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a))
goto out;
- BUG_ON(a->v.data_type != BCH_DATA_cached);
- BUG_ON(a->v.dirty_sectors);
-
- if (!a->v.cached_sectors)
- bch_err(c, "invalidating empty bucket, confused");
+ /*
+ * Impossible since alloc_lru_idx_read() only returns nonzero if the
+ * bucket is supposed to be on the cached bucket LRU (i.e.
+ * BCH_DATA_cached)
+ *
+ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0
+ */
+ BUG_ON(a->data_type != BCH_DATA_cached);
+ BUG_ON(a->dirty_sectors);
- cached_sectors = a->v.cached_sectors;
+ if (!a->cached_sectors) {
+ bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset,
+ true, last_flushed);
+ goto out;
+ }
- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
- a->v.gen++;
- a->v.data_type = 0;
- a->v.dirty_sectors = 0;
- a->v.stripe_sectors = 0;
- a->v.cached_sectors = 0;
- a->v.io_time[READ] = bch2_current_io_time(c, READ);
- a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE);
+ unsigned cached_sectors = a->cached_sectors;
+ u8 gen = a->gen;
- ret = bch2_trans_commit(trans, NULL, NULL,
- BCH_WATERMARK_btree|
- BCH_TRANS_COMMIT_no_enospc);
+ ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed);
if (ret)
goto out;
@@ -2114,6 +2202,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
--*nr_to_invalidate;
out:
fsck_err:
+ bch2_trans_iter_exit(trans, &alloc_iter);
printbuf_exit(&buf);
return ret;
}
@@ -2123,9 +2212,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter
{
struct bkey_s_c k;
again:
- k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
+ k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
if (!k.k && !*wrapped) {
- bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
+ bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0));
*wrapped = true;
goto again;
}
@@ -2140,6 +2229,10 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
@@ -2164,38 +2257,39 @@ static void bch2_do_invalidates_work(struct work_struct *work)
if (!k.k)
break;
- ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+ ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate);
restart_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
bch2_trans_iter_exit(trans, &iter);
err:
bch2_trans_put(trans);
- percpu_ref_put(&ca->io_ref);
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ bch2_bkey_buf_exit(&last_flushed, c);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
}
void bch2_dev_do_invalidates(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate))
return;
- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
+ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates))
goto put_ref;
if (queue_work(c->write_ref_wq, &ca->invalidate_work))
return;
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates);
put_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
@@ -2240,7 +2334,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
break;
}
- k = bch2_get_key_or_hole(&iter, end, &hole);
+ k = bch2_get_key_or_hole(trans, &iter, end, &hole);
ret = bkey_err(k);
if (ret)
goto bkey_err;
@@ -2259,7 +2353,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
if (ret)
goto bkey_err;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
} else {
struct bkey_i *freespace;
@@ -2279,7 +2373,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
if (ret)
goto bkey_err;
- bch2_btree_iter_set_pos(&iter, k.k->p);
+ bch2_btree_iter_set_pos(trans, &iter, k.k->p);
}
bkey_err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2306,14 +2400,16 @@ bkey_err:
int bch2_fs_freespace_init(struct bch_fs *c)
{
- int ret = 0;
- bool doing_init = false;
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
+ return 0;
+
/*
* We can crash during the device add path, so we need to check this on
* every mount:
*/
+ bool doing_init = false;
for_each_member_device(c, ca) {
if (ca->mi.freespace_initialized)
continue;
@@ -2323,7 +2419,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
doing_init = true;
}
- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+ int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
bch2_dev_put(ca);
bch_err_fn(c, ret);
@@ -2353,8 +2449,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
* We clear the LRU and need_discard btrees first so that we don't race
* with bch2_do_invalidates() and bch2_do_discards()
*/
- ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?:
- bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end,
BTREE_TRIGGER_norun, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
BTREE_TRIGGER_norun, NULL) ?:
@@ -2417,15 +2512,15 @@ void bch2_recalc_capacity(struct bch_fs *c)
lockdep_assert_held(&c->state_lock);
- for_each_online_member(c, ca) {
- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
-
- ra_pages += bdi->ra_pages;
- }
+ guard(rcu)();
+ for_each_member_device_rcu(c, ca, NULL) {
+ struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev);
+ if (bdev)
+ ra_pages += bdev->bd_disk->bdi->ra_pages;
- bch2_set_ra_pages(c, ra_pages);
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
+ continue;
- for_each_rw_member(c, ca) {
u64 dev_reserve = 0;
/*
@@ -2463,6 +2558,8 @@ void bch2_recalc_capacity(struct bch_fs *c)
ca->mi.bucket_size);
}
+ bch2_set_ra_pages(c, ra_pages);
+
gc_reserve = c->opts.gc_reserve_bytes
? c->opts.gc_reserve_bytes >> 9
: div64_u64(capacity * c->opts.gc_reserve_percent, 100);
@@ -2484,7 +2581,8 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
u64 ret = U64_MAX;
- for_each_rw_member(c, ca)
+ guard(rcu)();
+ for_each_rw_member_rcu(c, ca)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
return ret;
}
@@ -2492,19 +2590,31 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c)
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
struct open_bucket *ob;
- bool ret = false;
for (ob = c->open_buckets;
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
- spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list &&
- ob->dev == ca->dev_idx)
- ret = true;
- spin_unlock(&ob->lock);
+ scoped_guard(spinlock, &ob->lock) {
+ if (ob->valid && !ob->on_partial_list &&
+ ob->dev == ca->dev_idx)
+ return true;
+ }
}
- return ret;
+ return false;
+}
+
+void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw)
+{
+ /* BCH_DATA_free == all rw devs */
+
+ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
+ if (rw &&
+ (i == BCH_DATA_free ||
+ (ca->mi.data_allowed & BIT(i))))
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
+ else
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
}
/* device goes ro: */
@@ -2513,9 +2623,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
/* First, remove device from allocation groups: */
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- clear_bit(ca->dev_idx, c->rw_devs[i].d);
+ bch2_dev_allocator_set_rw(c, ca, false);
c->rw_devs_change_count++;
@@ -2549,10 +2657,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
lockdep_assert_held(&c->state_lock);
- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
- if (ca->mi.data_allowed & (1 << i))
- set_bit(ca->dev_idx, c->rw_devs[i].d);
-
+ bch2_dev_allocator_set_rw(c, ca, true);
c->rw_devs_change_count++;
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index de25ba4ee94b..0cc5adc55b6f 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -13,11 +13,9 @@
static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode);
- bool ret = ca && bucket_valid(ca, pos.offset);
- rcu_read_unlock();
- return ret;
+ return ca && bucket_valid(ca, pos.offset);
}
static inline u64 bucket_to_u64(struct bpos bucket)
@@ -131,7 +129,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a,
if (a.stripe)
return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe;
if (bch2_bucket_sectors_dirty(a))
- return data_type;
+ return bucket_data_type(data_type);
if (a.cached_sectors)
return BCH_DATA_cached;
if (BCH_ALLOC_V4_NEED_DISCARD(&a))
@@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
void bch2_alloc_v4_swab(struct bkey_s);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_validate = bch2_alloc_v1_validate, \
@@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \
.key_validate = bch2_alloc_v4_validate, \
- .val_to_text = bch2_alloc_to_text, \
+ .val_to_text = bch2_alloc_v4_to_text, \
.swab = bch2_alloc_v4_swab, \
.trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
@@ -321,11 +320,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca,
{
u64 want_free = ca->mi.nbuckets >> 7;
u64 free = max_t(s64, 0,
- u.d[BCH_DATA_free].buckets
- + u.d[BCH_DATA_need_discard].buckets
+ u.buckets[BCH_DATA_free]
+ + u.buckets[BCH_DATA_need_discard]
- bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe));
- return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets);
+ return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]);
}
void bch2_dev_do_invalidates(struct bch_dev *);
@@ -350,6 +349,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *);
void bch2_recalc_capacity(struct bch_fs *);
u64 bch2_min_rw_member_capacity(struct bch_fs *);
+void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 6df41c331a52..b58525ec7b4d 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = {
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL)
memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
- rcu_read_unlock();
}
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
@@ -127,14 +126,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
void bch2_open_bucket_write_error(struct bch_fs *c,
struct open_buckets *obs,
- unsigned dev)
+ unsigned dev, int err)
{
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, obs, ob, i)
if (ob->dev == dev && ob->ec)
- bch2_ec_bucket_cancel(c, ob);
+ bch2_ec_bucket_cancel(c, ob, err);
}
static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
@@ -154,7 +153,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
return false;
return bch2_is_superblock_bucket(ca, b);
@@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
ARRAY_SIZE(c->open_buckets_partial));
spin_lock(&c->freelist_lock);
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
ob->on_partial_list = true;
c->open_buckets_partial[c->open_buckets_partial_nr++] =
@@ -179,75 +177,63 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
closure_wake_up(&c->freelist_wait);
}
-static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
-{
- switch (watermark) {
- case BCH_WATERMARK_interior_updates:
- return 0;
- case BCH_WATERMARK_reclaim:
- return OPEN_BUCKETS_COUNT / 6;
- case BCH_WATERMARK_btree:
- case BCH_WATERMARK_btree_copygc:
- return OPEN_BUCKETS_COUNT / 4;
- case BCH_WATERMARK_copygc:
- return OPEN_BUCKETS_COUNT / 3;
- default:
- return OPEN_BUCKETS_COUNT / 2;
- }
-}
-
static inline bool may_alloc_bucket(struct bch_fs *c,
- struct bpos bucket,
- struct bucket_alloc_state *s)
+ struct alloc_request *req,
+ struct bpos bucket)
{
if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
- s->skipped_open++;
+ req->counters.skipped_open++;
return false;
}
- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) {
- s->skipped_need_journal_commit++;
+ u64 journal_seq_ready =
+ bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
+ bucket.inode, bucket.offset);
+ if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
+ if (journal_seq_ready > c->journal.flushing_seq)
+ req->counters.need_journal_commit++;
+ req->counters.skipped_need_journal_commit++;
return false;
}
if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
- s->skipped_nocow++;
+ req->counters.skipped_nocow++;
return false;
}
return true;
}
-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
+ struct alloc_request *req,
u64 bucket, u8 gen,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
struct closure *cl)
{
+ struct bch_dev *ca = req->ca;
+
if (unlikely(is_superblock_bucket(c, ca, bucket)))
return NULL;
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
- s->skipped_nouse++;
+ req->counters.skipped_nouse++;
return NULL;
}
spin_lock(&c->freelist_lock);
- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
+ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
spin_unlock(&c->freelist_lock);
- return ERR_PTR(-BCH_ERR_open_buckets_empty);
+ return ERR_PTR(bch_err_throw(c, open_buckets_empty));
}
/* Recheck under lock: */
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
spin_unlock(&c->freelist_lock);
- s->skipped_open++;
+ req->counters.skipped_open++;
return NULL;
}
@@ -271,16 +257,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
return ob;
}
-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
+ struct alloc_request *req,
struct btree_iter *freespace_iter,
struct closure *cl)
{
struct bch_fs *c = trans->c;
u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);
- if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
+ if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
return NULL;
u8 gen;
@@ -290,7 +275,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
if (ret)
return NULL;
- return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
+ return __try_alloc_bucket(c, req, b, gen, cl);
}
/*
@@ -298,17 +283,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
+ struct alloc_request *req,
struct closure *cl)
{
struct bch_fs *c = trans->c;
+ struct bch_dev *ca = req->ca;
struct btree_iter iter, citer;
struct bkey_s_c k, ck;
struct open_bucket *ob = NULL;
u64 first_bucket = ca->mi.first_bucket;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
u64 alloc_cursor = alloc_start;
int ret;
@@ -330,19 +314,19 @@ again:
if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
break;
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ if (req->btree_bitmap != BTREE_BITMAP_ANY &&
+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ if (req->btree_bitmap == BTREE_BITMAP_YES &&
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
break;
bucket = sector_to_bucket(ca,
round_up(bucket_to_sector(ca, bucket) + 1,
1ULL << ca->mi.btree_bitmap_shift));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
- s->buckets_seen++;
- s->skipped_mi_btree_bitmap++;
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
+ req->counters.buckets_seen++;
+ req->counters.skipped_mi_btree_bitmap++;
continue;
}
@@ -361,14 +345,13 @@ again:
if (a->data_type != BCH_DATA_free)
goto next;
- s->buckets_seen++;
+ req->counters.buckets_seen++;
- ob = may_alloc_bucket(c, k.k->p, s)
- ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
- watermark, s, cl)
+ ob = may_alloc_bucket(c, req, k.k->p)
+ ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
: NULL;
next:
- bch2_set_btree_iter_dontneed(&citer);
+ bch2_set_btree_iter_dontneed(trans, &citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -391,15 +374,14 @@ next:
}
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- struct bucket_alloc_state *s,
- struct closure *cl)
+ struct alloc_request *req,
+ struct closure *cl)
{
+ struct bch_dev *ca = req->ca;
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
+ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
@@ -415,13 +397,13 @@ again:
iter.k.size = iter.k.p.offset - iter.pos.offset;
while (iter.k.size) {
- s->buckets_seen++;
+ req->counters.buckets_seen++;
u64 bucket = iter.pos.offset & ~(~0ULL << 56);
- if (s->btree_bitmap != BTREE_BITMAP_ANY &&
- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
+ if (req->btree_bitmap != BTREE_BITMAP_ANY &&
+ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
- if (s->btree_bitmap == BTREE_BITMAP_YES &&
+ if (req->btree_bitmap == BTREE_BITMAP_YES &&
bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
goto fail;
@@ -430,16 +412,16 @@ again:
1ULL << ca->mi.btree_bitmap_shift));
alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));
- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
- s->skipped_mi_btree_bitmap++;
+ bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
+ req->counters.skipped_mi_btree_bitmap++;
goto next;
}
- ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
+ ob = try_alloc_bucket(trans, req, &iter, cl);
if (ob) {
if (!IS_ERR(ob))
*dev_alloc_cursor = iter.pos.offset;
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
break;
}
@@ -466,33 +448,30 @@ fail:
return ob;
}
-static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
+static noinline void trace_bucket_alloc2(struct bch_fs *c,
+ struct alloc_request *req,
struct closure *cl,
- struct bch_dev_usage *usage,
- struct bucket_alloc_state *s,
struct open_bucket *ob)
{
struct printbuf buf = PRINTBUF;
printbuf_tabstop_push(&buf, 24);
- prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
+ prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
+ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
+ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
prt_printf(&buf, "blocking\t%u\n", cl != NULL);
- prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
- prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
+ prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
+ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
+ prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
bch2_copygc_wait_amount(c),
c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
- prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
- prt_printf(&buf, "open\t%llu\n", s->skipped_open);
- prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
- prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
- prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);
+ prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
+ prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
+ prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
+ prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
+ prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
+ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);
if (!IS_ERR(ob)) {
prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
@@ -508,47 +487,43 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
/**
* bch2_bucket_alloc_trans - allocate a single bucket from a specific device
* @trans: transaction object
- * @ca: device to allocate from
- * @watermark: how important is this allocation?
- * @data_type: BCH_DATA_journal, btree, user...
+ * @req: state for the entire allocation
* @cl: if not NULL, closure to be used to wait if buckets not available
* @nowait: if true, do not wait for buckets to become available
- * @usage: for secondarily also returning the current device usage
*
* Returns: an open_bucket on success, or an ERR_PTR() on failure.
*/
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
- struct bch_dev *ca,
- enum bch_watermark watermark,
- enum bch_data_type data_type,
- struct closure *cl,
- bool nowait,
- struct bch_dev_usage *usage)
+ struct alloc_request *req,
+ struct closure *cl,
+ bool nowait)
{
struct bch_fs *c = trans->c;
+ struct bch_dev *ca = req->ca;
struct open_bucket *ob = NULL;
bool freespace = READ_ONCE(ca->mi.freespace_initialized);
u64 avail;
- struct bucket_alloc_state s = {
- .btree_bitmap = data_type == BCH_DATA_btree,
- };
bool waiting = nowait;
+
+ req->btree_bitmap = req->data_type == BCH_DATA_btree;
+ memset(&req->counters, 0, sizeof(req->counters));
again:
- bch2_dev_usage_read_fast(ca, usage);
- avail = dev_buckets_free(ca, *usage, watermark);
+ bch2_dev_usage_read_fast(ca, &req->usage);
+ avail = dev_buckets_free(ca, req->usage, req->watermark);
- if (usage->d[BCH_DATA_need_discard].buckets > avail)
+ if (req->usage.buckets[BCH_DATA_need_discard] >
+ min(avail, ca->mi.nbuckets >> 7))
bch2_dev_do_discards(ca);
- if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
+ if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
bch2_gc_gens_async(c);
- if (should_invalidate_buckets(ca, *usage))
+ if (should_invalidate_buckets(ca, req->usage))
bch2_dev_do_invalidates(ca);
if (!avail) {
- if (watermark > BCH_WATERMARK_normal &&
- c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ if (req->watermark > BCH_WATERMARK_normal &&
+ c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
goto alloc;
if (cl && !waiting) {
@@ -559,7 +534,7 @@ again:
track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
- ob = ERR_PTR(-BCH_ERR_freelist_empty);
+ ob = ERR_PTR(bch_err_throw(c, freelist_empty));
goto err;
}
@@ -567,27 +542,27 @@ again:
closure_wake_up(&c->freelist_wait);
alloc:
ob = likely(freespace)
- ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
- : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);
+ ? bch2_bucket_alloc_freelist(trans, req, cl)
+ : bch2_bucket_alloc_early(trans, req, cl);
- if (s.skipped_need_journal_commit * 2 > avail)
+ if (req->counters.need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
- if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
- s.btree_bitmap = BTREE_BITMAP_ANY;
+ if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
+ req->btree_bitmap = BTREE_BITMAP_ANY;
goto alloc;
}
- if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
+ if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
freespace = false;
goto alloc;
}
err:
if (!ob)
- ob = ERR_PTR(-BCH_ERR_no_buckets_found);
+ ob = ERR_PTR(bch_err_throw(c, no_buckets_found));
if (!IS_ERR(ob))
- ob->data_type = data_type;
+ ob->data_type = req->data_type;
if (!IS_ERR(ob))
count_event(c, bucket_alloc);
@@ -597,7 +572,7 @@ err:
if (!IS_ERR(ob)
? trace_bucket_alloc_enabled()
: trace_bucket_alloc_fail_enabled())
- trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);
+ trace_bucket_alloc2(c, req, cl, ob);
return ob;
}
@@ -607,57 +582,96 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum bch_data_type data_type,
struct closure *cl)
{
- struct bch_dev_usage usage;
struct open_bucket *ob;
+ struct alloc_request req = {
+ .watermark = watermark,
+ .data_type = data_type,
+ .ca = ca,
+ };
bch2_trans_do(c,
- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
- data_type, cl, false, &usage)));
+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
return ob;
}
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
unsigned l, unsigned r)
{
- return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
- (stripe->next_alloc[l] < stripe->next_alloc[r]));
+ return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
}
#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs)
+void bch2_dev_alloc_list(struct bch_fs *c,
+ struct dev_stripe_state *stripe,
+ struct bch_devs_mask *devs,
+ struct dev_alloc_list *ret)
{
- struct dev_alloc_list ret = { .nr = 0 };
- unsigned i;
+ ret->nr = 0;
+ unsigned i;
for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
- ret.data[ret.nr++] = i;
+ ret->data[ret->nr++] = i;
- bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
- return ret;
+ bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
+}
+
+static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
+static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
+static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */
+
+static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
+{
+ /*
+ * Avoid underflowing clock hands if at all possible, if clock hands go
+ * to 0 then we lose information - clock hands can be in a wide range if
+ * we have devices we rarely try to allocate from, if we generally
+ * allocate from a specified target but only sometimes have to fall back
+ * to the whole filesystem.
+ */
+ u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
+ u64 scale_min = 0; /* minumum we must subtract to avoid overflow */
+
+ for (u64 *v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
+ if (*v)
+ scale_max = min(scale_max, *v);
+ if (*v > stripe_clock_hand_max)
+ scale_min = max(scale_min, *v - stripe_clock_hand_max);
+ }
+
+ u64 scale = max(scale_min, scale_max);
+
+ for (u64 *v = stripe->next_alloc;
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
}
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
struct dev_stripe_state *stripe,
struct bch_dev_usage *usage)
{
+ /*
+ * Stripe state has a per device clock hand: we allocate from the device
+ * with the smallest clock hand.
+ *
+ * When we allocate, we don't do a simple increment; we add the inverse
+ * of the device's free space. This results in round robin behavior that
+ * biases in favor of the device(s) with more free space.
+ */
+
u64 *v = stripe->next_alloc + ca->dev_idx;
- u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
+ u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
u64 free_space_inv = free_space
- ? div64_u64(1ULL << 48, free_space)
- : 1ULL << 48;
- u64 scale = *v / 4;
+ ? div64_u64(stripe_clock_hand_inv, free_space)
+ : stripe_clock_hand_inv;
- if (*v + free_space_inv >= *v)
- *v += free_space_inv;
- else
- *v = U64_MAX;
+ /* Saturating add, avoid overflow: */
+ u64 sum = *v + free_space_inv;
+ *v = sum >= *v ? sum : U64_MAX;
- for (v = stripe->next_alloc;
- v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
- *v = *v < scale ? 0 : *v - scale;
+ if (unlikely(*v > stripe_clock_hand_rescale))
+ bch2_stripe_state_rescale(stripe);
}
void bch2_dev_stripe_increment(struct bch_dev *ca,
@@ -670,64 +684,53 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
}
static int add_new_bucket(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- struct open_bucket *ob)
+ struct alloc_request *req,
+ struct open_bucket *ob)
{
unsigned durability = ob_dev(c, ob)->mi.durability;
- BUG_ON(*nr_effective >= nr_replicas);
+ BUG_ON(req->nr_effective >= req->nr_replicas);
- __clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += durability;
- *have_cache |= !durability;
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
+ req->nr_effective += durability;
+ req->have_cache |= !durability;
- ob_push(c, ptrs, ob);
+ ob_push(c, &req->ptrs, ob);
- if (*nr_effective >= nr_replicas)
+ if (req->nr_effective >= req->nr_replicas)
return 1;
if (ob->ec)
return 1;
return 0;
}
-int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct dev_stripe_state *stripe,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_write_flags flags,
- enum bch_data_type data_type,
- enum bch_watermark watermark,
- struct closure *cl)
+inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
+ struct alloc_request *req,
+ struct dev_stripe_state *stripe,
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
- int ret = -BCH_ERR_insufficient_devices;
+ int ret = 0;
+
+ BUG_ON(req->nr_effective >= req->nr_replicas);
- BUG_ON(*nr_effective >= nr_replicas);
+ bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
- darray_for_each(devs_sorted, i) {
- struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
- if (!ca)
+ darray_for_each(req->devs_sorted, i) {
+ req->ca = bch2_dev_tryget_noerror(c, *i);
+ if (!req->ca)
continue;
- if (!ca->mi.durability && *have_cache) {
- bch2_dev_put(ca);
+ if (!req->ca->mi.durability && req->have_cache) {
+ bch2_dev_put(req->ca);
continue;
}
- struct bch_dev_usage usage;
- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
+ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
+ req->flags & BCH_WRITE_alloc_nowait);
if (!IS_ERR(ob))
- bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
- bch2_dev_put(ca);
+ bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
+ bch2_dev_put(req->ca);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
@@ -736,15 +739,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
continue;
}
- if (add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob)) {
- ret = 0;
+ ret = add_new_bucket(c, req, ob);
+ if (ret)
break;
- }
}
- return ret;
+ if (ret == 1)
+ return 0;
+ if (ret)
+ return ret;
+ return bch_err_throw(c, insufficient_devices);
}
/* Allocate from stripes: */
@@ -756,35 +760,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
*/
static int bucket_alloc_from_stripe(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- u16 target,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl)
+ struct alloc_request *req,
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
int ret = 0;
- if (nr_replicas < 2)
+ if (req->nr_replicas < 2)
return 0;
- if (ec_open_bucket(c, ptrs))
+ if (ec_open_bucket(c, &req->ptrs))
return 0;
struct ec_stripe_head *h =
- bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
+ bch2_ec_stripe_head_get(trans, req, 0, cl);
if (IS_ERR(h))
return PTR_ERR(h);
if (!h)
return 0;
- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
- darray_for_each(devs_sorted, i)
+ bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);
+
+ darray_for_each(req->devs_sorted, i)
for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
if (!h->s->blocks[ec_idx])
continue;
@@ -795,9 +792,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
ob->ec = h->s;
ec_stripe_new_get(h->s, STRIPE_REF_io);
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
+ ret = add_new_bucket(c, req, ob);
goto out;
}
}
@@ -809,65 +804,49 @@ out:
/* Sector allocator */
static bool want_bucket(struct bch_fs *c,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- bool *have_cache, bool ec,
+ struct alloc_request *req,
struct open_bucket *ob)
{
struct bch_dev *ca = ob_dev(c, ob);
- if (!test_bit(ob->dev, devs_may_alloc->d))
+ if (!test_bit(ob->dev, req->devs_may_alloc.d))
return false;
- if (ob->data_type != wp->data_type)
+ if (ob->data_type != req->wp->data_type)
return false;
if (!ca->mi.durability &&
- (wp->data_type == BCH_DATA_btree || ec || *have_cache))
+ (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
return false;
- if (ec != (ob->ec != NULL))
+ if (req->ec != (ob->ec != NULL))
return false;
return true;
}
static int bucket_alloc_set_writepoint(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- bool ec)
+ struct alloc_request *req)
{
- struct open_buckets ptrs_skip = { .nr = 0 };
struct open_bucket *ob;
unsigned i;
int ret = 0;
- open_bucket_for_each(c, &wp->ptrs, ob, i) {
- if (!ret && want_bucket(c, wp, devs_may_alloc,
- have_cache, ec, ob))
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
+ req->scratch_ptrs.nr = 0;
+
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
+ if (!ret && want_bucket(c, req, ob))
+ ret = add_new_bucket(c, req, ob);
else
- ob_push(c, &ptrs_skip, ob);
+ ob_push(c, &req->scratch_ptrs, ob);
}
- wp->ptrs = ptrs_skip;
+ req->wp->ptrs = req->scratch_ptrs;
return ret;
}
static int bucket_alloc_set_partial(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_mask *devs_may_alloc,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache, bool ec,
- enum bch_watermark watermark)
+ struct alloc_request *req)
{
int i, ret = 0;
@@ -882,13 +861,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];
- if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
+ if (want_bucket(c, req, ob)) {
struct bch_dev *ca = ob_dev(c, ob);
- struct bch_dev_usage usage;
u64 avail;
- bch2_dev_usage_read_fast(ca, &usage);
- avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
+ bch2_dev_usage_read_fast(ca, &req->usage);
+ avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
if (!avail)
continue;
@@ -897,13 +875,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
i);
ob->on_partial_list = false;
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- ret = add_new_bucket(c, ptrs, devs_may_alloc,
- nr_replicas, nr_effective,
- have_cache, ob);
+ ret = add_new_bucket(c, req, ob);
if (ret)
break;
}
@@ -914,61 +889,41 @@ unlock:
}
static int __open_bucket_add_buckets(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- bool erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *_cl)
+ struct alloc_request *req,
+ struct closure *_cl)
{
struct bch_fs *c = trans->c;
- struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
unsigned i;
int ret;
- devs = target_rw_devs(c, wp->data_type, target);
+ req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);
/* Don't allocate from devices we already have pointers to: */
- darray_for_each(*devs_have, i)
- __clear_bit(*i, devs.d);
+ darray_for_each(*req->devs_have, i)
+ __clear_bit(*i, req->devs_may_alloc.d);
- open_bucket_for_each(c, ptrs, ob, i)
- __clear_bit(ob->dev, devs.d);
+ open_bucket_for_each(c, &req->ptrs, ob, i)
+ __clear_bit(ob->dev, req->devs_may_alloc.d);
- ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, erasure_code);
+ ret = bucket_alloc_set_writepoint(c, req);
if (ret)
return ret;
- ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
- nr_replicas, nr_effective,
- have_cache, erasure_code, watermark);
+ ret = bucket_alloc_set_partial(c, req);
if (ret)
return ret;
- if (erasure_code) {
- ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
- target,
- nr_replicas, nr_effective,
- have_cache,
- watermark, flags, _cl);
+ if (req->ec) {
+ ret = bucket_alloc_from_stripe(trans, req, _cl);
} else {
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
- ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
- nr_replicas, nr_effective, have_cache,
- flags, wp->data_type, watermark, cl);
+ ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
!bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
@@ -982,38 +937,27 @@ retry_blocking:
}
static int open_bucket_add_buckets(struct btree_trans *trans,
- struct open_buckets *ptrs,
- struct write_point *wp,
- struct bch_devs_list *devs_have,
- u16 target,
- unsigned erasure_code,
- unsigned nr_replicas,
- unsigned *nr_effective,
- bool *have_cache,
- enum bch_watermark watermark,
- enum bch_write_flags flags,
- struct closure *cl)
+ struct alloc_request *req,
+ struct closure *cl)
{
int ret;
- if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
- devs_have, target, erasure_code,
- nr_replicas, nr_effective, have_cache,
- watermark, flags, cl);
+ if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
+ ret = __open_bucket_add_buckets(trans, req, cl);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
return ret;
- if (*nr_effective >= nr_replicas)
+ if (req->nr_effective >= req->nr_replicas)
return 0;
}
- ret = __open_bucket_add_buckets(trans, ptrs, wp,
- devs_have, target, false,
- nr_replicas, nr_effective, have_cache,
- watermark, flags, cl);
+ bool ec = false;
+ swap(ec, req->ec);
+ ret = __open_bucket_add_buckets(trans, req, cl);
+ swap(ec, req->ec);
+
return ret < 0 ? ret : 0;
}
@@ -1114,9 +1058,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
ob->on_partial_list = false;
- rcu_read_lock();
- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
@@ -1144,14 +1087,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
{
struct write_point *wp;
- rcu_read_lock();
+ guard(rcu)();
hlist_for_each_entry_rcu(wp, head, node)
if (wp->write_point == write_point)
- goto out;
- wp = NULL;
-out:
- rcu_read_unlock();
- return wp;
+ return wp;
+ return NULL;
}
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
@@ -1162,7 +1102,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
return stranded * factor > free;
}
-static bool try_increase_writepoints(struct bch_fs *c)
+static noinline bool try_increase_writepoints(struct bch_fs *c)
{
struct write_point *wp;
@@ -1175,7 +1115,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
return true;
}
-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
+static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
@@ -1266,26 +1206,26 @@ out:
static noinline void
deallocate_extra_replicas(struct bch_fs *c,
- struct open_buckets *ptrs,
- struct open_buckets *ptrs_no_use,
- unsigned extra_replicas)
+ struct alloc_request *req)
{
- struct open_buckets ptrs2 = { 0 };
struct open_bucket *ob;
+ unsigned extra_replicas = req->nr_effective - req->nr_replicas;
unsigned i;
- open_bucket_for_each(c, ptrs, ob, i) {
+ req->scratch_ptrs.nr = 0;
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
unsigned d = ob_dev(c, ob)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
- ob_push(c, ptrs_no_use, ob);
+ ob_push(c, &req->wp->ptrs, ob);
} else {
- ob_push(c, &ptrs2, ob);
+ ob_push(c, &req->scratch_ptrs, ob);
}
}
- *ptrs = ptrs2;
+ req->ptrs = req->scratch_ptrs;
}
/*
@@ -1304,51 +1244,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
struct write_point **wp_ret)
{
struct bch_fs *c = trans->c;
- struct write_point *wp;
struct open_bucket *ob;
- struct open_buckets ptrs;
- unsigned nr_effective, write_points_nr;
- bool have_cache;
- int ret;
+ unsigned write_points_nr;
int i;
+ struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
+ int ret = PTR_ERR_OR_ZERO(req);
+ if (unlikely(ret))
+ return ret;
+
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
+ req->nr_replicas = nr_replicas;
+ req->target = target;
+ req->ec = erasure_code;
+ req->watermark = watermark;
+ req->flags = flags;
+ req->devs_have = devs_have;
+
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
- ptrs.nr = 0;
- nr_effective = 0;
- write_points_nr = c->write_points_nr;
- have_cache = false;
+ req->ptrs.nr = 0;
+ req->nr_effective = 0;
+ req->have_cache = false;
+ write_points_nr = c->write_points_nr;
- *wp_ret = wp = writepoint_find(trans, write_point.v);
+ *wp_ret = req->wp = writepoint_find(trans, write_point.v);
+
+ req->data_type = req->wp->data_type;
ret = bch2_trans_relock(trans);
if (ret)
goto err;
/* metadata may not allocate on cache devices: */
- if (wp->data_type != BCH_DATA_user)
- have_cache = true;
-
- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, NULL);
+ if (req->data_type != BCH_DATA_user)
+ req->have_cache = true;
+
+ if (target && !(flags & BCH_WRITE_only_specified_devs)) {
+ ret = open_bucket_add_buckets(trans, req, NULL);
if (!ret ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto alloc_done;
/* Don't retry from all devices if we're out of open buckets: */
if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
- int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
+ int ret2 = open_bucket_add_buckets(trans, req, cl);
if (!ret2 ||
bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
@@ -1361,88 +1303,89 @@ retry:
* Only try to allocate cache (durability = 0 devices) from the
* specified target:
*/
- have_cache = true;
+ req->have_cache = true;
+ req->target = 0;
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- 0, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
+ ret = open_bucket_add_buckets(trans, req, cl);
} else {
- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
- target, erasure_code,
- nr_replicas, &nr_effective,
- &have_cache, watermark,
- flags, cl);
+ ret = open_bucket_add_buckets(trans, req, cl);
}
alloc_done:
- BUG_ON(!ret && nr_effective < nr_replicas);
+ BUG_ON(!ret && req->nr_effective < req->nr_replicas);
- if (erasure_code && !ec_open_bucket(c, &ptrs))
+ if (erasure_code && !ec_open_bucket(c, &req->ptrs))
pr_debug("failed to get ec bucket: ret %u", ret);
if (ret == -BCH_ERR_insufficient_devices &&
- nr_effective >= nr_replicas_required)
+ req->nr_effective >= nr_replicas_required)
ret = 0;
if (ret)
goto err;
- if (nr_effective > nr_replicas)
- deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+ if (req->nr_effective > req->nr_replicas)
+ deallocate_extra_replicas(c, req);
/* Free buckets we didn't use: */
- open_bucket_for_each(c, &wp->ptrs, ob, i)
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i)
open_bucket_free_unused(c, ob);
- wp->ptrs = ptrs;
+ req->wp->ptrs = req->ptrs;
- wp->sectors_free = UINT_MAX;
+ req->wp->sectors_free = UINT_MAX;
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
+ /*
+ * Ensure proper write alignment - either due to misaligned
+ * bucket sizes (from buggy bcachefs-tools), or writes that mix
+ * logical/physical alignment:
+ */
+ struct bch_dev *ca = ob_dev(c, ob);
+ u64 offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free;
+ unsigned align = round_up(offset, block_sectors(c)) - offset;
+
+ ob->sectors_free = max_t(int, 0, ob->sectors_free - align);
+
+ req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
+ }
+
+ req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
+ /* Did alignment use up space in an open_bucket? */
+ if (unlikely(!req->wp->sectors_free)) {
+ bch2_alloc_sectors_done(c, req->wp);
+ goto retry;
+ }
+
+ BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
return 0;
err:
- open_bucket_for_each(c, &wp->ptrs, ob, i)
- if (ptrs.nr < ARRAY_SIZE(ptrs.v))
- ob_push(c, &ptrs, ob);
+ open_bucket_for_each(c, &req->wp->ptrs, ob, i)
+ if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
+ ob_push(c, &req->ptrs, ob);
else
open_bucket_free_unused(c, ob);
- wp->ptrs = ptrs;
+ req->wp->ptrs = req->ptrs;
- mutex_unlock(&wp->lock);
+ mutex_unlock(&req->wp->lock);
if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
try_decrease_writepoints(trans, write_points_nr))
goto retry;
if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
+ ret = bch_err_throw(c, bucket_alloc_blocked);
- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
+ if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
bch2_err_matches(ret, BCH_ERR_freelist_empty))
- ret = -BCH_ERR_bucket_alloc_blocked;
+ ret = bch_err_throw(c, bucket_alloc_blocked);
return ret;
}
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
-{
- struct bch_dev *ca = ob_dev(c, ob);
-
- return (struct bch_extent_ptr) {
- .type = 1 << BCH_EXTENT_ENTRY_ptr,
- .gen = ob->gen,
- .dev = ob->dev,
- .offset = bucket_to_sector(ca, ob->bucket) +
- ca->mi.bucket_size -
- ob->sectors_free,
- };
-}
-
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
struct bkey_i *k, unsigned sectors,
bool cached)
@@ -1572,8 +1515,10 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
struct open_bucket *ob;
unsigned i;
+ mutex_lock(&wp->lock);
+
prt_printf(out, "%lu: ", wp->write_point);
- prt_human_readable_u64(out, wp->sectors_allocated);
+ prt_human_readable_u64(out, wp->sectors_allocated << 9);
prt_printf(out, " last wrote: ");
bch2_pr_time_units(out, sched_clock() - wp->last_used);
@@ -1589,6 +1534,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
open_bucket_for_each(c, &wp->ptrs, ob, i)
bch2_open_bucket_to_text(out, c, ob);
printbuf_indent_sub(out, 2);
+
+ mutex_unlock(&wp->lock);
}
void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
@@ -1646,7 +1593,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
- struct bch_dev_usage stats = bch2_dev_usage_read(ca);
+ struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
unsigned nr[BCH_DATA_NR];
memset(nr, 0, sizeof(nr));
@@ -1669,7 +1616,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
- prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
+ prt_printf(out, "buckets to invalidate\t%llu\r\n",
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
}
static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
@@ -1685,13 +1633,18 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
- for_each_online_member(c, ca) {
- prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
- printbuf_indent_add(&buf, 2);
- bch2_dev_alloc_debug_to_text(&buf, ca);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
+ bch2_printbuf_make_room(&buf, 4096);
+
+ buf.atomic++;
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca) {
+ prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
+ printbuf_indent_add(&buf, 2);
+ bch2_dev_alloc_debug_to_text(&buf, ca);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+ --buf.atomic;
prt_printf(&buf, "Copygc debug:\n");
printbuf_indent_add(&buf, 2);
@@ -1704,7 +1657,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
bch2_journal_debug_to_text(&buf, &c->journal);
printbuf_indent_sub(&buf, 2);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
index f25481a0d1a0..1b3fc8460096 100644
--- a/fs/bcachefs/alloc_foreground.h
+++ b/fs/bcachefs/alloc_foreground.h
@@ -3,8 +3,10 @@
#define _BCACHEFS_ALLOC_FOREGROUND_H
#include "bcachefs.h"
+#include "buckets.h"
#include "alloc_types.h"
#include "extents.h"
+#include "io_write_types.h"
#include "sb-members.h"
#include <linux/hash.h>
@@ -23,9 +25,57 @@ struct dev_alloc_list {
u8 data[BCH_SB_MEMBERS_MAX];
};
-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
- struct dev_stripe_state *,
- struct bch_devs_mask *);
+struct alloc_request {
+ unsigned nr_replicas;
+ unsigned target;
+ bool ec;
+ enum bch_watermark watermark;
+ enum bch_write_flags flags;
+ enum bch_data_type data_type;
+ struct bch_devs_list *devs_have;
+ struct write_point *wp;
+
+ /* These fields are used primarily by open_bucket_add_buckets */
+ struct open_buckets ptrs;
+ unsigned nr_effective; /* sum of @ptrs durability */
+ bool have_cache; /* have we allocated from a 0 durability dev */
+ struct bch_devs_mask devs_may_alloc;
+
+ /* bch2_bucket_alloc_set_trans(): */
+ struct dev_alloc_list devs_sorted;
+ struct bch_dev_usage usage;
+
+ /* bch2_bucket_alloc_trans(): */
+ struct bch_dev *ca;
+
+ enum {
+ BTREE_BITMAP_NO,
+ BTREE_BITMAP_YES,
+ BTREE_BITMAP_ANY,
+ } btree_bitmap;
+
+ struct {
+ u64 buckets_seen;
+ u64 skipped_open;
+ u64 skipped_need_journal_commit;
+ u64 need_journal_commit;
+ u64 skipped_nocow;
+ u64 skipped_nouse;
+ u64 skipped_mi_btree_bitmap;
+ } counters;
+
+ unsigned scratch_nr_replicas;
+ unsigned scratch_nr_effective;
+ bool scratch_have_cache;
+ enum bch_data_type scratch_data_type;
+ struct open_buckets scratch_ptrs;
+ struct bch_devs_mask scratch_devs_may_alloc;
+};
+
+void bch2_dev_alloc_list(struct bch_fs *,
+ struct dev_stripe_state *,
+ struct bch_devs_mask *,
+ struct dev_alloc_list *);
void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
@@ -33,6 +83,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob)
return bch2_dev_have_ref(c, ob->dev);
}
+static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark)
+{
+ switch (watermark) {
+ case BCH_WATERMARK_interior_updates:
+ return 0;
+ case BCH_WATERMARK_reclaim:
+ return OPEN_BUCKETS_COUNT / 6;
+ case BCH_WATERMARK_btree:
+ case BCH_WATERMARK_btree_copygc:
+ return OPEN_BUCKETS_COUNT / 4;
+ case BCH_WATERMARK_copygc:
+ return OPEN_BUCKETS_COUNT / 3;
+ default:
+ return OPEN_BUCKETS_COUNT / 2;
+ }
+}
+
struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
enum bch_watermark, enum bch_data_type,
struct closure *);
@@ -65,7 +132,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
}
void bch2_open_bucket_write_error(struct bch_fs *,
- struct open_buckets *, unsigned);
+ struct open_buckets *, unsigned, int);
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
@@ -93,7 +160,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i)
- ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
+ ob_push(c, ob->sectors_free < block_sectors(c)
+ ? &ptrs
+ : &keep, ob);
wp->ptrs = keep;
mutex_unlock(&wp->lock);
@@ -154,11 +223,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64
}
enum bch_write_flags;
-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *,
- struct dev_stripe_state *, struct bch_devs_mask *,
- unsigned, unsigned *, bool *, enum bch_write_flags,
- enum bch_data_type, enum bch_watermark,
- struct closure *);
+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *,
+ struct dev_stripe_state *, struct closure *);
int bch2_alloc_sectors_start_trans(struct btree_trans *,
unsigned, unsigned,
@@ -170,7 +236,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *,
struct closure *,
struct write_point **);
-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
+static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+ struct bch_dev *ca = ob_dev(c, ob);
+
+ return (struct bch_extent_ptr) {
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
+ .gen = ob->gen,
+ .dev = ob->dev,
+ .offset = bucket_to_sector(ca, ob->bucket) +
+ ca->mi.bucket_size -
+ ob->sectors_free,
+ };
+}
/*
* Append pointers to the space we just allocated to @k, and mark @sectors space
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index 9bbb28e90b93..e7becdf22cba 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -8,21 +8,6 @@
#include "clock_types.h"
#include "fifo.h"
-struct bucket_alloc_state {
- enum {
- BTREE_BITMAP_NO,
- BTREE_BITMAP_YES,
- BTREE_BITMAP_ANY,
- } btree_bitmap;
-
- u64 buckets_seen;
- u64 skipped_open;
- u64 skipped_need_journal_commit;
- u64 skipped_nocow;
- u64 skipped_nouse;
- u64 skipped_mi_btree_bitmap;
-};
-
#define BCH_WATERMARKS() \
x(stripe) \
x(normal) \
@@ -89,6 +74,7 @@ struct dev_stripe_state {
x(stopped) \
x(waiting_io) \
x(waiting_work) \
+ x(runnable) \
x(running)
enum write_point_state {
@@ -124,6 +110,7 @@ struct write_point {
enum write_point_state state;
u64 last_state_change;
u64 time[WRITE_POINT_STATE_NR];
+ u64 last_runtime;
} __aligned(SMP_CACHE_BYTES);
};
diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c
new file mode 100644
index 000000000000..a7cd1f0f0964
--- /dev/null
+++ b/fs/bcachefs/async_objs.c
@@ -0,0 +1,132 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Async obj debugging: keep asynchronous objects on (very fast) lists, make
+ * them visibile in debugfs:
+ */
+
+#include "bcachefs.h"
+#include "async_objs.h"
+#include "btree_io.h"
+#include "debug.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/debugfs.h>
+
+static void promote_obj_to_text(struct printbuf *out, void *obj)
+{
+ bch2_promote_op_to_text(out, obj);
+}
+
+static void rbio_obj_to_text(struct printbuf *out, void *obj)
+{
+ bch2_read_bio_to_text(out, obj);
+}
+
+static void write_op_obj_to_text(struct printbuf *out, void *obj)
+{
+ bch2_write_op_to_text(out, obj);
+}
+
+static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj)
+{
+ struct btree_read_bio *rbio = obj;
+ bch2_btree_read_bio_to_text(out, rbio);
+}
+
+static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj)
+{
+ struct btree_write_bio *wbio = obj;
+ bch2_bio_to_text(out, &wbio->wbio.bio);
+}
+
+static int bch2_async_obj_list_open(struct inode *inode, struct file *file)
+{
+ struct async_obj_list *list = inode->i_private;
+ struct dump_iter *i;
+
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
+ if (!i)
+ return -ENOMEM;
+
+ file->private_data = i;
+ i->from = POS_MIN;
+ i->iter = 0;
+ i->c = container_of(list, struct bch_fs, async_objs[list->idx]);
+ i->list = list;
+ i->buf = PRINTBUF;
+ return 0;
+}
+
+static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ struct dump_iter *i = file->private_data;
+ struct async_obj_list *list = i->list;
+ ssize_t ret = 0;
+
+ i->ubuf = buf;
+ i->size = size;
+ i->ret = 0;
+
+ struct genradix_iter iter;
+ void *obj;
+ fast_list_for_each_from(&list->list, iter, obj, i->iter) {
+ ret = bch2_debugfs_flush_buf(i);
+ if (ret)
+ return ret;
+
+ if (!i->size)
+ break;
+
+ list->obj_to_text(&i->buf, obj);
+ }
+
+ if (i->buf.allocation_failure)
+ ret = -ENOMEM;
+ else
+ i->iter = iter.pos;
+
+ if (!ret)
+ ret = bch2_debugfs_flush_buf(i);
+
+ return ret ?: i->ret;
+}
+
+static const struct file_operations async_obj_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_async_obj_list_open,
+ .release = bch2_dump_release,
+ .read = bch2_async_obj_list_read,
+};
+
+void bch2_fs_async_obj_debugfs_init(struct bch_fs *c)
+{
+ c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir);
+
+#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \
+ &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops);
+ BCH_ASYNC_OBJ_LISTS()
+#undef x
+}
+
+void bch2_fs_async_obj_exit(struct bch_fs *c)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++)
+ fast_list_exit(&c->async_objs[i].list);
+}
+
+int bch2_fs_async_obj_init(struct bch_fs *c)
+{
+ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) {
+ if (fast_list_init(&c->async_objs[i].list))
+ return -BCH_ERR_ENOMEM_async_obj_init;
+ c->async_objs[i].idx = i;
+ }
+
+#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text;
+ BCH_ASYNC_OBJ_LISTS()
+#undef x
+
+ return 0;
+}
diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h
new file mode 100644
index 000000000000..cd6489b8cf76
--- /dev/null
+++ b/fs/bcachefs/async_objs.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ASYNC_OBJS_H
+#define _BCACHEFS_ASYNC_OBJS_H
+
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+static inline void __async_object_list_del(struct fast_list *head, unsigned idx)
+{
+ fast_list_remove(head, idx);
+}
+
+static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx)
+{
+ int ret = fast_list_add(head, obj);
+ *idx = ret > 0 ? ret : 0;
+ return ret < 0 ? ret : 0;
+}
+
+#define async_object_list_del(_c, _list, idx) \
+ __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx)
+
+#define async_object_list_add(_c, _list, obj, idx) \
+ __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx)
+
+void bch2_fs_async_obj_debugfs_init(struct bch_fs *);
+void bch2_fs_async_obj_exit(struct bch_fs *);
+int bch2_fs_async_obj_init(struct bch_fs *);
+
+#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
+
+#define async_object_list_del(_c, _n, idx) do {} while (0)
+
+static inline int __async_object_list_add(void)
+{
+ return 0;
+}
+#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add()
+
+static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {}
+static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {}
+static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; }
+
+#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */
+
+#endif /* _BCACHEFS_ASYNC_OBJS_H */
diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h
new file mode 100644
index 000000000000..8d713c0f5841
--- /dev/null
+++ b/fs/bcachefs/async_objs_types.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H
+#define _BCACHEFS_ASYNC_OBJS_TYPES_H
+
+#define BCH_ASYNC_OBJ_LISTS() \
+ x(promote) \
+ x(rbio) \
+ x(write_op) \
+ x(btree_read_bio) \
+ x(btree_write_bio)
+
+enum bch_async_obj_lists {
+#define x(n) BCH_ASYNC_OBJ_LIST_##n,
+ BCH_ASYNC_OBJ_LISTS()
+#undef x
+ BCH_ASYNC_OBJ_NR
+};
+
+struct async_obj_list {
+ struct fast_list list;
+ void (*obj_to_text)(struct printbuf *, void *);
+ unsigned idx;
+};
+
+#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index ebeb6a5ff9d2..77d93beb3c8f 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -11,9 +11,21 @@
#include "checksum.h"
#include "disk_accounting.h"
#include "error.h"
+#include "progress.h"
+#include "recovery_passes.h"
#include <linux/mm.h>
+static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64);
+
+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
+{
+ return (struct bbpos) {
+ .btree = bp.btree_id,
+ .pos = bp.pos,
+ };
+}
+
int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@@ -36,19 +48,23 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
- if (ca) {
- u32 bucket_offset;
- struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
- rcu_read_unlock();
+ struct bch_dev *ca;
+ u32 bucket_offset;
+ struct bpos bucket;
+ scoped_guard(rcu) {
+ ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
+ if (ca)
+ bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset);
+ }
+
+ if (ca)
prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset);
- } else {
- rcu_read_unlock();
+ else
prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT);
- }
bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level);
+ prt_str(out, " data_type=");
+ bch2_prt_data_type(out, bp.v->data_type);
prt_printf(out, " suboffset=%u len=%u gen=%u pos=",
(u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
bp.v->bucket_len,
@@ -93,6 +109,9 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ bool will_check = c->recovery.passes_to_run &
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+ int ret = 0;
if (insert) {
prt_printf(&buf, "existing backpointer found when inserting ");
@@ -106,9 +125,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
prt_printf(&buf, "for ");
bch2_bkey_val_to_text(&buf, c, orig_k);
-
- bch_err(c, "%s", buf.buf);
- } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
+ } else if (!will_check) {
prt_printf(&buf, "backpointer not found when deleting\n");
printbuf_indent_add(&buf, 2);
@@ -122,17 +139,14 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
prt_printf(&buf, "for ");
bch2_bkey_val_to_text(&buf, c, orig_k);
-
- bch_err(c, "%s", buf.buf);
}
- printbuf_exit(&buf);
+ if (!will_check && __bch2_inconsistent_error(c, &buf))
+ ret = bch_err_throw(c, erofs_unfixed_errors);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
- return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
- } else {
- return 0;
- }
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
@@ -172,7 +186,7 @@ err:
static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos)
{
- return (likely(!bch2_backpointers_no_use_write_buffer)
+ return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos)
: bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
@@ -182,7 +196,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
struct bkey_s_c visiting_k,
struct bkey_buf *last_flushed)
{
- return likely(!bch2_backpointers_no_use_write_buffer)
+ return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)
? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed)
: 0;
}
@@ -190,7 +204,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans,
static int backpointer_target_not_found(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
struct bkey_s_c target_k,
- struct bkey_buf *last_flushed)
+ struct bkey_buf *last_flushed,
+ bool commit)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
@@ -207,11 +222,11 @@ static int backpointer_target_not_found(struct btree_trans *trans,
if (ret)
return ret;
- prt_printf(&buf, "backpointer doesn't match %s it points to:\n ",
+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n",
bp.v->level ? "btree node" : "extent");
bch2_bkey_val_to_text(&buf, c, bp.s_c);
- prt_printf(&buf, "\n ");
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, target_k);
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k);
@@ -219,63 +234,45 @@ static int backpointer_target_not_found(struct btree_trans *trans,
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry)
if (p.ptr.dev == bp.k->p.inode) {
- prt_printf(&buf, "\n ");
+ prt_newline(&buf);
struct bkey_i_backpointer bp2;
bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i));
}
if (fsck_err(trans, backpointer_to_missing_ptr,
- "%s", buf.buf))
+ "%s", buf.buf)) {
ret = bch2_backpointer_del(trans, bp.k->p);
+ if (ret || !commit)
+ goto out;
+
+ /*
+ * Normally, on transaction commit from inside a transaction,
+ * we'll return -BCH_ERR_transaction_restart_nested, since a
+ * transaction commit invalidates pointers given out by peek().
+ *
+ * However, since we're updating a write buffer btree, if we
+ * return a transaction restart and loop we won't see that the
+ * backpointer has been deleted without an additional write
+ * buffer flush - and those are expensive.
+ *
+ * So we're relying on the caller immediately advancing to the
+ * next backpointer and starting a new transaction immediately
+ * after backpointer_get_key() returns NULL:
+ */
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ }
+out:
fsck_err:
printbuf_exit(&buf);
return ret;
}
-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- unsigned iter_flags,
- struct bkey_buf *last_flushed)
-{
- struct bch_fs *c = trans->c;
-
- if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
- return bkey_s_c_null;
-
- if (likely(!bp.v->level)) {
- bch2_trans_node_iter_init(trans, iter,
- bp.v->btree_id,
- bp.v->pos,
- 0, 0,
- iter_flags);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- if (bkey_err(k)) {
- bch2_trans_iter_exit(trans, iter);
- return k;
- }
-
- if (k.k &&
- extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
- return k;
-
- bch2_trans_iter_exit(trans, iter);
- int ret = backpointer_target_not_found(trans, bp, k, last_flushed);
- return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
- } else {
- struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed);
- if (IS_ERR_OR_NULL(b))
- return ((struct bkey_s_c) { .k = ERR_CAST(b) });
-
- return bkey_i_to_s_c(&b->key);
- }
-}
-
-struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
- struct bkey_s_c_backpointer bp,
- struct btree_iter *iter,
- struct bkey_buf *last_flushed)
+static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ struct bkey_buf *last_flushed,
+ bool commit)
{
struct bch_fs *c = trans->c;
@@ -287,7 +284,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
0,
bp.v->level - 1,
0);
- struct btree *b = bch2_btree_iter_peek_node(iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, iter);
if (IS_ERR_OR_NULL(b))
goto err;
@@ -298,9 +295,10 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
return b;
if (btree_node_will_make_reachable(b)) {
- b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
+ b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node));
} else {
- int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed);
+ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key),
+ last_flushed, commit);
b = ret ? ERR_PTR(ret) : NULL;
}
err:
@@ -308,6 +306,79 @@ err:
return b;
}
+static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ unsigned iter_flags,
+ struct bkey_buf *last_flushed,
+ bool commit)
+{
+ struct bch_fs *c = trans->c;
+
+ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c)))
+ return bkey_s_c_null;
+
+ bch2_trans_node_iter_init(trans, iter,
+ bp.v->btree_id,
+ bp.v->pos,
+ 0,
+ bp.v->level,
+ iter_flags);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
+ if (bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ /*
+ * peek_slot() doesn't normally return NULL - except when we ask for a
+ * key at a btree level that doesn't exist.
+ *
+ * We may want to revisit this and change peek_slot():
+ */
+ if (!k.k) {
+ bkey_init(&iter->k);
+ iter->k.p = bp.v->pos;
+ k.k = &iter->k;
+ }
+
+ if (k.k &&
+ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+
+ if (!bp.v->level) {
+ int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+ } else {
+ struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit);
+ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node))
+ return bkey_s_c_null;
+ if (IS_ERR_OR_NULL(b))
+ return ((struct bkey_s_c) { .k = ERR_CAST(b) });
+
+ return bkey_i_to_s_c(&b->key);
+ }
+}
+
+struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ struct bkey_buf *last_flushed)
+{
+ return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true);
+}
+
+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ struct btree_iter *iter,
+ unsigned iter_flags,
+ struct bkey_buf *last_flushed)
+{
+ return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true);
+}
+
static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k,
struct bkey_buf *last_flushed)
{
@@ -315,7 +386,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st
return 0;
struct bch_fs *c = trans->c;
- struct btree_iter alloc_iter = { NULL };
+ struct btree_iter alloc_iter = {};
struct bkey_s_c alloc_k;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -419,7 +490,8 @@ found:
bytes = p.crc.compressed_size << 9;
- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ,
+ BCH_DEV_READ_REF_check_extent_checksums);
if (!ca)
return false;
@@ -436,12 +508,11 @@ found:
if (ret)
goto err;
- prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
- prt_printf(&buf, "\n ");
+ prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n");
bch2_btree_id_to_text(&buf, btree);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, extent);
- prt_printf(&buf, "\n ");
+ prt_newline(&buf);
bch2_btree_id_to_text(&buf, o_btree);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, extent2);
@@ -457,7 +528,8 @@ err:
if (bio)
bio_put(bio);
kvfree(data_buf);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_check_extent_checksums);
printbuf_exit(&buf);
return ret;
}
@@ -504,7 +576,7 @@ check_existing_bp:
struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k);
struct bkey_s_c other_extent =
- bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL);
+ __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false);
ret = bkey_err(other_extent);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
ret = 0;
@@ -514,11 +586,28 @@ check_existing_bp:
if (!other_extent.k)
goto missing;
+ rcu_read_lock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode);
+ if (ca) {
+ struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent);
+ bkey_for_each_ptr(other_extent_ptrs, ptr)
+ if (ptr->dev == bp->k.p.inode &&
+ dev_ptr_stale_rcu(ca, ptr)) {
+ rcu_read_unlock();
+ ret = drop_dev_and_update(trans, other_bp.v->btree_id,
+ other_extent, bp->k.p.inode);
+ if (ret)
+ goto err;
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
if (bch2_extents_match(orig_k, other_extent)) {
printbuf_reset(&buf);
- prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n ");
+ prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n");
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_str(&buf, "\n ");
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, other_extent);
bch_err(c, "%s", buf.buf);
@@ -557,20 +646,20 @@ check_existing_bp:
}
printbuf_reset(&buf);
- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode);
+ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode);
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_str(&buf, "\n ");
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, other_extent);
bch_err(c, "%s", buf.buf);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
missing:
printbuf_reset(&buf);
- prt_str(&buf, "missing backpointer\n for: ");
+ prt_str(&buf, "missing backpointer\nfor: ");
bch2_bkey_val_to_text(&buf, c, orig_k);
- prt_printf(&buf, "\n want: ");
+ prt_printf(&buf, "\nwant: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i));
- prt_printf(&buf, "\n got: ");
+ prt_printf(&buf, "\ngot: ");
bch2_bkey_val_to_text(&buf, c, bp_k);
if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf))
@@ -590,28 +679,35 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- if (p.ptr.cached)
- continue;
-
if (p.ptr.dev == BCH_SB_MEMBER_INVALID)
continue;
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches);
- bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty);
- rcu_read_unlock();
+ bool empty;
+ {
+ /* scoped_guard() is a loop, so it breaks continue */
+ guard(rcu)();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
+ if (!ca)
+ continue;
+
+ if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr))
+ continue;
- if (check || empty) {
- struct bkey_i_backpointer bp;
- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+ u64 b = PTR_BUCKET_NR(ca, &p.ptr);
+ if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b))
+ continue;
- int ret = check
- ? check_bp_exists(trans, s, &bp, k)
- : bch2_bucket_backpointer_mod(trans, k, &bp, true);
- if (ret)
- return ret;
+ empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b);
}
+
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp);
+
+ int ret = !empty
+ ? check_bp_exists(trans, s, &bp, k)
+ : bch2_bucket_backpointer_mod(trans, k, &bp, true);
+ if (ret)
+ return ret;
}
return 0;
@@ -630,7 +726,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
retry:
bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
- b = bch2_btree_iter_peek_node(&iter);
+ b = bch2_btree_iter_peek_node(trans, &iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
@@ -649,14 +745,6 @@ err:
return ret;
}
-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
-{
- return (struct bbpos) {
- .btree = bp.btree_id,
- .pos = bp.pos,
- };
-}
-
static u64 mem_may_pin_bytes(struct bch_fs *c)
{
struct sysinfo i;
@@ -715,69 +803,11 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
return ret;
}
-struct progress_indicator_state {
- unsigned long next_print;
- u64 nodes_seen;
- u64 nodes_total;
- struct btree *last_node;
-};
-
-static inline void progress_init(struct progress_indicator_state *s,
- struct bch_fs *c,
- u64 btree_id_mask)
+static inline int bch2_fs_going_ro(struct bch_fs *c)
{
- memset(s, 0, sizeof(*s));
-
- s->next_print = jiffies + HZ * 10;
-
- for (unsigned i = 0; i < BTREE_ID_NR; i++) {
- if (!(btree_id_mask & BIT_ULL(i)))
- continue;
-
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = i,
- };
-
- u64 v;
- bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
- s->nodes_total += div64_ul(v, btree_sectors(c));
- }
-}
-
-static inline bool progress_update_p(struct progress_indicator_state *s)
-{
- bool ret = time_after_eq(jiffies, s->next_print);
-
- if (ret)
- s->next_print = jiffies + HZ * 10;
- return ret;
-}
-
-static void progress_update_iter(struct btree_trans *trans,
- struct progress_indicator_state *s,
- struct btree_iter *iter,
- const char *msg)
-{
- struct bch_fs *c = trans->c;
- struct btree *b = path_l(btree_iter_path(trans, iter))->b;
-
- s->nodes_seen += b != s->last_node;
- s->last_node = b;
-
- if (progress_update_p(s)) {
- struct printbuf buf = PRINTBUF;
- unsigned percent = s->nodes_total
- ? div64_u64(s->nodes_seen * 100, s->nodes_total)
- : 0;
-
- prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
- msg, percent, s->nodes_seen, s->nodes_total);
- bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
-
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ return test_bit(BCH_FS_going_ro, &c->flags)
+ ? -EROFS
+ : 0;
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
@@ -787,7 +817,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
struct progress_indicator_state progress;
int ret = 0;
- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink));
for (enum btree_id btree_id = 0;
btree_id < btree_id_nr_alive(c);
@@ -806,7 +836,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
- progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers");
+ bch2_fs_going_ro(c) ?:
check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}));
@@ -827,7 +858,7 @@ enum alloc_sector_counter {
ALLOC_SECTORS_NR
};
-static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t)
+static int data_type_to_alloc_counter(enum bch_data_type t)
{
switch (t) {
case BCH_DATA_btree:
@@ -836,15 +867,17 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t
case BCH_DATA_cached:
return ALLOC_cached;
case BCH_DATA_stripe:
+ case BCH_DATA_parity:
return ALLOC_stripe;
default:
- BUG();
+ return -1;
}
}
static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos);
static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k,
+ bool *had_mismatch,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
@@ -852,6 +885,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert);
bool need_commit = false;
+ *had_mismatch = false;
+
if (a->data_type == BCH_DATA_sb ||
a->data_type == BCH_DATA_journal ||
a->data_type == BCH_DATA_parity)
@@ -889,7 +924,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
if (bp.v->bucket_gen != a->gen)
continue;
- sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len;
+ int alloc_counter = data_type_to_alloc_counter(bp.v->data_type);
+ if (alloc_counter < 0)
+ continue;
+
+ sectors[alloc_counter] += bp.v->bucket_len;
};
bch2_trans_iter_exit(trans, &iter);
if (ret)
@@ -901,9 +940,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
goto err;
}
- /* Cached pointers don't have backpointers: */
-
if (sectors[ALLOC_dirty] != a->dirty_sectors ||
+ sectors[ALLOC_cached] != a->cached_sectors ||
sectors[ALLOC_stripe] != a->stripe_sectors) {
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) {
ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed);
@@ -912,17 +950,25 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b
}
if (sectors[ALLOC_dirty] > a->dirty_sectors ||
+ sectors[ALLOC_cached] > a->cached_sectors ||
sectors[ALLOC_stripe] > a->stripe_sectors) {
ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?:
- -BCH_ERR_transaction_restart_nested;
+ bch_err_throw(c, transaction_restart_nested);
goto err;
}
- if (!sectors[ALLOC_dirty] &&
- !sectors[ALLOC_stripe])
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty);
- else
- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches);
+ bool empty = (sectors[ALLOC_dirty] +
+ sectors[ALLOC_stripe] +
+ sectors[ALLOC_cached]) == 0;
+
+ ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch,
+ alloc_k.k->p.offset) ?:
+ (empty
+ ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty,
+ alloc_k.k->p.offset)
+ : 0);
+
+ *had_mismatch = true;
}
err:
bch2_dev_put(ca);
@@ -935,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
case KEY_TYPE_btree_ptr_v2: {
bool ret = false;
- rcu_read_lock();
+ guard(rcu)();
struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key;
while (pos.inode <= k.k->p.inode) {
if (pos.inode >= c->sb.nr_devices)
@@ -946,8 +992,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
goto next;
struct bpos bucket = bp_pos_to_bucket(ca, pos);
- bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches,
- ca->mi.nbuckets, bucket.offset);
+ u64 next = ca->mi.nbuckets;
+
+ unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets);
+ if (bitmap)
+ next = min_t(u64, next,
+ find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset));
+
+ bucket.offset = next;
if (bucket.offset == ca->mi.nbuckets)
goto next;
@@ -957,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k)
next:
pos = SPOS(pos.inode + 1, 0, 0);
}
- rcu_read_unlock();
return ret;
}
@@ -973,7 +1024,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k,
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
@@ -1056,28 +1107,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
int ret = 0;
- /*
- * Can't allow devices to come/go/resize while we have bucket bitmaps
- * allocated
- */
- lockdep_assert_held(&c->state_lock);
-
- for_each_member_device(c, ca) {
- BUG_ON(ca->bucket_backpointer_mismatches);
- ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long),
- GFP_KERNEL);
- ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!ca->bucket_backpointer_mismatches ||
- !ca->bucket_backpointer_empty) {
- bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap;
- goto err_free_bitmaps;
- }
- }
-
struct btree_trans *trans = bch2_trans_get(c);
struct extents_to_bp_state s = { .bp_start = POS_MIN };
@@ -1086,23 +1115,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
ret = for_each_btree_key(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_prefetch, k, ({
- check_bucket_backpointer_mismatch(trans, k, &s.last_flushed);
+ bool had_mismatch;
+ bch2_fs_going_ro(c) ?:
+ check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed);
}));
if (ret)
goto err;
- u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0;
+ u64 nr_buckets = 0, nr_mismatches = 0;
for_each_member_device(c, ca) {
nr_buckets += ca->mi.nbuckets;
- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets);
- nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets);
+ nr_mismatches += ca->bucket_backpointer_mismatch.nr;
}
- if (!nr_mismatches && !nr_empty)
+ if (!nr_mismatches)
goto err;
bch_info(c, "scanning for missing backpointers in %llu/%llu buckets",
- nr_mismatches + nr_empty, nr_buckets);
+ nr_mismatches, nr_buckets);
while (1) {
ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end);
@@ -1133,22 +1163,71 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
s.bp_start = bpos_successor(s.bp_end);
}
+
+ for_each_member_device(c, ca) {
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
+ }
err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&s.last_flushed, c);
bch2_btree_cache_unpin(c);
-err_free_bitmaps:
- for_each_member_device(c, ca) {
- kvfree(ca->bucket_backpointer_empty);
- ca->bucket_backpointer_empty = NULL;
- kvfree(ca->bucket_backpointer_mismatches);
- ca->bucket_backpointer_mismatches = NULL;
- }
bch_err_fn(c, ret);
return ret;
}
+static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans,
+ struct bpos bucket,
+ bool *had_mismatch,
+ struct bkey_buf *last_flushed)
+{
+ struct btree_iter alloc_iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter,
+ BTREE_ID_alloc, bucket,
+ BTREE_ITER_cached);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed);
+ bch2_trans_iter_exit(trans, &alloc_iter);
+ return ret;
+}
+
+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans,
+ struct bch_dev *ca, u64 bucket,
+ bool copygc,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ bool had_mismatch;
+ int ret = lockrestart_do(trans,
+ check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket),
+ &had_mismatch, last_flushed));
+ if (ret || !had_mismatch)
+ return ret;
+
+ u64 nr = ca->bucket_backpointer_mismatch.nr;
+ u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0;
+
+ struct printbuf buf = PRINTBUF;
+ __bch2_log_msg_start(ca->name, &buf);
+
+ prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n",
+ bucket, nr, ca->mi.nbuckets);
+
+ bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_extents_to_backpointers,
+ nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return 0;
+}
+
+/* backpointers -> extents */
+
static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
@@ -1206,11 +1285,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
- progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
+ bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers));
int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_prefetch, k, ({
- progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
+ bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
check_one_backpointer(trans, start, end, k, &last_flushed);
}));
@@ -1264,3 +1343,49 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
bch_err_fn(c, ret);
return ret;
}
+
+static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit)
+{
+ scoped_guard(mutex, &b->lock) {
+ if (!b->buckets) {
+ b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!b->buckets)
+ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
+ }
+
+ b->nr += !__test_and_set_bit(bit, b->buckets);
+ }
+
+ return 0;
+}
+
+int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b,
+ u64 old_size, u64 new_size)
+{
+ scoped_guard(mutex, &b->lock) {
+ if (!b->buckets)
+ return 0;
+
+ unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!n)
+ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap);
+
+ memcpy(n, b->buckets,
+ BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long));
+ kvfree(b->buckets);
+ b->buckets = n;
+ }
+
+ return 0;
+}
+
+void bch2_bucket_bitmap_free(struct bucket_bitmap *b)
+{
+ mutex_lock(&b->lock);
+ kvfree(b->buckets);
+ b->buckets = NULL;
+ b->nr = 0;
+ mutex_unlock(&b->lock);
+}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 060dad1521ee..7e71afee1ac0 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
-#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#ifndef _BCACHEFS_BACKPOINTERS_H
+#define _BCACHEFS_BACKPOINTERS_H
#include "btree_cache.h"
#include "btree_iter.h"
@@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca,
static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode);
if (ca)
*bucket = bp_pos_to_bucket(ca, bp_pos);
- rcu_read_unlock();
return ca != NULL;
}
@@ -102,7 +101,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_i_backpointer *bp,
bool insert)
{
- if (unlikely(bch2_backpointers_no_use_write_buffer))
+ if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer))
return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert);
if (!insert) {
@@ -123,7 +122,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
return BCH_DATA_btree;
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
- return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+ if (p.has_ec)
+ return BCH_DATA_stripe;
+ if (p.ptr.cached)
+ return BCH_DATA_cached;
+ else
+ return BCH_DATA_user;
case KEY_TYPE_stripe: {
const struct bch_extent_ptr *ptr = &entry->ptr;
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
@@ -147,7 +151,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
struct bkey_i_backpointer *bp)
{
bkey_backpointer_init(&bp->k_i);
- bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset);
+ bp->k.p.inode = p.ptr.dev;
+
+ if (k.k->type != KEY_TYPE_stripe)
+ bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset;
+ else {
+ /*
+ * Put stripe backpointers where they won't collide with the
+ * extent backpointers within the stripe:
+ */
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) <<
+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1;
+ }
+
bp->v = (struct bch_backpointer) {
.btree_id = btree_id,
.level = level,
@@ -164,8 +181,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b
struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer,
struct btree_iter *, struct bkey_buf *);
+int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64,
+ bool, struct bkey_buf *);
+
int bch2_check_btree_backpointers(struct bch_fs *);
int bch2_check_extents_to_backpointers(struct bch_fs *);
int bch2_check_backpointers_to_extents(struct bch_fs *);
+static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i)
+{
+ unsigned long *bitmap = READ_ONCE(b->buckets);
+ return bitmap && test_bit(i, bitmap);
+}
+
+int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64);
+void bch2_bucket_bitmap_free(struct bucket_bitmap *);
+
#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 161cf2f05d2a..ddfacad0f70c 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -183,6 +183,16 @@
#define pr_fmt(fmt) "%s() " fmt "\n", __func__
#endif
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define ENUMERATED_REF_DEBUG
+#endif
+
+#ifndef dynamic_fault
+#define dynamic_fault(...) 0
+#endif
+
+#define race_fault(...) dynamic_fault("bcachefs:race")
+
#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
@@ -203,29 +213,46 @@
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
+#include <linux/unicode.h>
#include "bcachefs_format.h"
#include "btree_journal_iter_types.h"
#include "disk_accounting_types.h"
#include "errcode.h"
+#include "fast_list.h"
#include "fifo.h"
#include "nocow_locking_types.h"
#include "opts.h"
-#include "recovery_passes_types.h"
#include "sb-errors_types.h"
#include "seqmutex.h"
+#include "snapshot_types.h"
#include "time_stats.h"
#include "util.h"
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define BCH_WRITE_REF_DEBUG
-#endif
-
-#ifndef dynamic_fault
-#define dynamic_fault(...) 0
-#endif
+#include "alloc_types.h"
+#include "async_objs_types.h"
+#include "btree_gc_types.h"
+#include "btree_types.h"
+#include "btree_node_scan_types.h"
+#include "btree_write_buffer_types.h"
+#include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
+#include "clock_types.h"
+#include "disk_groups_types.h"
+#include "ec_types.h"
+#include "enumerated_ref_types.h"
+#include "journal_types.h"
+#include "keylist_types.h"
+#include "quota_types.h"
+#include "rebalance_types.h"
+#include "recovery_passes_types.h"
+#include "replicas_types.h"
+#include "sb-members_types.h"
+#include "subvolume_types.h"
+#include "super_types.h"
+#include "thread_with_file_types.h"
-#define race_fault(...) dynamic_fault("bcachefs:race")
+#include "trace.h"
#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
@@ -268,7 +295,7 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
-void bch2_print_str(struct bch_fs *, const char *);
+void bch2_print_str(struct bch_fs *, const char *, const char *);
__printf(2, 3)
void bch2_print_opts(struct bch_opts *, const char *, ...);
@@ -292,6 +319,16 @@ do { \
bch2_print(_c, __VA_ARGS__); \
} while (0)
+#define bch2_print_str_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print_str(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_info_ratelimited(c, fmt, ...) \
@@ -367,6 +404,14 @@ do { \
pr_info(fmt, ##__VA_ARGS__); \
} while (0)
+static inline int __bch2_err_trace(struct bch_fs *c, int err)
+{
+ trace_error_throw(c, err, _THIS_IP_);
+ return err;
+}
+
+#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
@@ -389,17 +434,20 @@ do { \
"compare them") \
BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
"Don't use the write buffer for backpointers, enabling "\
- "extra runtime checks")
-
-/* Parameters that should only be compiled in debug mode: */
-#define BCH_DEBUG_PARAMS_DEBUG() \
- BCH_DEBUG_PARAM(expensive_debug_checks, \
- "Enables various runtime debugging checks that " \
- "significantly affect performance") \
+ "extra runtime checks") \
+ BCH_DEBUG_PARAM(debug_check_btree_locking, \
+ "Enable additional asserts for btree locking") \
BCH_DEBUG_PARAM(debug_check_iterators, \
"Enables extra verification for btree iterators") \
+ BCH_DEBUG_PARAM(debug_check_bset_lookups, \
+ "Enables extra verification for bset lookups") \
BCH_DEBUG_PARAM(debug_check_btree_accounting, \
"Verify btree accounting for keys within a node") \
+ BCH_DEBUG_PARAM(debug_check_bkey_unpack, \
+ "Enables extra verification for bkey unpack")
+
+/* Parameters that should only be compiled in debug mode: */
+#define BCH_DEBUG_PARAMS_DEBUG() \
BCH_DEBUG_PARAM(journal_seq_verify, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
@@ -426,15 +474,9 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#ifndef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
-BCH_DEBUG_PARAMS_DEBUG()
+#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
+BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
-#endif
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
@@ -442,12 +484,18 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_compact) \
x(btree_node_merge) \
x(btree_node_sort) \
+ x(btree_node_get) \
x(btree_node_read) \
x(btree_node_read_done) \
+ x(btree_node_write) \
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
x(data_write) \
+ x(data_write_to_submit) \
+ x(data_write_to_queue) \
+ x(data_write_to_btree_update) \
+ x(data_write_btree_update) \
x(data_read) \
x(data_promote) \
x(journal_flush_write) \
@@ -456,6 +504,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_low_on_space) \
x(blocked_journal_low_on_pin) \
x(blocked_journal_max_in_flight) \
+ x(blocked_journal_max_open) \
x(blocked_key_cache_flush) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
@@ -469,26 +518,6 @@ enum bch_time_stats {
BCH_TIME_STAT_NR
};
-#include "alloc_types.h"
-#include "btree_gc_types.h"
-#include "btree_types.h"
-#include "btree_node_scan_types.h"
-#include "btree_write_buffer_types.h"
-#include "buckets_types.h"
-#include "buckets_waiting_for_journal_types.h"
-#include "clock_types.h"
-#include "disk_groups_types.h"
-#include "ec_types.h"
-#include "journal_types.h"
-#include "keylist_types.h"
-#include "quota_types.h"
-#include "rebalance_types.h"
-#include "replicas_types.h"
-#include "sb-members_types.h"
-#include "subvolume_types.h"
-#include "super_types.h"
-#include "thread_with_file_types.h"
-
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -511,6 +540,57 @@ struct discard_in_flight {
u64 bucket:63;
};
+#define BCH_DEV_READ_REFS() \
+ x(bch2_online_devs) \
+ x(trans_mark_dev_sbs) \
+ x(read_fua_test) \
+ x(sb_field_resize) \
+ x(write_super) \
+ x(journal_read) \
+ x(fs_journal_alloc) \
+ x(fs_resize_on_mount) \
+ x(btree_node_read) \
+ x(btree_node_read_all_replicas) \
+ x(btree_node_scrub) \
+ x(btree_node_write) \
+ x(btree_node_scan) \
+ x(btree_verify_replicas) \
+ x(btree_node_ondisk_to_text) \
+ x(io_read) \
+ x(check_extent_checksums) \
+ x(ec_block)
+
+enum bch_dev_read_ref {
+#define x(n) BCH_DEV_READ_REF_##n,
+ BCH_DEV_READ_REFS()
+#undef x
+ BCH_DEV_READ_REF_NR,
+};
+
+#define BCH_DEV_WRITE_REFS() \
+ x(journal_write) \
+ x(journal_do_discards) \
+ x(dev_do_discards) \
+ x(discard_one_bucket_fast) \
+ x(do_invalidates) \
+ x(nocow_flush) \
+ x(io_write) \
+ x(ec_block) \
+ x(ec_bucket_zero)
+
+enum bch_dev_write_ref {
+#define x(n) BCH_DEV_WRITE_REF_##n,
+ BCH_DEV_WRITE_REFS()
+#undef x
+ BCH_DEV_WRITE_REF_NR,
+};
+
+struct bucket_bitmap {
+ unsigned long *buckets;
+ u64 nr;
+ struct mutex lock;
+};
+
struct bch_dev {
struct kobject kobj;
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -521,8 +601,7 @@ struct bch_dev {
struct percpu_ref ref;
#endif
struct completion ref_completion;
- struct percpu_ref io_ref;
- struct completion io_ref_completion;
+ struct enumerated_ref io_ref[2];
struct bch_fs *fs;
@@ -533,6 +612,7 @@ struct bch_dev {
*/
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
+ unsigned long write_errors_start;
__uuid_t uuid;
char name[BDEVNAME_SIZE];
@@ -555,10 +635,11 @@ struct bch_dev {
u8 *oldest_gen;
unsigned long *buckets_nouse;
- unsigned long *bucket_backpointer_mismatches;
- unsigned long *bucket_backpointer_empty;
+ struct bucket_bitmap bucket_backpointer_mismatch;
+ struct bucket_bitmap bucket_backpointer_empty;
- struct bch_dev_usage __percpu *usage;
+ struct bch_dev_usage_full __percpu
+ *usage;
/* Allocator: */
u64 alloc_cursor[3];
@@ -567,10 +648,6 @@ struct bch_dev {
unsigned nr_partial_buckets;
unsigned nr_btree_reserve;
- size_t inc_gen_needs_gc;
- size_t inc_gen_really_needs_gc;
- size_t buckets_waiting_on_journal;
-
struct work_struct invalidate_work;
struct work_struct discard_work;
struct mutex discard_buckets_in_flight_lock;
@@ -609,21 +686,23 @@ struct bch_dev {
x(accounting_replay_done) \
x(may_go_rw) \
x(rw) \
+ x(rw_init_done) \
x(was_rw) \
x(stopping) \
x(emergency_ro) \
x(going_ro) \
x(write_disable_complete) \
x(clean_shutdown) \
- x(recovery_running) \
- x(fsck_running) \
+ x(in_recovery) \
+ x(in_fsck) \
x(initial_gc_unfixed) \
x(need_delete_dead_snapshots) \
x(error) \
x(topology_error) \
x(errors_fixed) \
x(errors_not_fixed) \
- x(no_invalid_checks)
+ x(no_invalid_checks) \
+ x(discard_mount_opt_set) \
enum bch_fs_flags {
#define x(n) BCH_FS_##n,
@@ -642,8 +721,10 @@ struct btree_transaction_stats {
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
- unsigned journal_entries_size;
unsigned max_mem;
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_trans_kmalloc_trace trans_kmalloc_trace;
+#endif
char *max_paths_text;
};
@@ -664,9 +745,6 @@ struct btree_trans_buf {
struct btree_trans *trans;
};
-#define BCACHEFS_ROOT_SUBVOL_INUM \
- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
-
#define BCH_WRITE_REFS() \
x(journal) \
x(trans) \
@@ -687,7 +765,10 @@ struct btree_trans_buf {
x(gc_gens) \
x(snapshot_delete_pagecache) \
x(sysfs) \
- x(btree_write_buffer)
+ x(btree_write_buffer) \
+ x(btree_node_scrub) \
+ x(async_recovery_passes) \
+ x(ioctl_data)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -696,6 +777,8 @@ enum bch_write_ref {
BCH_WRITE_REF_NR,
};
+#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0)
+
struct bch_fs {
struct closure cl;
@@ -719,11 +802,7 @@ struct bch_fs {
struct rw_semaphore state_lock;
/* Counts outstanding writes, for clean transition to read-only */
-#ifdef BCH_WRITE_REF_DEBUG
- atomic_long_t writes[BCH_WRITE_REF_NR];
-#else
- struct percpu_ref writes;
-#endif
+ struct enumerated_ref writes;
/*
* Certain operations are only allowed in single threaded mode, during
* recovery, and we want to assert that this is the case:
@@ -767,6 +846,7 @@ struct bch_fs {
u8 nr_devices;
u8 clean;
+ bool multi_device; /* true if we've ever had more than one device */
u8 encryption_type;
@@ -776,10 +856,14 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
+ u64 recovery_passes_required;
unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;
+ DARRAY(enum bcachefs_metadata_version)
+ incompat_versions_requested;
+ struct unicode_map *cf_encoding;
struct bch_sb_handle disk_sb;
@@ -795,7 +879,7 @@ struct bch_fs {
struct mutex snapshot_table_lock;
struct rw_semaphore snapshot_create_lock;
- struct work_struct snapshot_delete_work;
+ struct snapshot_delete snapshot_delete;
struct work_struct snapshot_wait_for_pagecache_and_delete_work;
snapshot_id_list snapshots_unlinked;
struct mutex snapshots_unlinked_lock;
@@ -860,7 +944,7 @@ struct bch_fs {
struct btree_write_buffer btree_write_buffer;
struct workqueue_struct *btree_update_wq;
- struct workqueue_struct *btree_io_complete_wq;
+ struct workqueue_struct *btree_write_complete_wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
/*
@@ -871,6 +955,7 @@ struct bch_fs {
struct workqueue_struct *write_ref_wq;
/* ALLOCATION */
+ struct bch_devs_mask online_devs;
struct bch_devs_mask rw_devs[BCH_DATA_NR];
unsigned long rw_devs_change_count;
@@ -965,13 +1050,16 @@ struct bch_fs {
nocow_locks;
struct rhashtable promote_table;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR];
+#endif
+
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR];
size_t zstd_workspace_size;
- struct crypto_shash *sha256;
- struct crypto_sync_skcipher *chacha20;
- struct crypto_shash *poly1305;
+ struct bch_key chacha20_key;
+ bool chacha20_key_set;
atomic64_t key_version;
@@ -993,15 +1081,11 @@ struct bch_fs {
wait_queue_head_t copygc_running_wq;
/* STRIPES: */
- GENRADIX(struct stripe) stripes;
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
spinlock_t ec_stripes_new_lock;
- ec_stripes_heap ec_stripes_heap;
- struct mutex ec_stripes_heap_lock;
-
/* ERASURE CODING */
struct list_head ec_stripe_head_list;
struct mutex ec_stripe_head_lock;
@@ -1039,25 +1123,12 @@ struct bch_fs {
/* RECOVERY */
u64 journal_replay_seq_start;
u64 journal_replay_seq_end;
- /*
- * Two different uses:
- * "Has this fsck pass?" - i.e. should this type of error be an
- * emergency read-only
- * And, in certain situations fsck will rewind to an earlier pass: used
- * for signaling to the toplevel code which pass we want to run now.
- */
- enum bch_recovery_pass curr_recovery_pass;
- enum bch_recovery_pass next_recovery_pass;
- /* bitmask of recovery passes that we actually ran */
- u64 recovery_passes_complete;
- /* never rewinds version of curr_recovery_pass */
- enum bch_recovery_pass recovery_pass_done;
- spinlock_t recovery_pass_lock;
- struct semaphore online_fsck_mutex;
+ struct bch_fs_recovery recovery;
/* DEBUG JUNK */
struct dentry *fs_debug_dir;
struct dentry *btree_debug_dir;
+ struct dentry *async_obj_dir;
struct btree_debug btree_debug[BTREE_ID_NR];
struct btree *verify_data;
struct btree_node *verify_ondisk;
@@ -1099,54 +1170,6 @@ struct bch_fs {
extern struct wait_queue_head bch2_read_only_wait;
-static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- atomic_long_inc(&c->writes[ref]);
-#else
- percpu_ref_get(&c->writes);
-#endif
-}
-
-static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
- atomic_long_inc_not_zero(&c->writes[ref]);
-#else
- return percpu_ref_tryget(&c->writes);
-#endif
-}
-
-static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_going_ro, &c->flags) &&
- atomic_long_inc_not_zero(&c->writes[ref]);
-#else
- return percpu_ref_tryget_live(&c->writes);
-#endif
-}
-
-static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
-{
-#ifdef BCH_WRITE_REF_DEBUG
- long v = atomic_long_dec_return(&c->writes[ref]);
-
- BUG_ON(v < 0);
- if (v)
- return;
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
- if (atomic_long_read(&c->writes[i]))
- return;
-
- set_bit(BCH_FS_write_disable_complete, &c->flags);
- wake_up(&bch2_read_only_wait);
-#else
- percpu_ref_put(&c->writes);
-#endif
-}
-
static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
{
if (test_bit(BCH_FS_stopping, &c->flags))
@@ -1247,4 +1270,26 @@ static inline unsigned data_replicas_required(struct bch_fs *c)
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
+/*
+ * This is needed because discard is both a filesystem option and a device
+ * option, and mount options are supposed to apply to that mount and not be
+ * persisted, i.e. if it's set as a mount option we can't propagate it to the
+ * device.
+ */
+static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca)
+{
+ return test_bit(BCH_FS_discard_mount_opt_set, &c->flags)
+ ? c->opts.discard
+ : ca->mi.discard;
+}
+
+static inline bool bch2_fs_casefold_enabled(struct bch_fs *c)
+{
+#ifdef CONFIG_UNICODE
+ return !c->opts.casefold_disabled;
+#else
+ return false;
+#endif
+}
+
#endif /* _BCACHEFS_H */
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index f70f0108401f..b4a04df5ea95 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k)
#define __BKEY_PADDED(key, pad) \
struct bkey_i key; __u64 key ## _pad[pad]
+enum bch_bkey_type_flags {
+ BKEY_TYPE_strict_btree_checks = BIT(0),
+};
+
/*
* - DELETED keys are used internally to mark keys that should be ignored but
* override keys in composition order. Their version number is ignored.
@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k)
*
* - WHITEOUT: for hash table btrees
*/
-#define BCH_BKEY_TYPES() \
- x(deleted, 0) \
- x(whiteout, 1) \
- x(error, 2) \
- x(cookie, 3) \
- x(hash_whiteout, 4) \
- x(btree_ptr, 5) \
- x(extent, 6) \
- x(reservation, 7) \
- x(inode, 8) \
- x(inode_generation, 9) \
- x(dirent, 10) \
- x(xattr, 11) \
- x(alloc, 12) \
- x(quota, 13) \
- x(stripe, 14) \
- x(reflink_p, 15) \
- x(reflink_v, 16) \
- x(inline_data, 17) \
- x(btree_ptr_v2, 18) \
- x(indirect_inline_data, 19) \
- x(alloc_v2, 20) \
- x(subvolume, 21) \
- x(snapshot, 22) \
- x(inode_v2, 23) \
- x(alloc_v3, 24) \
- x(set, 25) \
- x(lru, 26) \
- x(alloc_v4, 27) \
- x(backpointer, 28) \
- x(inode_v3, 29) \
- x(bucket_gens, 30) \
- x(snapshot_tree, 31) \
- x(logged_op_truncate, 32) \
- x(logged_op_finsert, 33) \
- x(accounting, 34) \
- x(inode_alloc_cursor, 35)
+#define BCH_BKEY_TYPES() \
+ x(deleted, 0, 0) \
+ x(whiteout, 1, 0) \
+ x(error, 2, 0) \
+ x(cookie, 3, 0) \
+ x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
+ x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
+ x(extent, 6, BKEY_TYPE_strict_btree_checks) \
+ x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
+ x(inode, 8, BKEY_TYPE_strict_btree_checks) \
+ x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
+ x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
+ x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
+ x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
+ x(quota, 13, BKEY_TYPE_strict_btree_checks) \
+ x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
+ x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
+ x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
+ x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
+ x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
+ x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
+ x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
+ x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
+ x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
+ x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
+ x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
+ x(set, 25, 0) \
+ x(lru, 26, BKEY_TYPE_strict_btree_checks) \
+ x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
+ x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
+ x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
+ x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
+ x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
+ x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
+ x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
+ x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
+ x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
enum bch_bkey_type {
-#define x(name, nr) KEY_TYPE_##name = nr,
+#define x(name, nr, ...) KEY_TYPE_##name = nr,
BCH_BKEY_TYPES()
#undef x
KEY_TYPE_MAX,
@@ -493,7 +497,8 @@ struct bch_sb_field {
x(members_v2, 11) \
x(errors, 12) \
x(ext, 13) \
- x(downgrade, 14)
+ x(downgrade, 14) \
+ x(recovery_passes, 15)
#include "alloc_background_format.h"
#include "dirent_format.h"
@@ -506,6 +511,7 @@ struct bch_sb_field {
#include "logged_ops_format.h"
#include "lru_format.h"
#include "quota_format.h"
+#include "recovery_passes_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
#include "snapshot_format.h"
@@ -686,7 +692,15 @@ struct bch_sb_field_ext {
x(inode_depth, BCH_VERSION(1, 17)) \
x(persistent_inode_cursors, BCH_VERSION(1, 18)) \
x(autofix_errors, BCH_VERSION(1, 19)) \
- x(directory_size, BCH_VERSION(1, 20))
+ x(directory_size, BCH_VERSION(1, 20)) \
+ x(cached_backpointers, BCH_VERSION(1, 21)) \
+ x(stripe_backpointers, BCH_VERSION(1, 22)) \
+ x(stripe_lru, BCH_VERSION(1, 23)) \
+ x(casefolding, BCH_VERSION(1, 24)) \
+ x(extent_flags, BCH_VERSION(1, 25)) \
+ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
+ x(fast_device_removal, BCH_VERSION(1, 27)) \
+ x(inode_has_case_insensitive, BCH_VERSION(1, 28))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@@ -837,6 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29);
LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34);
@@ -855,6 +870,11 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48);
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
+LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22);
+LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23);
+LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
@@ -908,7 +928,10 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
x(journal_no_flush, 16) \
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18) \
- x(incompat_version_field, 19)
+ x(incompat_version_field, 19) \
+ x(casefolding, 20) \
+ x(no_alloc_info, 21) \
+ x(small_image, 22)
#define BCH_SB_FEATURES_ALWAYS \
(BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \
@@ -922,7 +945,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u
BIT_ULL(BCH_FEATURE_new_siphash)| \
BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \
BIT_ULL(BCH_FEATURE_new_varint)| \
- BIT_ULL(BCH_FEATURE_journal_no_flush))
+ BIT_ULL(BCH_FEATURE_journal_no_flush)| \
+ BIT_ULL(BCH_FEATURE_incompat_version_field))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@@ -974,6 +998,19 @@ enum bch_error_actions {
BCH_ON_ERROR_NR
};
+#define BCH_DEGRADED_ACTIONS() \
+ x(ask, 0) \
+ x(yes, 1) \
+ x(very, 2) \
+ x(no, 3)
+
+enum bch_degraded_actions {
+#define x(t, n) BCH_DEGRADED_##t = n,
+ BCH_DEGRADED_ACTIONS()
+#undef x
+ BCH_DEGRADED_ACTIONS_NR
+};
+
#define BCH_STR_HASH_TYPES() \
x(crc32c, 0) \
x(crc64, 1) \
@@ -1133,7 +1170,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(log, 9) \
x(overwrite, 10) \
x(write_buffer_keys, 11) \
- x(datetime, 12)
+ x(datetime, 12) \
+ x(log_bkey, 13)
enum bch_jset_entry_type {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index 3c23bdf788ce..52594e925eb7 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -87,6 +87,7 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
#define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting)
+#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters)
/* ioctl below act on a particular file, not the filesystem as a whole: */
@@ -215,6 +216,10 @@ struct bch_ioctl_data {
union {
struct {
__u32 dev;
+ __u32 data_types;
+ } scrub;
+ struct {
+ __u32 dev;
__u32 pad;
} migrate;
struct {
@@ -229,6 +234,11 @@ enum bch_data_event {
BCH_DATA_EVENT_NR = 1,
};
+enum data_progress_data_type_special {
+ DATA_PROGRESS_DATA_TYPE_phys = 254,
+ DATA_PROGRESS_DATA_TYPE_done = 255,
+};
+
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
@@ -237,11 +247,19 @@ struct bch_ioctl_data_progress {
__u64 sectors_done;
__u64 sectors_total;
+ __u64 sectors_error_corrected;
+ __u64 sectors_error_uncorrected;
} __packed __aligned(8);
+enum bch_ioctl_data_event_ret {
+ BCH_IOCTL_DATA_EVENT_RET_done = 1,
+ BCH_IOCTL_DATA_EVENT_RET_device_offline = 2,
+};
+
struct bch_ioctl_data_event {
__u8 type;
- __u8 pad[7];
+ __u8 ret;
+ __u8 pad[6];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
@@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting {
struct bkey_i_accounting accounting[];
};
+#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0)
+
+struct bch_ioctl_query_counters {
+ __u16 nr;
+ __u16 flags;
+ __u32 pad;
+ __u64 d[];
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 995ba32e9b6e..ee823c640642 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format)
+static void __bch2_bkey_pack_verify(const struct bkey_packed *packed,
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
{
struct bkey tmp;
@@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
}
}
-#else
static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
- const struct bkey *unpacked,
- const struct bkey_format *format) {}
-#endif
+ const struct bkey *unpacked,
+ const struct bkey_format *format)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack))
+ __bch2_bkey_pack_verify(packed, unpacked, format);
+}
struct pack_state {
const struct bkey_format *format;
@@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
return ret;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
static bool bkey_packed_successor(struct bkey_packed *out,
const struct btree *b,
struct bkey_packed k)
@@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
return false;
}
-#endif
/*
* Returns a packed key that compares <= in
@@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
const struct bkey_format *f = &b->format;
struct pack_state state = pack_state_init(f, out);
u64 *w = out->_data;
-#ifdef CONFIG_BCACHEFS_DEBUG
struct bpos orig = in;
-#endif
bool exact = true;
unsigned i;
@@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
out->format = KEY_FORMAT_LOCAL_BTREE;
out->type = KEY_TYPE_deleted;
-#ifdef CONFIG_BCACHEFS_DEBUG
- if (exact) {
- BUG_ON(bkey_cmp_left_packed(b, out, &orig));
- } else {
- struct bkey_packed successor;
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
+ if (exact) {
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
+ } else {
+ struct bkey_packed successor;
- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
- BUG_ON(bkey_packed_successor(&successor, b, *out) &&
- bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
- !bkey_format_has_too_big_fields(f));
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
+ bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+ !bkey_format_has_too_big_fields(f));
+ }
}
-#endif
return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
}
@@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
- {
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
struct printbuf buf = PRINTBUF;
BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
printbuf_exit(&buf);
}
-#endif
+
return ret;
}
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 054e2d5e8448..3ccd521c190a 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r)
static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
{
return bpos_eq(l.k->p, r.k->p) &&
+ l.k->size == r.k->size &&
bkey_bytes(l.k) == bkey_bytes(r.k) &&
!memcmp(l.v, r.v, bkey_val_bytes(l.k));
}
@@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b,
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(dst, src);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- bch2_expensive_debug_checks) {
+ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 15c93576b5c2..fcd8c82cba4f 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -21,7 +21,7 @@
#include "xattr.h"
const char * const bch2_bkey_types[] = {
-#define x(name, nr) #name,
+#define x(name, nr, ...) #name,
BCH_BKEY_TYPES()
#undef x
NULL
@@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
})
const struct bkey_ops bch2_bkey_ops[] = {
-#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
+#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
};
@@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = {
#undef x
};
+static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
+#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
+ BCH_BKEY_TYPES()
+#undef x
+};
+
const char *bch2_btree_node_type_str(enum btree_node_type type)
{
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
@@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
- bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
- (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
+ enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
+ ? bch2_bkey_type_flags[k.k->type]
+ : 0;
+
+ bool strict_key_type_allowed =
+ (from.flags & BCH_VALIDATE_commit) ||
+ type == BKEY_TYPE_btree ||
+ (from.btree < BTREE_ID_NR &&
+ (bkey_flags & BKEY_TYPE_strict_btree_checks));
+
+ bkey_fsck_err_on(strict_key_type_allowed &&
+ k.k->type < KEY_TYPE_MAX &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",
@@ -340,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return ops->key_merge &&
bch2_bkey_maybe_mergable(l.k, r.k) &&
(u64) l.k->size + r.k->size <= KEY_SIZE_MAX &&
- !bch2_key_merging_disabled &&
+ !static_branch_unlikely(&bch2_key_merging_disabled) &&
ops->key_merge(c, l, r);
}
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 9a4a83d6fd2d..32841f762eb2 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
return nr;
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-
void __bch2_verify_btree_nr_keys(struct btree *b)
{
struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
@@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
}
-static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
+static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
struct btree *b)
{
struct btree_node_iter iter = *_iter;
@@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
}
}
-void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b)
+void __bch2_btree_node_iter_verify(struct btree_node_iter *iter,
+ struct btree *b)
{
struct btree_node_iter_set *set, *s2;
struct bkey_packed *k, *p;
@@ -237,8 +235,8 @@ found:
}
}
-void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
- struct bkey_packed *insert, unsigned clobber_u64s)
+static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
+ struct bkey_packed *insert, unsigned clobber_u64s)
{
struct bset_tree *t = bch2_bkey_to_bset(b, where);
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
@@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
#endif
}
-#else
-
-static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
- struct btree *b) {}
+static inline void bch2_verify_insert_pos(struct btree *b,
+ struct bkey_packed *where,
+ struct bkey_packed *insert,
+ unsigned clobber_u64s)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bch2_verify_insert_pos(b, where, insert, clobber_u64s);
+}
-#endif
/* Auxiliary search trees */
@@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b,
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(struct btree *b)
+static void __bset_aux_tree_verify(struct btree *b)
{
-#ifdef CONFIG_BCACHEFS_DEBUG
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
continue;
@@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b)
BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
}
-#endif
+}
+
+static inline void bset_aux_tree_verify(struct btree *b)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bset_aux_tree_verify(b);
}
void bch2_btree_keys_init(struct btree *b)
@@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
};
}
-static void bch2_bset_verify_rw_aux_tree(struct btree *b,
- struct bset_tree *t)
+static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *k = btree_bkey_first(b, t);
unsigned j = 0;
- if (!bch2_expensive_debug_checks)
- return;
-
BUG_ON(bset_has_ro_aux_tree(t));
if (!bset_has_rw_aux_tree(t))
@@ -530,6 +531,13 @@ start:
}
}
+static inline void bch2_bset_verify_rw_aux_tree(struct btree *b,
+ struct bset_tree *t)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bch2_bset_verify_rw_aux_tree(b, t);
+}
+
/* returns idx of first entry >= offset: */
static unsigned rw_aux_tree_bsearch(struct btree *b,
struct bset_tree *t,
@@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
k = p;
}
- if (bch2_expensive_debug_checks) {
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
BUG_ON(ret >= orig_k);
for (i = ret
@@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
bkey_iter_pos_cmp(b, m, search) < 0)
m = bkey_p_next(m);
- if (bch2_expensive_debug_checks) {
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
@@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
- if (bch2_expensive_debug_checks) {
- bch2_btree_node_iter_verify(iter, b);
- bch2_btree_node_iter_next_check(iter, b);
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) {
+ __bch2_btree_node_iter_verify(iter, b);
+ __bch2_btree_node_iter_next_check(iter, b);
}
__bch2_btree_node_iter_advance(iter, b);
@@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
struct btree_node_iter_set *set;
unsigned end = 0;
- if (bch2_expensive_debug_checks)
- bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
k = bch2_bkey_prev_all(b, t,
@@ -1489,8 +1496,7 @@ found:
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
- if (bch2_expensive_debug_checks)
- bch2_btree_node_iter_verify(iter, b);
+ bch2_btree_node_iter_verify(iter, b);
return prev;
}
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 6953d55b72cc..a15ecf9d006e 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
void bch2_dump_btree_node(struct bch_fs *, struct btree *);
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
-#ifdef CONFIG_BCACHEFS_DEBUG
-
void __bch2_verify_btree_nr_keys(struct btree *);
-void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
- struct bkey_packed *, unsigned);
-
-#else
+void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
-static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
- struct btree *b) {}
-static inline void bch2_verify_insert_pos(struct btree *b,
- struct bkey_packed *where,
- struct bkey_packed *insert,
- unsigned clobber_u64s) {}
-#endif
+ struct btree *b)
+{
+ if (static_branch_unlikely(&bch2_debug_check_bset_lookups))
+ __bch2_btree_node_iter_verify(iter, b);
+}
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
- if (bch2_debug_check_btree_accounting)
+ if (static_branch_unlikely(&bch2_debug_check_btree_accounting))
__bch2_verify_btree_nr_keys(b);
}
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index ca755e8d1a37..83c9860e6b82 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -17,12 +17,6 @@
#include <linux/sched/mm.h>
#include <linux/swap.h>
-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \
-do { \
- if (shrinker_counter) \
- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \
-} while (0)
-
const char * const bch2_btree_node_flags[] = {
"typebit",
"typebit",
@@ -91,7 +85,7 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
six_unlock_intent(&b->c.lock);
}
-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+void __btree_node_data_free(struct btree *b)
{
BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
@@ -118,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
munmap(b->aux_data, btree_aux_data_bytes(b));
#endif
b->aux_data = NULL;
-
- btree_node_to_freedlist(bc, b);
}
static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
{
BUG_ON(list_empty(&b->list));
list_del_init(&b->list);
+
+ __btree_node_data_free(b);
+
--bc->nr_freeable;
- __btree_node_data_free(bc, b);
+ btree_node_to_freedlist(bc, b);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
@@ -155,7 +150,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->data = kvmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
#ifdef __KERNEL__
b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp);
#else
@@ -168,7 +163,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
if (!b->aux_data) {
kvfree(b->data);
b->data = NULL;
- return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
+ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc);
}
return 0;
@@ -191,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
{
- struct btree_cache *bc = &c->btree_cache;
- struct btree *b;
-
- b = __btree_node_mem_alloc(c, GFP_KERNEL);
+ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
return NULL;
@@ -203,9 +195,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
return NULL;
}
- bch2_btree_lock_init(&b->c, 0);
-
- __bch2_btree_node_to_freelist(bc, b);
+ bch2_btree_lock_init(&b->c, 0, GFP_KERNEL);
return b;
}
@@ -350,115 +340,118 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc,
return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
-/*
- * this version is for btree nodes that have already been freed (we're not
- * reaping a real btree node)
- */
-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter)
+static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b,
+ bool flush, bool locked)
{
struct btree_cache *bc = &c->btree_cache;
- int ret = 0;
lockdep_assert_held(&bc->lock);
-wait_on_io:
- if (b->flags & ((1U << BTREE_NODE_dirty)|
- (1U << BTREE_NODE_read_in_flight)|
+
+ if (btree_node_noevict(b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+ if (btree_node_write_blocked(b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+ if (btree_node_will_make_reachable(b)) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+
+ if (btree_node_dirty(b)) {
+ if (!flush) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
+ }
+
+ if (locked) {
+ /*
+ * Using the underscore version because we don't want to compact
+ * bsets after the write, since this node is about to be evicted
+ * - unless btree verify mode is enabled, since it runs out of
+ * the post write cleanup:
+ */
+ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
+ bch2_btree_node_write(c, b, SIX_LOCK_intent,
+ BTREE_WRITE_cache_reclaim);
+ else
+ __bch2_btree_node_write(c, b,
+ BTREE_WRITE_cache_reclaim);
+ }
+ }
+
+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
(1U << BTREE_NODE_write_in_flight))) {
if (!flush) {
- if (btree_node_dirty(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- else if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
+ if (btree_node_read_in_flight(b))
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++;
else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
+ if (locked)
+ return -EINTR;
+
/* XXX: waiting on IO with btree cache lock held */
bch2_btree_node_wait_on_read(b);
bch2_btree_node_wait_on_write(b);
}
+ return 0;
+}
+
+/*
+ * this version is for btree nodes that have already been freed (we're not
+ * reaping a real btree node)
+ */
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
+{
+ struct btree_cache *bc = &c->btree_cache;
+ int ret = 0;
+
+ lockdep_assert_held(&bc->lock);
+retry_unlocked:
+ ret = __btree_node_reclaim_checks(c, b, flush, false);
+ if (ret)
+ return ret;
+
if (!six_trylock_intent(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent);
- return -BCH_ERR_ENOMEM_btree_node_reclaim;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++;
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
if (!six_trylock_write(&b->c.lock)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write);
- goto out_unlock_intent;
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++;
+ six_unlock_intent(&b->c.lock);
+ return bch_err_throw(c, ENOMEM_btree_node_reclaim);
}
/* recheck under lock */
- if (b->flags & ((1U << BTREE_NODE_read_in_flight)|
- (1U << BTREE_NODE_write_in_flight))) {
- if (!flush) {
- if (btree_node_read_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight);
- else if (btree_node_write_in_flight(b))
- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight);
- goto out_unlock;
- }
+ ret = __btree_node_reclaim_checks(c, b, flush, true);
+ if (ret) {
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-
- if (btree_node_noevict(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(noevict);
- goto out_unlock;
- }
- if (btree_node_write_blocked(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked);
- goto out_unlock;
- }
- if (btree_node_will_make_reachable(b)) {
- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable);
- goto out_unlock;
+ if (ret == -EINTR)
+ goto retry_unlocked;
+ return ret;
}
- if (btree_node_dirty(b)) {
- if (!flush) {
- BTREE_CACHE_NOT_FREED_INCREMENT(dirty);
- goto out_unlock;
- }
- /*
- * Using the underscore version because we don't want to compact
- * bsets after the write, since this node is about to be evicted
- * - unless btree verify mode is enabled, since it runs out of
- * the post write cleanup:
- */
- if (bch2_verify_btree_ondisk)
- bch2_btree_node_write(c, b, SIX_LOCK_intent,
- BTREE_WRITE_cache_reclaim);
- else
- __bch2_btree_node_write(c, b,
- BTREE_WRITE_cache_reclaim);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
- goto wait_on_io;
- }
-out:
if (b->hash_val && !ret)
trace_and_count(c, btree_cache_reap, c, b);
- return ret;
-out_unlock:
- six_unlock_write(&b->c.lock);
-out_unlock_intent:
- six_unlock_intent(&b->c.lock);
- ret = -BCH_ERR_ENOMEM_btree_node_reclaim;
- goto out;
+ return 0;
}
-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter)
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, false, shrinker_counter);
+ return __btree_node_reclaim(c, b, false);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
- return __btree_node_reclaim(c, b, true, false);
+ return __btree_node_reclaim(c, b, true);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
@@ -476,7 +469,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
unsigned long ret = SHRINK_STOP;
bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4;
- if (bch2_btree_shrinker_disabled)
+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
return SHRINK_STOP;
mutex_lock(&bc->lock);
@@ -490,7 +483,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
* IO can always make forward progress:
*/
can_free = btree_cache_can_free(list);
- nr = min_t(unsigned long, nr, can_free);
+ if (nr > can_free) {
+ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free;
+ nr = can_free;
+ }
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
@@ -506,7 +502,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
if (touched >= nr)
goto out;
- if (!btree_node_reclaim(c, b, true)) {
+ if (!btree_node_reclaim(c, b)) {
btree_node_data_free(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -522,9 +518,10 @@ restart:
clear_btree_node_accessed(b);
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
--touched;;
- } else if (!btree_node_reclaim(c, b, true)) {
+ } else if (!btree_node_reclaim(c, b)) {
__bch2_btree_node_hash_remove(bc, b);
- __btree_node_data_free(bc, b);
+ __btree_node_data_free(b);
+ btree_node_to_freedlist(bc, b);
freed++;
bc->nr_freed++;
@@ -569,7 +566,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
{
struct btree_cache_list *list = shrink->private_data;
- if (bch2_btree_shrinker_disabled)
+ if (static_branch_unlikely(&bch2_btree_shrinker_disabled))
return 0;
return btree_cache_can_free(list);
@@ -610,6 +607,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
btree_node_write_in_flight(b));
btree_node_data_free(bc, b);
+ cond_resched();
}
BUG_ON(!bch2_journal_error(&c->journal) &&
@@ -651,9 +649,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
bch2_recalc_btree_reserve(c);
- for (i = 0; i < bc->nr_reserve; i++)
- if (!__bch2_btree_node_mem_alloc(c))
+ for (i = 0; i < bc->nr_reserve; i++) {
+ struct btree *b = __bch2_btree_node_mem_alloc(c);
+ if (!b)
goto err;
+ __bch2_btree_node_to_freelist(bc, b);
+ }
list_splice_init(&bc->live[0].list, &bc->freeable);
@@ -681,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
return 0;
err:
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
@@ -726,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
if (!cl) {
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
+ return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
}
closure_wait(&bc->alloc_wait, cl);
@@ -740,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
}
trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
- return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
+ return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
success:
trace_and_count(c, btree_cache_cannibalize_lock, trans);
@@ -754,7 +755,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
list_for_each_entry_reverse(b, &bc->live[i].list, list)
- if (!btree_node_reclaim(c, b, false))
+ if (!btree_node_reclaim(c, b))
return b;
while (1) {
@@ -789,23 +790,24 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, freed, list)
- if (!btree_node_reclaim(c, b, false)) {
+ if (!btree_node_reclaim(c, b)) {
list_del_init(&b->list);
goto got_node;
}
b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN);
- if (!b) {
+ if (b) {
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT);
+ } else {
mutex_unlock(&bc->lock);
bch2_trans_unlock(trans);
b = __btree_node_mem_alloc(c, GFP_KERNEL);
if (!b)
goto err;
+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
mutex_lock(&bc->lock);
}
- bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0);
-
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
@@ -815,7 +817,7 @@ got_node:
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b2, &bc->freeable, list)
- if (!btree_node_reclaim(c, b2, false)) {
+ if (!btree_node_reclaim(c, b2)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
@@ -850,7 +852,6 @@ out:
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
bch2_btree_keys_init(b);
- set_btree_node_accessed(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
@@ -976,7 +977,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
/* Unlock before doing IO: */
six_unlock_intent(&b->c.lock);
- bch2_trans_unlock_noassert(trans);
+ bch2_trans_unlock(trans);
bch2_btree_node_read(trans, b, sync);
@@ -1002,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
{
struct printbuf buf = PRINTBUF;
- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
+ if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
return;
prt_printf(&buf,
@@ -1284,6 +1285,10 @@ lock_node:
six_unlock_read(&b->c.lock);
goto retry;
}
+
+ /* avoid atomic set bit if it's not needed: */
+ if (!btree_node_accessed(b))
+ set_btree_node_accessed(b);
}
/* XXX: waiting on IO with btree locks held: */
@@ -1299,10 +1304,6 @@ lock_node:
prefetch(p + L1_CACHE_BYTES * 2);
}
- /* avoid atomic set bit if it's not needed: */
- if (!btree_node_accessed(b))
- set_btree_node_accessed(b);
-
if (unlikely(btree_node_read_error(b))) {
six_unlock_read(&b->c.lock);
b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached);
@@ -1415,7 +1416,7 @@ void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "%u", r->level);
else
prt_printf(out, "(unknown)");
- prt_printf(out, "\n ");
+ prt_newline(out);
bch2_bkey_val_to_text(out, c, k);
}
@@ -1491,9 +1492,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
prt_btree_cache_line(out, c, "live:", bc->live[0].nr);
prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr);
- prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable);
+ prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve);
+ prt_btree_cache_line(out, c, "freed:", bc->nr_freeable);
prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty));
- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
+ prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? "held" : "not held");
prt_newline(out);
for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) {
@@ -1504,6 +1506,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc
}
prt_newline(out);
+ prt_printf(out, "counters since mount:\n");
prt_printf(out, "freed:\t%zu\n", bc->nr_freed);
prt_printf(out, "not freed:\n");
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index ca3c1b145330..be275f87a60e 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig
void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
+void __btree_node_data_free(struct btree *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index dd1d9b74076e..bac108e93823 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -22,11 +22,13 @@
#include "debug.h"
#include "disk_accounting.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
+#include "progress.h"
#include "recovery_passes.h"
#include "reflink.h"
#include "recovery.h"
@@ -46,6 +48,27 @@
#define DROP_PREV_NODE 11
#define DID_FILL_FROM_SCAN 12
+/*
+ * Returns true if it's a btree we can easily reconstruct, or otherwise won't
+ * cause data loss if it's missing:
+ */
+static bool btree_id_important(enum btree_id btree)
+{
+ if (btree_id_is_alloc(btree))
+ return false;
+
+ switch (btree) {
+ case BTREE_ID_quotas:
+ case BTREE_ID_snapshot_trees:
+ case BTREE_ID_logged_ops:
+ case BTREE_ID_rebalance_work:
+ case BTREE_ID_subvolume_children:
+ return false;
+ default:
+ return true;
+ }
+}
+
static const char * const bch2_gc_phase_strs[] = {
#define x(n) #n,
GC_PHASES()
@@ -127,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
+ return bch_err_throw(c, ENOMEM_gc_repair_key);
btree_ptr_to_v2(b, new);
b->data->min_key = new_min;
@@ -167,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
if (!new)
- return -BCH_ERR_ENOMEM_gc_repair_key;
+ return bch_err_throw(c, ENOMEM_gc_repair_key);
btree_ptr_to_v2(b, new);
b->data->max_key = new_max;
@@ -212,15 +235,15 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree *
prt_printf(&buf, " at ");
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, ":\n parent: ");
+ prt_printf(&buf, ":\nparent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
if (prev) {
- prt_printf(&buf, "\n prev: ");
+ prt_printf(&buf, "\nprev: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
}
- prt_str(&buf, "\n next: ");
+ prt_str(&buf, "\nnext: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */
@@ -279,12 +302,12 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b,
if (bpos_eq(child->key.k.p, b->key.k.p))
return 0;
- prt_printf(&buf, " at ");
+ prt_printf(&buf, "\nat: ");
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_printf(&buf, ":\n parent: ");
+ prt_printf(&buf, "\nparent: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\n child: ");
+ prt_str(&buf, "\nchild: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key,
@@ -348,21 +371,13 @@ again:
prt_char(&buf, ' ');
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
- trans, btree_node_read_error,
- "Topology repair: unreadable btree node at\n"
- " %s",
- buf.buf)) {
+ if (bch2_err_matches(ret, EIO)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret)
break;
-
- ret = bch2_btree_lost_data(c, b->c.btree_id);
- if (ret)
- break;
continue;
}
@@ -382,7 +397,11 @@ again:
continue;
}
- ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan);
+ ret = lockrestart_do(trans,
+ btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan));
+ if (ret < 0)
+ goto err;
+
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -423,7 +442,8 @@ again:
if (!ret && !IS_ERR_OR_NULL(prev)) {
BUG_ON(cur);
- ret = btree_repair_node_end(trans, b, prev, pulled_from_scan);
+ ret = lockrestart_do(trans,
+ btree_repair_node_end(trans, b, prev, pulled_from_scan));
if (ret == DID_FILL_FROM_SCAN) {
new_pass = true;
ret = 0;
@@ -483,8 +503,14 @@ again:
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ /*
+ * XXX: we're not passing the trans object here because we're not set up
+ * to handle a transaction restart - this code needs to be rewritten
+ * when we start doing online topology repair
+ */
+ bch2_trans_unlock_long(trans);
if (mustfix_fsck_err_on(!have_child,
- trans, btree_node_topology_interior_node_empty,
+ c, btree_node_topology_interior_node_empty,
"empty interior btree node at %s", buf.buf))
ret = DROP_THIS_NODE;
err:
@@ -504,50 +530,72 @@ fsck_err:
bch2_bkey_buf_exit(&prev_k, c);
bch2_bkey_buf_exit(&cur_k, c);
printbuf_exit(&buf);
+ bch_err_fn(c, ret);
return ret;
}
-int bch2_check_topology(struct bch_fs *c)
+static int bch2_check_root(struct btree_trans *trans, enum btree_id btree,
+ bool *reconstructed_root)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct bpos pulled_from_scan = POS_MIN;
+ struct bch_fs *c = trans->c;
+ struct btree_root *r = bch2_btree_id_root(c, btree);
struct printbuf buf = PRINTBUF;
int ret = 0;
- bch2_trans_srcu_unlock(trans);
+ bch2_btree_id_to_text(&buf, btree);
- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
- struct btree_root *r = bch2_btree_id_root(c, i);
- bool reconstructed_root = false;
+ if (r->error) {
+ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
- printbuf_reset(&buf);
- bch2_btree_id_to_text(&buf, i);
+ ret = bch2_btree_has_scanned_nodes(c, btree);
+ if (ret < 0)
+ goto err;
- if (r->error) {
- ret = bch2_btree_lost_data(c, i);
- if (ret)
- break;
-reconstruct_root:
- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
+ if (!ret) {
+ __fsck_err(trans,
+ FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
+ btree_root_unreadable_and_scan_found_nothing,
+ "no nodes found for btree %s, continue?", buf.buf);
r->alive = false;
r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 0);
+ } else {
+ r->alive = false;
+ r->error = 0;
+ bch2_btree_root_alloc_fake_trans(trans, btree, 1);
- if (!bch2_btree_has_scanned_nodes(c, i)) {
- mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing,
- "no nodes found for btree %s, continue?", buf.buf);
- bch2_btree_root_alloc_fake_trans(trans, i, 0);
- } else {
- bch2_btree_root_alloc_fake_trans(trans, i, 1);
- bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
- ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
- if (ret)
- break;
- }
-
- reconstructed_root = true;
+ bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+ ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
+ if (ret)
+ goto err;
}
+ *reconstructed_root = true;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_check_topology(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct bpos pulled_from_scan = POS_MIN;
+ int ret = 0;
+
+ bch2_trans_srcu_unlock(trans);
+
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ bool reconstructed_root = false;
+recover:
+ ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
+ if (ret)
+ break;
+
+ struct btree_root *r = bch2_btree_id_root(c, i);
struct btree *b = r->b;
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
@@ -561,17 +609,21 @@ reconstruct_root:
r->b = NULL;
- if (!reconstructed_root)
- goto reconstruct_root;
+ if (!reconstructed_root) {
+ r->error = -EIO;
+ goto recover;
+ }
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_id_to_text(&buf, i);
bch_err(c, "empty btree root %s", buf.buf);
+ printbuf_exit(&buf);
bch2_btree_root_alloc_fake_trans(trans, i, 0);
r->alive = false;
ret = 0;
}
}
-fsck_err:
- printbuf_exit(&buf);
+
bch2_trans_put(trans);
return ret;
}
@@ -605,13 +657,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
deleted.p = k.k->p;
if (initial) {
- BUG_ON(bch2_journal_seq_verify &&
+ BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
k.k->bversion.lo > atomic64_read(&c->journal.seq));
if (fsck_err_on(btree_id != BTREE_ID_accounting &&
k.k->bversion.lo > atomic64_read(&c->key_version),
trans, bkey_version_in_future,
- "key version number higher than recorded %llu\n %s",
+ "key version number higher than recorded %llu\n%s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
atomic64_set(&c->key_version, k.k->bversion.lo);
@@ -619,7 +671,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
trans, btree_bitmap_not_marked,
- "btree ptr not marked in member info btree allocated bitmap\n %s",
+ "btree ptr not marked in member info btree allocated bitmap\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
@@ -656,7 +708,9 @@ fsck_err:
return ret;
}
-static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial)
+static int bch2_gc_btree(struct btree_trans *trans,
+ struct progress_indicator_state *progress,
+ enum btree_id btree, bool initial)
{
struct bch_fs *c = trans->c;
unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1;
@@ -673,6 +727,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
BTREE_ITER_prefetch);
ret = for_each_btree_key_continue(trans, iter, 0, k, ({
+ bch2_progress_update_iter(trans, progress, &iter, "check_allocations");
gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
}));
@@ -688,7 +743,7 @@ retry_root:
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
0, bch2_btree_id_root(c, btree)->b->c.level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err_root;
@@ -717,22 +772,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
static int bch2_gc_btrees(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- enum btree_id ids[BTREE_ID_NR];
struct printbuf buf = PRINTBUF;
- unsigned i;
int ret = 0;
- for (i = 0; i < BTREE_ID_NR; i++)
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c, ~0ULL);
+
+ enum btree_id ids[BTREE_ID_NR];
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
ids[i] = i;
bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
- for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
unsigned btree = i < BTREE_ID_NR ? ids[i] : i;
if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b))
continue;
- ret = bch2_gc_btree(trans, btree, true);
+ ret = bch2_gc_btree(trans, &progress, btree, true);
}
printbuf_exit(&buf);
@@ -916,7 +973,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
if (ret) {
bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+ ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
break;
}
}
@@ -1015,8 +1072,7 @@ int bch2_check_allocations(struct bch_fs *c)
{
int ret;
- lockdep_assert_held(&c->state_lock);
-
+ down_read(&c->state_lock);
down_write(&c->gc_lock);
bch2_btree_interior_updates_flush(c);
@@ -1054,12 +1110,17 @@ out:
percpu_up_write(&c->mark_lock);
up_write(&c->gc_lock);
+ up_read(&c->state_lock);
/*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
+
+ if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
+ bch2_sb_members_clean_deleted(c);
+
bch_err_fn(c, ret);
return ret;
}
@@ -1070,42 +1131,41 @@ static int gc_btree_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct bkey_i *u;
- int ret;
if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
return -EROFS;
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
+ bool too_stale = false;
+ scoped_guard(rcu) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
- if (dev_ptr_stale(ca, ptr) > 16) {
- rcu_read_unlock();
- goto update;
+ too_stale |= dev_ptr_stale(ca, ptr) > 16;
}
+
+ if (!too_stale)
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ if (!ca)
+ continue;
+
+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+ if (gen_after(*gen, ptr->gen))
+ *gen = ptr->gen;
+ }
}
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
+ if (too_stale) {
+ struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
+ int ret = PTR_ERR_OR_ZERO(u);
+ if (ret)
+ return ret;
- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
- if (gen_after(*gen, ptr->gen))
- *gen = ptr->gen;
+ bch2_extent_normalize(c, bkey_i_to_s(u));
}
- rcu_read_unlock();
- return 0;
-update:
- u = bch2_bkey_make_mut(trans, iter, &k, 0);
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
- bch2_extent_normalize(c, bkey_i_to_s(u));
return 0;
}
@@ -1158,7 +1218,7 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
if (!ca->oldest_gen) {
bch2_dev_put(ca);
- ret = -BCH_ERR_ENOMEM_gc_gens;
+ ret = bch_err_throw(c, ENOMEM_gc_gens);
goto err;
}
@@ -1194,7 +1254,7 @@ int bch2_gc_gens(struct bch_fs *c)
BCH_TRANS_COMMIT_no_enospc, ({
ca = bch2_dev_iterate(c, ca, k.k->p.inode);
if (!ca) {
- bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
continue;
}
bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
@@ -1228,26 +1288,21 @@ static void bch2_gc_gens_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
bch2_gc_gens(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
}
void bch2_gc_gens_async(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
!queue_work(c->write_ref_wq, &c->gc_gens_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
}
-void bch2_fs_btree_gc_exit(struct bch_fs *c)
-{
-}
-
-int bch2_fs_btree_gc_init(struct bch_fs *c)
+void bch2_fs_btree_gc_init_early(struct bch_fs *c)
{
seqcount_init(&c->gc_pos_lock);
INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
- return 0;
}
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
index 9693a90a48a2..ec77662369a2 100644
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
@@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
int bch2_gc_gens(struct bch_fs *);
void bch2_gc_gens_async(struct bch_fs *);
-void bch2_fs_btree_gc_exit(struct bch_fs *);
-int bch2_fs_btree_gc_init(struct bch_fs *);
+void bch2_fs_btree_gc_init_early(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_GC_H */
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index e371e60e3133..590cd29f3e86 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "async_objs.h"
+#include "bkey_buf.h"
#include "bkey_methods.h"
#include "bkey_sort.h"
#include "btree_cache.h"
@@ -12,6 +14,7 @@
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
@@ -40,6 +43,7 @@ void bch2_btree_node_io_unlock(struct btree *b)
clear_btree_node_write_in_flight_inner(b);
clear_btree_node_write_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
@@ -512,21 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
+ bool print_pos,
struct btree *b, struct bset *i, struct bkey_packed *k,
- unsigned offset, int write)
+ unsigned offset, int rw)
{
- prt_printf(out, bch2_log_msg(c, "%s"),
- write == READ
- ? "error validating btree node "
- : "corrupt btree node before write ");
- if (ca)
- prt_printf(out, "on %s ", ca->name);
- prt_printf(out, "at btree ");
- bch2_btree_pos_to_text(out, c, b);
+ if (print_pos) {
+ prt_str(out, rw == READ
+ ? "error validating btree node "
+ : "corrupt btree node before write ");
+ prt_printf(out, "at btree ");
+ bch2_btree_pos_to_text(out, c, b);
+ prt_newline(out);
+ }
- printbuf_indent_add(out, 2);
+ if (ca)
+ prt_printf(out, "%s ", ca->name);
- prt_printf(out, "\nnode offset %u/%u",
+ prt_printf(out, "node offset %u/%u",
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
@@ -537,72 +543,110 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_str(out, ": ");
}
-__printf(10, 11)
+__printf(11, 12)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
struct bkey_packed *k,
- int write,
- bool have_retry,
+ int rw,
enum bch_sb_error_id err_type,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg,
const char *fmt, ...)
{
+ if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
+ return ret == -BCH_ERR_btree_node_read_err_fixable
+ ? bch_err_throw(c, fsck_fix)
+ : ret;
+
+ bool have_retry = false;
+ int ret2;
+
+ if (ca) {
+ bch2_mark_btree_validate_failure(failed, ca->dev_idx);
+
+ struct extent_ptr_decoded pick;
+ have_retry = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ failed, &pick, -1) == 1;
+ }
+
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+ ret = bch_err_throw(c, btree_node_read_err_fixable);
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+ ret = bch_err_throw(c, btree_node_read_err_bad_node);
+
+ bch2_sb_error_count(c, err_type);
+
+ bool print_deferred = err_msg &&
+ rw == READ &&
+ !(test_bit(BCH_FS_in_fsck, &c->flags) &&
+ c->opts.fix_errors == FSCK_FIX_ask);
+
struct printbuf out = PRINTBUF;
- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
- va_list args;
+ bch2_log_msg_start(c, &out);
- btree_err_msg(&out, c, ca, b, i, k, b->written, write);
+ if (!print_deferred)
+ err_msg = &out;
+ btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
+
+ va_list args;
va_start(args, fmt);
- prt_vprintf(&out, fmt, args);
+ prt_vprintf(err_msg, fmt, args);
va_end(args);
- if (write == WRITE) {
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = c->opts.errors == BCH_ON_ERROR_continue
- ? 0
- : -BCH_ERR_fsck_errors_not_fixed;
+ if (print_deferred) {
+ prt_newline(err_msg);
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+ ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
+ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
+ ret = ret2;
+ goto fsck_err;
+ }
+
+ if (!have_retry)
+ ret = bch_err_throw(c, fsck_fix);
+ goto out;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ prt_str(&out, ", ");
+ break;
+ }
+
goto out;
}
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
- ret = -BCH_ERR_btree_node_read_err_fixable;
- if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
- ret = -BCH_ERR_btree_node_read_err_bad_node;
-
- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
- bch2_sb_error_count(c, err_type);
+ if (rw == WRITE) {
+ prt_str(&out, ", ");
+ ret = __bch2_inconsistent_error(c, &out)
+ ? -BCH_ERR_fsck_errors_not_fixed
+ : 0;
+ goto print;
+ }
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
- ret = !silent
- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
- : -BCH_ERR_fsck_fix;
- if (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)
+ ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
+ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
+ ret = ret2;
goto fsck_err;
- ret = -BCH_ERR_fsck_fix;
- break;
- case -BCH_ERR_btree_node_read_err_want_retry:
- case -BCH_ERR_btree_node_read_err_must_retry:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- break;
+ }
+
+ if (!have_retry)
+ ret = bch_err_throw(c, fsck_fix);
+ goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = bch2_topology_error(c);
- break;
- case -BCH_ERR_btree_node_read_err_incompatible:
- if (!silent)
- bch2_print_string_as_lines(KERN_ERR, out.buf);
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ prt_str(&out, ", ");
break;
- default:
- BUG();
}
+print:
+ bch2_print_str(c, KERN_ERR, out.buf);
out:
fsck_err:
printbuf_exit(&out);
@@ -611,16 +655,17 @@ fsck_err:
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
+ int _ret = __btree_err(type, c, ca, b, i, k, write, \
BCH_FSCK_ERR_##_err_type, \
+ failed, err_msg, \
msg, ##__VA_ARGS__); \
\
- if (_ret != -BCH_ERR_fsck_fix) { \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
ret = _ret; \
goto fsck_err; \
} \
\
- *saw_error = true; \
+ true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
@@ -678,11 +723,11 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
- unsigned offset, unsigned sectors,
- int write, bool have_retry, bool *saw_error)
+ unsigned offset, int write,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
int ret = 0;
@@ -695,16 +740,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BCH_VERSION_MAJOR(version),
BCH_VERSION_MINOR(version));
- if (btree_err_on(version < c->sb.version_min,
+ if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
+ btree_err_on(version < c->sb.version_min,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i, NULL,
btree_node_bset_older_than_sb_min,
"bset version %u older than superblock version_min %u",
version, c->sb.version_min)) {
- mutex_lock(&c->sb_lock);
- c->disk_sb.sb->version_min = cpu_to_le16(version);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
+ if (bch2_version_compatible(version)) {
+ mutex_lock(&c->sb_lock);
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ } else {
+ /* We have no idea what's going on: */
+ i->version = cpu_to_le16(c->sb.version);
+ }
}
if (btree_err_on(BCH_VERSION_MAJOR(version) >
@@ -726,15 +777,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
btree_node_unsupported_version,
"BSET_SEPARATE_WHITEOUTS no longer supported");
- if (!write &&
- btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
- -BCH_ERR_btree_node_read_err_fixable,
- c, ca, b, i, NULL,
- bset_past_end_of_btree_node,
- "bset past end of btree node (offset %u len %u but written %zu)",
- offset, sectors, ptr_written ?: btree_sectors(c)))
- i->u64s = 0;
-
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
c, ca, b, i, NULL,
@@ -816,7 +858,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_bad_node,
c, ca, b, i, NULL,
btree_node_bad_format,
- "invalid bkey format: %s\n %s", buf1.buf,
+ "invalid bkey format: %s\n%s", buf1.buf,
(printbuf_reset(&buf2),
bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
printbuf_reset(&buf1);
@@ -892,7 +934,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b,
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
- bool have_retry, bool *saw_error)
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
@@ -996,8 +1039,9 @@ drop_this_key:
}
got_good_key:
le16_add_cpu(&i->u64s, -next_good_key);
- memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
+ memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_error(b);
}
fsck_err:
printbuf_exit(&buf);
@@ -1005,7 +1049,9 @@ fsck_err:
}
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
- struct btree *b, bool have_retry, bool *saw_error)
+ struct btree *b,
+ struct bch_io_failures *failed,
+ struct printbuf *err_msg)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
@@ -1015,11 +1061,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
- unsigned u64s;
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
- int ret = 0, retry_read = 0, write = READ;
+ int ret = 0, write = READ;
u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
@@ -1096,6 +1141,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
if (first) {
+ sectors = vstruct_sectors(b->data, c->block_bits);
+ if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ b->written, sectors, ptr_written ?: btree_sectors(c)))
+ i->u64s = 0;
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
@@ -1123,9 +1176,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
c, NULL, b, NULL, NULL,
btree_node_unsupported_version,
"btree node does not have NEW_EXTENT_OVERWRITE set");
-
- sectors = vstruct_sectors(b->data, c->block_bits);
} else {
+ sectors = vstruct_sectors(bne, c->block_bits);
+ if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+ bset_past_end_of_btree_node,
+ "bset past end of btree node (offset %u len %u but written %zu)",
+ b->written, sectors, ptr_written ?: btree_sectors(c)))
+ i->u64s = 0;
if (good_csum_type) {
struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
bool csum_bad = bch2_crc_cmp(bne->csum, csum);
@@ -1146,22 +1205,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"decrypting btree node: %s", bch2_err_str(ret)))
goto fsck_err;
}
-
- sectors = vstruct_sectors(bne, c->block_bits);
}
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
- ret = validate_bset(c, ca, b, i, b->written, sectors,
- READ, have_retry, saw_error);
+ ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
if (ret)
goto fsck_err;
if (!b->written)
btree_node_set_format(b, b->data->format);
- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
+ ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
if (ret)
goto fsck_err;
@@ -1186,7 +1242,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
le64_to_cpu(i->journal_seq),
b->written, b->written + sectors, ptr_written);
- b->written += sectors;
+ b->written = min(b->written + sectors, btree_sectors(c));
if (blacklisted && !first)
continue;
@@ -1222,29 +1278,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
- set_btree_bset(b, b->set, &b->data->keys);
-
b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
btree_buf_bytes(b) -
sizeof(struct btree_node) -
b->nr.live_u64s * sizeof(u64));
- u64s = le16_to_cpu(sorted->keys.u64s);
+ b->data->keys.u64s = sorted->keys.u64s;
*sorted = *b->data;
- sorted->keys.u64s = cpu_to_le16(u64s);
swap(sorted, b->data);
set_btree_bset(b, b->set, &b->data->keys);
b->nsets = 1;
b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
- BUG_ON(b->nr.live_u64s != u64s);
+ BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
- if (updated_range)
- bch2_btree_node_drop_keys_outside_node(b);
-
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
@@ -1252,7 +1302,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
if (ret == -BCH_ERR_fsck_delete_bkey ||
- (bch2_inject_invalid_keys &&
+ (static_branch_unlikely(&bch2_inject_invalid_keys) &&
!bversion_cmp(u.k->bversion, MAX_VERSION))) {
btree_keys_account_key_drop(&b->nr, 0, k);
@@ -1261,6 +1311,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
(u64 *) vstruct_end(i) - (u64 *) k);
set_btree_bset_end(b, b->set);
set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_error(b);
continue;
}
if (ret)
@@ -1281,31 +1332,55 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
btree_node_reset_sib_u64s(b);
- rcu_read_lock();
- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+ if (updated_range)
+ bch2_btree_node_drop_keys_outside_node(b);
- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
- set_btree_node_need_rewrite(b);
+ /*
+ * XXX:
+ *
+ * We deadlock if too many btree updates require node rewrites while
+ * we're still in journal replay.
+ *
+ * This is because btree node rewrites generate more updates for the
+ * interior updates (alloc, backpointers), and if those updates touch
+ * new nodes and generate more rewrites - well, you see the problem.
+ *
+ * The biggest cause is that we don't use the btree write buffer (for
+ * the backpointer updates - this needs some real thought on locking in
+ * order to fix.
+ *
+ * The problem with this workaround (not doing the rewrite for degraded
+ * nodes in journal replay) is that those degraded nodes persist, and we
+ * don't want that (this is a real bug when a btree node write completes
+ * with fewer replicas than we wanted and leaves a degraded node due to
+ * device _removal_, i.e. the device went away mid write).
+ *
+ * It's less of a bug here, but still a problem because we don't yet
+ * have a way of tracking degraded data - we another index (all
+ * extents/btree nodes, by replicas entry) in order to fix properly
+ * (re-replicate degraded data at the earliest possible time).
+ */
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
+ scoped_guard(rcu)
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
+
+ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
+ set_btree_node_need_rewrite(b);
+ set_btree_node_need_rewrite_degraded(b);
+ }
+ }
}
- rcu_read_unlock();
- if (!ptr_written)
+ if (!ptr_written) {
set_btree_node_need_rewrite(b);
-out:
+ set_btree_node_need_rewrite_ptr_written_zero(b);
+ }
+fsck_err:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
- return retry_read;
-fsck_err:
- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
- ret == -BCH_ERR_btree_node_read_err_must_retry) {
- retry_read = 1;
- } else {
- set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- }
- goto out;
+ return ret;
}
static void btree_node_read_work(struct work_struct *work)
@@ -1317,17 +1392,28 @@ static void btree_node_read_work(struct work_struct *work)
struct btree *b = rb->b;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
+ int ret = 0;
+
struct printbuf buf = PRINTBUF;
- bool saw_error = false;
- bool retry = false;
- bool can_retry;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "btree node read error at btree ");
+ bch2_btree_pos_to_text(&buf, c, b);
+ prt_newline(&buf);
goto start;
while (1) {
- retry = true;
- bch_info(c, "retrying read");
- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
+ ret = bch2_bkey_pick_read_device(c,
+ bkey_i_to_s_c(&b->key),
+ &failed, &rb->pick, -1);
+ if (ret <= 0) {
+ set_btree_node_read_error(b);
+ break;
+ }
+
+ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = ca != NULL;
+ rb->start_time = local_clock();
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_buf_bytes(b);
@@ -1338,60 +1424,66 @@ static void btree_node_read_work(struct work_struct *work)
} else {
bio->bi_status = BLK_STS_REMOVED;
}
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
start:
- printbuf_reset(&buf);
- bch2_btree_pos_to_text(&buf, c, b);
- bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "btree read error %s for %s",
- bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = false;
- bch2_mark_io_failure(&failed, &rb->pick);
-
- can_retry = bch2_bkey_pick_read_device(c,
- bkey_i_to_s_c(&b->key),
- &failed, &rb->pick) > 0;
-
- if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
- if (retry)
- bch_info(c, "retry success");
- break;
+ if (bio->bi_status) {
+ bch2_mark_io_failure(&failed, &rb->pick, false);
+ continue;
}
- saw_error = true;
+ ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
+ if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+ ret == -BCH_ERR_btree_node_read_err_must_retry)
+ continue;
- if (!can_retry) {
+ if (ret)
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
- break;
- }
+
+ break;
}
- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
- rb->start_time);
- bio_put(&rb->bio);
+ bch2_io_failures_to_text(&buf, c, &failed);
+
+ if (btree_node_read_error(b))
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
- if ((saw_error ||
+ /*
+ * only print retry success if we read from a replica with no errors
+ */
+ if (btree_node_read_error(b))
+ prt_printf(&buf, "ret %s", bch2_err_str(ret));
+ else if (failed.nr) {
+ if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
+ prt_printf(&buf, "retry success");
+ else
+ prt_printf(&buf, "repair success");
+ }
+
+ if ((failed.nr ||
btree_node_need_rewrite(b)) &&
!btree_node_read_error(b) &&
- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
- if (saw_error) {
- printbuf_reset(&buf);
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
- __func__, buf.buf);
- }
-
+ c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
+ prt_printf(&buf, " (rewriting node)");
bch2_btree_node_rewrite_async(c, b);
}
+ prt_newline(&buf);
+ if (failed.nr)
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
+
+ async_object_list_del(c, btree_read_bio, rb->list_idx);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+ rb->start_time);
+ bio_put(&rb->bio);
printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
@@ -1400,16 +1492,20 @@ static void btree_node_read_endio(struct bio *bio)
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = rb->have_ioref
+ ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL;
- if (rb->have_ioref) {
- struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
-
- bch2_latency_acct(ca, rb->start_time, READ);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
queue_work(c->btree_read_complete_wq, &rb->work);
}
+void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
+{
+ bch2_bio_to_text(out, &rbio->bio);
+}
+
struct btree_node_read_all {
struct closure cl;
struct bch_fs *c;
@@ -1469,12 +1565,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
- bool have_retry = false;
int ret = 0, best = -1, write = READ;
unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
bool _saw_error = false, *saw_error = &_saw_error;
+ struct printbuf *err_msg = NULL;
+ struct bch_io_failures *failed = NULL;
for (i = 0; i < ra->nr; i++) {
struct btree_node *bn = ra->buf[i];
@@ -1567,14 +1664,19 @@ fsck_err:
if (best >= 0) {
memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
+ ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
} else {
ret = -1;
}
if (ret) {
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
+
+ struct printbuf buf = PRINTBUF;
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+ if (buf.pos)
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
@@ -1588,6 +1690,7 @@ fsck_err:
printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
@@ -1602,6 +1705,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
}
ra->err[rb->idx] = bio->bi_status;
@@ -1623,7 +1728,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
- return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
+ return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
closure_init(&ra->cl, NULL);
ra->c = c;
@@ -1641,7 +1746,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
i = 0;
bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_read_all_replicas);
struct btree_read_bio *rb =
container_of(ra->bio[i], struct btree_read_bio, bio);
rb->c = c;
@@ -1692,33 +1798,42 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
trace_and_count(c, btree_node_read, trans, b);
- if (bch2_verify_all_btree_replicas &&
+ if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
!btree_node_read_all_replicas(c, b, sync))
return;
ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
- NULL, &pick);
+ NULL, &pick, -1);
if (ret <= 0) {
+ bool ratelimit = true;
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
- bch_err_ratelimited(c, "%s", buf.buf);
-
- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
- bch2_fatal_error(c);
+ prt_newline(&buf);
+ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+ bch2_fs_emergency_read_only2(c, &buf))
+ ratelimit = false;
+
+ static DEFINE_RATELIMIT_STATE(rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ if (!ratelimit || __ratelimit(&rs))
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
set_btree_node_read_error(b);
- bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
- printbuf_exit(&buf);
return;
}
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
bio = bio_alloc_bioset(NULL,
buf_pages(b->data, btree_buf_bytes(b)),
@@ -1737,6 +1852,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bio->bi_end_io = btree_node_read_endio;
bch2_bio_map(bio, b->data, btree_buf_bytes(b));
+ async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
+
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
bio_sectors(bio));
@@ -1793,7 +1910,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
- ret = -BCH_ERR_btree_node_read_error;
+ ret = bch_err_throw(c, btree_node_read_error);
goto err;
}
@@ -1811,6 +1928,176 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
}
+struct btree_node_scrub {
+ struct bch_fs *c;
+ struct bch_dev *ca;
+ void *buf;
+ bool used_mempool;
+ unsigned written;
+
+ enum btree_id btree;
+ unsigned level;
+ struct bkey_buf key;
+ __le64 seq;
+
+ struct work_struct work;
+ struct bio bio;
+};
+
+static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written,
+ struct printbuf *err)
+{
+ unsigned written = 0;
+
+ if (le64_to_cpu(data->magic) != bset_magic(c)) {
+ prt_printf(err, "bad magic: want %llx, got %llx",
+ bset_magic(c), le64_to_cpu(data->magic));
+ return false;
+ }
+
+ while (written < (ptr_written ?: btree_sectors(c))) {
+ struct btree_node_entry *bne;
+ struct bset *i;
+ bool first = !written;
+
+ if (first) {
+ bne = NULL;
+ i = &data->keys;
+ } else {
+ bne = (void *) data + (written << 9);
+ i = &bne->keys;
+
+ if (!ptr_written && i->seq != data->keys.seq)
+ break;
+ }
+
+ struct nonce nonce = btree_nonce(i, written << 9);
+ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i));
+
+ if (first) {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data);
+ if (bch2_crc_cmp(data->csum, csum)) {
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum);
+ return false;
+ }
+ }
+
+ written += vstruct_sectors(data, c->block_bits);
+ } else {
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ if (bch2_crc_cmp(bne->csum, csum)) {
+ bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum);
+ return false;
+ }
+ }
+
+ written += vstruct_sectors(bne, c->block_bits);
+ }
+ }
+
+ return true;
+}
+
+static void btree_node_scrub_work(struct work_struct *work)
+{
+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
+ struct bch_fs *c = scrub->c;
+ struct printbuf err = PRINTBUF;
+
+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
+ bkey_i_to_s_c(scrub->key.k));
+ prt_newline(&err);
+
+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
+ int ret = bch2_trans_do(c,
+ bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
+ scrub->key.k, 0));
+ if (!bch2_err_matches(ret, ENOENT) &&
+ !bch2_err_matches(ret, EROFS))
+ bch_err_fn_ratelimited(c, ret);
+ }
+
+ printbuf_exit(&err);
+ bch2_bkey_buf_exit(&scrub->key, c);;
+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
+ enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
+ kfree(scrub);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
+}
+
+static void btree_node_scrub_endio(struct bio *bio)
+{
+ struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio);
+
+ queue_work(scrub->c->btree_read_complete_wq, &scrub->work);
+}
+
+int bch2_btree_node_scrub(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k, unsigned dev)
+{
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
+ return bch_err_throw(c, erofs_no_writes);
+
+ struct extent_ptr_decoded pick;
+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
+ if (ret <= 0)
+ goto err;
+
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_scrub);
+ if (!ca) {
+ ret = bch_err_throw(c, device_offline);
+ goto err;
+ }
+
+ bool used_mempool = false;
+ void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool);
+
+ unsigned vecs = buf_pages(buf, c->opts.btree_node_size);
+
+ struct btree_node_scrub *scrub =
+ kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL);
+ if (!scrub) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ scrub->c = c;
+ scrub->ca = ca;
+ scrub->buf = buf;
+ scrub->used_mempool = used_mempool;
+ scrub->written = btree_ptr_sectors_written(k);
+
+ scrub->btree = btree;
+ scrub->level = level;
+ bch2_bkey_buf_init(&scrub->key);
+ bch2_bkey_buf_reassemble(&scrub->key, c, k);
+ scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq;
+
+ INIT_WORK(&scrub->work, btree_node_scrub_work);
+
+ bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ);
+ bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size);
+ scrub->bio.bi_iter.bi_sector = pick.ptr.offset;
+ scrub->bio.bi_end_io = btree_node_scrub_endio;
+ submit_bio(&scrub->bio);
+ return 0;
+err_free:
+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
+err:
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
+ return ret;
+}
+
static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
@@ -1831,7 +2118,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
bch2_journal_pin_drop(&c->journal, &w->journal);
}
-static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_write *w = btree_prev_write(b);
unsigned long old, new;
@@ -1839,6 +2126,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
bch2_btree_complete_write(c, b, w);
+ if (start_time)
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time);
+
old = READ_ONCE(b->flags);
do {
new = old;
@@ -1865,11 +2155,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type);
- else
+ else {
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+ }
}
-static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time)
{
struct btree_trans *trans = bch2_trans_get(c);
@@ -1877,7 +2169,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
/* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, start_time);
six_unlock_read(&b->c.lock);
}
@@ -1887,6 +2179,7 @@ static void btree_node_write_work(struct work_struct *work)
container_of(work, struct btree_write_bio, work);
struct bch_fs *c = wbio->wbio.c;
struct btree *b = wbio->wbio.bio.bi_private;
+ u64 start_time = wbio->start_time;
int ret = 0;
btree_bounce_free(c,
@@ -1898,7 +2191,7 @@ static void btree_node_write_work(struct work_struct *work)
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
- ret = -BCH_ERR_btree_node_write_all_failed;
+ ret = bch_err_throw(c, btree_node_write_all_failed);
goto err;
}
@@ -1918,13 +2211,20 @@ static void btree_node_write_work(struct work_struct *work)
goto err;
}
out:
+ async_object_list_del(c, btree_write_bio, wbio->list_idx);
bio_put(&wbio->wbio.bio);
- btree_node_write_done(c, b);
+ btree_node_write_done(c, b, start_time);
return;
err:
set_btree_node_noevict(b);
- bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
- "writing btree node: %s", bch2_err_str(ret));
+
+ if (!bch2_err_matches(ret, EROFS)) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
goto out;
}
@@ -1937,23 +2237,34 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct btree *b = wbio->bio.bi_private;
struct bch_dev *ca = wbio->have_ioref ? bch2_dev_have_ref(c, wbio->dev) : NULL;
- unsigned long flags;
- if (wbio->have_ioref)
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
- if (!ca ||
- bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- "btree write error: %s",
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("btree")) {
+ if (ca && bio->bi_status) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "btree write error: %s\n ",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ if (bio->bi_status) {
+ unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
+ /*
+ * XXX: we should be using io_ref[WRITE], but we aren't retrying failed
+ * btree writes yet (due to device removal/ro):
+ */
if (wbio->have_ioref)
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_write);
if (parent) {
bio_put(bio);
@@ -1962,16 +2273,15 @@ static void btree_node_write_endio(struct bio *bio)
}
clear_btree_node_write_in_flight_inner(b);
+ smp_mb__after_atomic();
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
INIT_WORK(&wb->work, btree_node_write_work);
- queue_work(c->btree_io_complete_wq, &wb->work);
+ queue_work(c->btree_write_complete_wq, &wb->work);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
- struct bset *i, unsigned sectors)
+ struct bset *i)
{
- bool saw_error;
-
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
(struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
@@ -1984,8 +2294,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return ret;
}
- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
+ ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
+ validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();
@@ -2023,6 +2333,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
+ u64 start_time = local_clock();
int ret;
if (flags & BTREE_WRITE_ALREADY_STARTED)
@@ -2177,7 +2488,7 @@ do_write:
/* if we're going to be encrypting, check metadata validity first: */
if (validate_before_checksum &&
- validate_bset_for_write(c, b, i, sectors_to_write))
+ validate_bset_for_write(c, b, i))
goto err;
ret = bset_encrypt(c, i, b->written << 9);
@@ -2194,7 +2505,7 @@ do_write:
/* if we're not encrypting, check metadata after checksumming: */
if (!validate_before_checksum &&
- validate_bset_for_write(c, b, i, sectors_to_write))
+ validate_bset_for_write(c, b, i))
goto err;
/*
@@ -2231,6 +2542,7 @@ do_write:
wbio->data = data;
wbio->data_bytes = bytes;
wbio->sector_offset = b->written;
+ wbio->start_time = start_time;
wbio->wbio.c = c;
wbio->wbio.used_mempool = used_mempool;
wbio->wbio.first_btree_write = !b->written;
@@ -2250,6 +2562,8 @@ do_write:
atomic64_inc(&c->btree_write_stats[type].nr);
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+ async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
+
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->btree_write_submit_wq, &wbio->work);
return;
@@ -2258,7 +2572,7 @@ err:
b->written += sectors_to_write;
nowrite:
btree_bounce_free(c, bytes, used_mempool, data);
- __btree_node_write_done(c, b);
+ __btree_node_write_done(c, b, 0);
}
/*
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index 6f9e4a6dacf7..30a5180532c8 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -41,6 +41,9 @@ struct btree_read_bio {
u64 start_time;
unsigned have_ioref:1;
unsigned idx:7;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
struct extent_ptr_decoded pick;
struct work_struct work;
struct bio bio;
@@ -52,6 +55,10 @@ struct btree_write_bio {
void *data;
unsigned data_bytes;
unsigned sector_offset;
+ u64 start_time;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
struct bch_write_bio wbio;
};
@@ -127,11 +134,18 @@ void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
- struct btree *, bool, bool *);
+ struct btree *,
+ struct bch_io_failures *,
+ struct printbuf *);
void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
+void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
+
+int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, unsigned);
+
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
enum btree_write_flags {
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 5988219c6908..f8829b667ad3 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -16,6 +16,7 @@
#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
+#include "super.h"
#include "trace.h"
#include <linux/random.h>
@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
!btree_path_pos_after_node(path, b);
}
-/* Btree iterator: */
+/* Debug: */
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
struct btree_path *path)
{
struct bkey_cached *ck;
@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
btree_node_unlock(trans, path, 0);
}
-static void bch2_btree_path_verify_level(struct btree_trans *trans,
+static void __bch2_btree_path_verify_level(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
struct btree_path_level *l;
@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
struct printbuf buf3 = PRINTBUF;
const char *msg;
- if (!bch2_debug_check_iterators)
- return;
-
l = &path->l[level];
tmp = l->iter;
locked = btree_node_locked(path, level);
if (path->cached) {
if (!level)
- bch2_btree_path_verify_cached(trans, path);
+ __bch2_btree_path_verify_cached(trans, path);
return;
}
@@ -217,7 +213,7 @@ err:
msg, level, buf1.buf, buf2.buf, buf3.buf);
}
-static void bch2_btree_path_verify(struct btree_trans *trans,
+static void __bch2_btree_path_verify(struct btree_trans *trans,
struct btree_path *path)
{
struct bch_fs *c = trans->c;
@@ -229,25 +225,23 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
break;
}
- bch2_btree_path_verify_level(trans, path, i);
+ __bch2_btree_path_verify_level(trans, path, i);
}
- bch2_btree_path_verify_locks(path);
+ bch2_btree_path_verify_locks(trans, path);
}
-void bch2_trans_verify_paths(struct btree_trans *trans)
+void __bch2_trans_verify_paths(struct btree_trans *trans)
{
struct btree_path *path;
unsigned iter;
trans_for_each_path(trans, path, iter)
- bch2_btree_path_verify(trans, path);
+ __bch2_btree_path_verify(trans, path);
}
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_is_extents) &&
@@ -258,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
- bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
+ __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ __bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
{
BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) &&
!iter->pos.snapshot);
@@ -276,16 +270,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
bkey_gt(iter->pos, iter->k.p)));
}
-static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
+static int __bch2_btree_iter_verify_ret(struct btree_trans *trans,
+ struct btree_iter *iter, struct bkey_s_c k)
{
- struct btree_trans *trans = iter->trans;
struct btree_iter copy;
struct bkey_s_c prev;
int ret = 0;
- if (!bch2_debug_check_iterators)
- return 0;
-
if (!(iter->flags & BTREE_ITER_filter_snapshots))
return 0;
@@ -299,7 +290,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k
bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
BTREE_ITER_nopreserve|
BTREE_ITER_all_snapshots);
- prev = bch2_btree_iter_prev(&copy);
+ prev = bch2_btree_iter_prev(trans, &copy);
if (!prev.k)
goto out;
@@ -326,7 +317,7 @@ out:
return ret;
}
-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos)
{
bch2_trans_verify_not_unlocked_or_in_restart(trans);
@@ -359,17 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf);
}
-#else
-
static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
- struct btree_path *path, unsigned l) {}
+ struct btree_path *path, unsigned l)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_path_verify_level(trans, path, l);
+}
+
static inline void bch2_btree_path_verify(struct btree_trans *trans,
- struct btree_path *path) {}
-static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
-static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
+ struct btree_path *path)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_path_verify(trans, path);
+}
-#endif
+static inline void bch2_btree_iter_verify(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_iter_verify(trans, iter);
+}
+
+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_btree_iter_verify_entry_exit(iter);
+}
+
+static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ return static_branch_unlikely(&bch2_debug_check_iterators)
+ ? __bch2_btree_iter_verify_ret(trans, iter, k)
+ : 0;
+}
/* Btree path: fixups after btree updates */
@@ -523,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
__bch2_btree_node_iter_fix(path, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (bch2_debug_check_iterators)
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
bch2_btree_node_iter_verify(node_iter, b);
}
@@ -562,20 +576,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
bch2_btree_node_iter_peek_all(&l->iter, l->b));
}
-static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_path_level *l,
- struct bkey *u)
-{
- struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u,
- bch2_btree_node_iter_peek(&l->iter, l->b));
-
- path->pos = k.k ? k.k->p : l->b->key.k.p;
- trans->paths_sorted = false;
- bch2_btree_path_verify_level(trans, path, l - path->l);
- return k;
-}
-
static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans,
struct btree_path *path,
struct btree_path_level *l,
@@ -890,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
struct btree_path *path,
- unsigned flags,
- struct bkey_buf *out)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_path_level *l = path_l(path);
@@ -915,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
goto err;
}
- bch2_bkey_buf_reassemble(out, c, k);
+ bkey_reassemble(&trans->btree_path_down, k);
if ((flags & BTREE_ITER_prefetch) &&
c->opts.btree_node_prefetch)
@@ -926,6 +925,22 @@ err:
return ret;
}
+static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " within parent node ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key));
+
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return bch_err_throw(c, btree_need_topology_repair);
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -936,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree *b;
unsigned level = path->level - 1;
enum six_lock_type lock_type = __btree_lock_want(path, level);
- struct bkey_buf tmp;
int ret;
EBUG_ON(!btree_node_locked(path, path->level));
- bch2_bkey_buf_init(&tmp);
-
if (unlikely(trans->journal_replay_not_finished)) {
- ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ ret = btree_node_iter_and_journal_peek(trans, path, flags);
if (ret)
- goto err;
+ return ret;
} else {
struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
- if (!k) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "node not found at pos ");
- bch2_bpos_to_text(&buf, path->pos);
- prt_str(&buf, " within parent node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
-
- bch2_fs_fatal_error(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = -BCH_ERR_btree_need_topology_repair;
- goto err;
- }
+ if (unlikely(!k))
+ return btree_node_missing_err(trans, path);
- bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+ bch2_bkey_unpack(l->b, &trans->btree_path_down, k);
- if ((flags & BTREE_ITER_prefetch) &&
+ if (unlikely((flags & BTREE_ITER_prefetch)) &&
c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
- goto err;
+ return ret;
}
}
- b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
+ b = bch2_btree_node_get(trans, path, &trans->btree_path_down,
+ level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
if (unlikely(ret))
- goto err;
+ return ret;
- if (likely(!trans->journal_replay_not_finished &&
- tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
- unlikely(b != btree_node_mem_ptr(tmp.k)))
+ if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) &&
+ likely(!trans->journal_replay_not_finished &&
+ trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2))
btree_node_mem_ptr_set(trans, path, level + 1, b);
if (btree_node_read_locked(path, level + 1))
@@ -991,10 +993,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
path->level = level;
bch2_btree_path_level_init(trans, path, b);
- bch2_btree_path_verify_locks(path);
-err:
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
+ bch2_btree_path_verify_locks(trans, path);
+ return 0;
}
static int bch2_btree_path_traverse_all(struct btree_trans *trans)
@@ -1006,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
int ret = 0;
if (trans->in_traverse_all)
- return -BCH_ERR_transaction_restart_in_traverse_all;
+ return bch_err_throw(c, transaction_restart_in_traverse_all);
trans->in_traverse_all = true;
retry_all:
@@ -1103,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans,
if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
btree_node_unlock(trans, path, l);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
bch2_btree_path_verify(trans, path);
}
@@ -1176,7 +1176,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
}
if (path->cached) {
- ret = bch2_btree_path_traverse_cached(trans, path, flags);
+ ret = bch2_btree_path_traverse_cached(trans, path_idx, flags);
goto out;
}
@@ -1301,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
if (unlikely(path->cached)) {
btree_node_unlock(trans, path, 0);
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
goto out;
}
@@ -1330,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
}
if (unlikely(level != path->level)) {
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
__bch2_btree_path_unlock(trans, path);
}
out:
@@ -1399,45 +1399,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p
void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
- struct btree_path *path = trans->paths + path_idx, *dup;
+ struct btree_path *path = trans->paths + path_idx, *dup = NULL;
if (!__btree_path_put(trans, path, intent))
return;
+ if (!path->preserve && !path->should_be_locked)
+ goto free;
+
dup = path->preserve
? have_path_at_pos(trans, path)
: have_node_at_pos(trans, path);
-
- trace_btree_path_free(trans, path_idx, dup);
-
- if (!dup && !(!path->preserve && !is_btree_node(path, path->level)))
+ if (!dup)
return;
- if (path->should_be_locked && !trans->restarted) {
- if (!dup)
- return;
-
+ /*
+ * If we need this path locked, the duplicate also has te be locked
+ * before we free this one:
+ */
+ if (path->should_be_locked &&
+ !dup->should_be_locked &&
+ !trans->restarted) {
if (!(trans->locked
? bch2_btree_path_relock_norestart(trans, dup)
: bch2_btree_path_can_relock(trans, dup)))
return;
- }
- if (dup) {
- dup->preserve |= path->preserve;
- dup->should_be_locked |= path->should_be_locked;
+ dup->should_be_locked = true;
}
- __bch2_path_free(trans, path_idx);
-}
+ BUG_ON(path->should_be_locked &&
+ !trans->restarted &&
+ trans->locked &&
+ !btree_node_locked(dup, dup->level));
-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
- bool intent)
-{
- if (!__btree_path_put(trans, trans->paths + path, intent))
- return;
-
- __bch2_path_free(trans, path);
+ path->should_be_locked = false;
+ dup->preserve |= path->preserve;
+free:
+ trace_btree_path_free(trans, path_idx, dup);
+ __bch2_path_free(trans, path_idx);
}
void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
@@ -1499,24 +1499,16 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
- for (struct jset_entry *e = trans->journal_entries;
+ for (struct jset_entry *e = btree_trans_journal_entries_start(trans);
e != btree_trans_journal_entries_top(trans);
- e = vstruct_next(e))
+ e = vstruct_next(e)) {
bch2_journal_entry_to_text(buf, trans->c, e);
+ prt_newline(buf);
+ }
printbuf_indent_sub(buf, 2);
}
-noinline __cold
-void bch2_dump_trans_updates(struct btree_trans *trans)
-{
- struct printbuf buf = PRINTBUF;
-
- bch2_trans_updates_to_text(&buf, trans);
- bch2_print_str(trans->c, buf.buf);
- printbuf_exit(&buf);
-}
-
static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
struct btree_path *path = trans->paths + path_idx;
@@ -1613,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort)
__bch2_trans_paths_to_text(&buf, trans, nosort);
bch2_trans_updates_to_text(&buf, trans);
- bch2_print_str(trans->c, buf.buf);
+ bch2_print_str(trans->c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@@ -1757,6 +1749,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
btree_trans_sort_paths(trans);
+ if (intent)
+ locks_want = max(locks_want, level + 1);
+ locks_want = min(locks_want, BTREE_MAX_DEPTH);
+
trans_for_each_path_inorder(trans, path, iter) {
if (__btree_path_cmp(path,
btree_id,
@@ -1771,7 +1767,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
if (path_pos &&
trans->paths[path_pos].cached == cached &&
trans->paths[path_pos].btree_id == btree_id &&
- trans->paths[path_pos].level == level) {
+ trans->paths[path_pos].level == level &&
+ bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) {
trace_btree_path_get(trans, trans->paths + path_pos, &pos);
__btree_path_get(trans, trans->paths + path_pos, intent);
@@ -1803,9 +1800,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
if (!(flags & BTREE_ITER_nopreserve))
path->preserve = true;
- if (path->intent_ref)
- locks_want = max(locks_want, level + 1);
-
/*
* If the path has locks_want greater than requested, we don't downgrade
* it here - on transaction restart because btree node split needs to
@@ -1814,10 +1808,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans,
* a successful transaction commit.
*/
- locks_want = min(locks_want, BTREE_MAX_DEPTH);
- if (locks_want > path->locks_want)
- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
-
return path_idx;
}
@@ -1877,10 +1867,8 @@ hole:
return (struct bkey_s_c) { u, NULL };
}
-void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
+void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
-
if (!iter->path || trans->restarted)
return;
@@ -1892,17 +1880,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter)
/* Btree iterators: */
int __must_check
-__bch2_btree_iter_traverse(struct btree_iter *iter)
+__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ return bch2_btree_path_traverse(trans, iter->path, iter->flags);
}
int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
- int ret;
-
bch2_trans_verify_not_unlocked_or_in_restart(trans);
iter->path = bch2_btree_path_set_pos(trans, iter->path,
@@ -1910,7 +1895,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
iter->flags & BTREE_ITER_intent,
btree_iter_ip_allocated(iter));
- ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
return ret;
@@ -1922,14 +1907,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
/* Iterate across nodes (leaf and interior nodes) */
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -1951,7 +1936,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -1960,26 +1945,26 @@ err:
}
/* Only kept for -tools */
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter)
{
struct btree *b;
- while (b = bch2_btree_iter_peek_node(iter),
+ while (b = bch2_btree_iter_peek_node(trans, iter),
bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return b;
}
-struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
+struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct btree *b = NULL;
int ret;
EBUG_ON(trans->paths[iter->path].cached);
bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
@@ -1994,17 +1979,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
/* got to end? */
if (!btree_path_node(path, path->level + 1)) {
+ path->should_be_locked = false;
btree_path_set_level_up(trans, path);
return NULL;
}
+ /*
+ * We don't correctly handle nodes with extra intent locks here:
+ * downgrade so we don't violate locking invariants
+ */
+ bch2_btree_path_downgrade(trans, path);
+
if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
__bch2_btree_path_unlock(trans, path);
path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
goto err;
}
@@ -2046,7 +2038,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return b;
err:
@@ -2056,7 +2048,7 @@ err:
/* Iterate across keys (in leaf nodes only) */
-inline bool bch2_btree_iter_advance(struct btree_iter *iter)
+inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = iter->k.p;
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2065,11 +2057,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
-inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
+inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter)
{
struct bpos pos = bkey_start_pos(&iter->k);
bool ret = !(iter->flags & BTREE_ITER_all_snapshots
@@ -2078,20 +2070,20 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
if (ret && !(iter->flags & BTREE_ITER_is_extents))
pos = bkey_predecessor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
return ret;
}
static noinline
void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_s_c *k)
+ struct bpos search_key, struct bkey_s_c *k)
{
struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
trans_for_each_update(trans, i)
if (!i->key_cache_already_flushed &&
i->btree_id == iter->btree_id &&
- bpos_le(i->k->k.p, iter->pos) &&
+ bpos_le(i->k->k.p, search_key) &&
bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
iter->k = i->k->k;
*k = bkey_i_to_s_c(i->k);
@@ -2100,6 +2092,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_
static noinline
void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
@@ -2108,7 +2101,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter
trans_for_each_update(trans, i)
if (!i->key_cache_already_flushed &&
i->btree_id == iter->btree_id &&
- bpos_ge(i->k->k.p, path->pos) &&
+ bpos_ge(i->k->k.p, search_key) &&
bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
iter->k = i->k->k;
*k = bkey_i_to_s_c(i->k);
@@ -2130,13 +2123,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_pos,
struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
return bch2_journal_keys_peek_max(trans->c, iter->btree_id,
path->level,
- path->pos,
+ search_pos,
end_pos,
&iter->journal_idx);
}
@@ -2146,7 +2140,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
struct btree_path *path = btree_iter_path(trans, iter);
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos);
if (k) {
iter->k = k->k;
@@ -2159,11 +2153,12 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
static noinline
void btree_trans_peek_journal(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
- bch2_btree_journal_peek(trans, iter,
+ bch2_btree_journal_peek(trans, iter, search_key,
k->k ? k->k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
@@ -2173,13 +2168,14 @@ void btree_trans_peek_journal(struct btree_trans *trans,
static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_key,
struct bpos end_pos)
{
struct btree_path *path = btree_iter_path(trans, iter);
return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id,
path->level,
- path->pos,
+ search_key,
end_pos,
&iter->journal_idx);
}
@@ -2187,12 +2183,13 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans,
static noinline
void btree_trans_peek_prev_journal(struct btree_trans *trans,
struct btree_iter *iter,
+ struct bpos search_key,
struct bkey_s_c *k)
{
struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
- bch2_btree_journal_peek_prev(trans, iter,
- k->k ? k->k->p : path_l(path)->b->key.k.p);
+ bch2_btree_journal_peek_prev(trans, iter, search_key,
+ k->k ? k->k->p : path_l(path)->b->data->min_key);
if (next_journal) {
iter->k = next_journal->k;
@@ -2205,9 +2202,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans,
* bkey_s_c_null:
*/
static noinline
-struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos)
{
- struct btree_trans *trans = iter->trans;
struct bch_fs *c = trans->c;
struct bkey u;
struct bkey_s_c k;
@@ -2253,14 +2250,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
return k;
}
-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
int ret;
EBUG_ON(btree_iter_path(trans, iter)->cached);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
@@ -2270,7 +2267,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
break;
}
@@ -2280,7 +2277,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
@@ -2291,20 +2288,20 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
if (bkey_err(k)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
break;
}
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_journal(trans, iter, &k);
+ btree_trans_peek_journal(trans, iter, search_key, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
- bch2_btree_trans_peek_updates(trans, iter, &k);
+ bch2_btree_trans_peek_updates(trans, iter, search_key, &k);
if (k.k && bkey_deleted(k.k)) {
/*
@@ -2327,27 +2324,42 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
search_key = bpos_successor(l->b->key.k.p);
} else {
/* End of btree: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
}
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
+
+ if (trace___btree_iter_peek_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace___btree_iter_peek(trans->c, buf.buf);
+ }
+
return k;
}
/**
* bch2_btree_iter_peek_max() - returns first key greater than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys less than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key = btree_iter_search_key(iter);
struct bkey_s_c k;
struct bpos iter_pos = iter->pos;
@@ -2357,14 +2369,19 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
+
if (iter->update_path) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
while (1) {
- k = __bch2_btree_iter_peek(iter, search_key);
+ k = __bch2_btree_iter_peek(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
@@ -2388,8 +2405,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
if (iter->update_path &&
!bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
- bch2_path_put_nokeep(trans, iter->update_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, iter->update_path,
+ iter->flags & BTREE_ITER_intent);
iter->update_path = 0;
}
@@ -2478,17 +2495,30 @@ out_no_locked:
if (!(iter->flags & BTREE_ITER_all_snapshots))
iter->pos.snapshot = iter->snapshot;
- ret = bch2_btree_iter_verify_ret(iter, k);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
}
bch2_btree_iter_verify_entry_exit(iter);
+ if (trace_btree_iter_peek_max_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_max(trans->c, buf.buf);
+ }
+
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2496,24 +2526,25 @@ end:
/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek(iter);
+ return bch2_btree_iter_peek(trans, iter);
}
-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key)
+static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos search_key)
{
- struct btree_trans *trans = iter->trans;
struct bkey_s_c k, k2;
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
while (1) {
iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
@@ -2523,7 +2554,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
/* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
k = bkey_s_c_err(ret);
break;
}
@@ -2533,7 +2564,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
- bch2_btree_iter_set_pos(iter, SPOS_MAX);
+ bch2_btree_iter_set_pos(trans, iter, SPOS_MAX);
k = bkey_s_c_null;
break;
}
@@ -2549,20 +2580,20 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
k.k &&
- (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+ (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) {
k = k2;
if (bkey_err(k2)) {
- bch2_btree_iter_set_pos(iter, iter->pos);
+ bch2_btree_iter_set_pos(trans, iter, iter->pos);
break;
}
}
if (unlikely(iter->flags & BTREE_ITER_with_journal))
- btree_trans_peek_prev_journal(trans, iter, &k);
+ btree_trans_peek_prev_journal(trans, iter, search_key, &k);
if (unlikely((iter->flags & BTREE_ITER_with_updates) &&
trans->nr_updates))
- bch2_btree_trans_peek_prev_updates(trans, iter, &k);
+ bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k);
if (likely(k.k && !bkey_deleted(k.k))) {
break;
@@ -2573,28 +2604,33 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru
search_key = bpos_predecessor(path->l[0].b->data->min_key);
} else {
/* Start of btree: */
- bch2_btree_iter_set_pos(iter, POS_MIN);
+ bch2_btree_iter_set_pos(trans, iter, POS_MIN);
k = bkey_s_c_null;
break;
}
}
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
return k;
}
/**
* bch2_btree_iter_peek_prev_min() - returns first key less than or equal to
* iterator's current position
+ * @trans: btree transaction object
* @iter: iterator to peek from
* @end: search limit: returns keys greater than or equal to @end
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end)
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end)
{
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
- !bkey_eq(iter->pos, POS_MAX)) {
+ !bkey_eq(iter->pos, POS_MAX) &&
+ !((iter->flags & BTREE_ITER_is_extents) &&
+ iter->pos.offset == U64_MAX)) {
+
/*
* bkey_start_pos(), for extents, is not monotonically
* increasing until after filtering for snapshots:
@@ -2603,7 +2639,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
* real visible extents - easiest to just use peek_slot() (which
* internally uses peek() for extents)
*/
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter);
if (bkey_err(k))
return k;
@@ -2613,17 +2649,22 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
return k;
}
- struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
struct bkey_s_c k;
btree_path_idx_t saved_path = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
+ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
+
+ int ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out_no_locked;
+ }
while (1) {
- k = __bch2_btree_iter_peek_prev(iter, search_key);
+ k = __bch2_btree_iter_peek_prev(trans, iter, search_key);
if (unlikely(!k.k))
goto end;
if (unlikely(bkey_err(k)))
@@ -2637,7 +2678,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
* the last possible snapshot overwrite, return
* it:
*/
- bch2_path_put_nokeep(trans, iter->path,
+ bch2_path_put(trans, iter->path,
iter->flags & BTREE_ITER_intent);
iter->path = saved_path;
saved_path = 0;
@@ -2667,8 +2708,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
* our previous saved candidate:
*/
if (saved_path) {
- bch2_path_put_nokeep(trans, saved_path,
- iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, saved_path,
+ iter->flags & BTREE_ITER_intent);
saved_path = 0;
}
@@ -2711,13 +2752,26 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
iter->pos.snapshot = iter->snapshot;
out_no_locked:
if (saved_path)
- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent);
+ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent);
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
+
+ if (trace_btree_iter_peek_prev_min_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_prev_min(trans->c, buf.buf);
+ }
return k;
end:
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
k = bkey_s_c_null;
goto out_no_locked;
}
@@ -2725,37 +2779,45 @@ end:
/**
* bch2_btree_iter_prev() - returns first key less than iterator's current
* position
+ * @trans: btree transaction object
* @iter: iterator to peek from
*
* Returns: key if found, or an error extractable with bkey_err().
*/
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_prev(iter);
+ return bch2_btree_iter_peek_prev(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- struct btree_trans *trans = iter->trans;
struct bpos search_key;
struct bkey_s_c k;
int ret;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
- bch2_btree_iter_verify(iter);
+ bch2_btree_iter_verify(trans, iter);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret)) {
+ k = bkey_s_c_err(ret);
+ goto out;
+ }
+
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
- if (iter->pos.inode == KEY_INODE_MAX)
- return bkey_s_c_null;
+ if (iter->pos.inode == KEY_INODE_MAX) {
+ k = bkey_s_c_null;
+ goto out2;
+ }
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
}
search_key = btree_iter_search_key(iter);
@@ -2766,12 +2828,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
- goto out_no_locked;
+ goto out;
}
struct btree_path *path = btree_iter_path(trans, iter);
- if (unlikely(!btree_path_node(path, path->level)))
- return bkey_s_c_null;
+ if (unlikely(!btree_path_node(path, path->level))) {
+ k = bkey_s_c_null;
+ goto out2;
+ }
+
+ btree_path_set_should_be_locked(trans, path);
if ((iter->flags & BTREE_ITER_cached) ||
!(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) {
@@ -2789,16 +2855,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out;
if (unlikely(iter->flags & BTREE_ITER_with_key_cache) &&
- (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+ (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) {
if (!bkey_err(k))
iter->k = *k.k;
/* We're not returning a key from iter->path: */
- goto out_no_locked;
+ goto out;
}
- k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
+ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k);
if (unlikely(!k.k))
- goto out_no_locked;
+ goto out;
if (unlikely(k.k->type == KEY_TYPE_whiteout &&
(iter->flags & BTREE_ITER_filter_snapshots) &&
@@ -2816,8 +2882,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_intent) {
struct btree_iter iter2;
- bch2_trans_copy_iter(&iter2, iter);
- k = bch2_btree_iter_peek_max(&iter2, end);
+ bch2_trans_copy_iter(trans, &iter2, iter);
+ k = bch2_btree_iter_peek_max(trans, &iter2, end);
if (k.k && !bkey_err(k)) {
swap(iter->key_cache_path, iter2.key_cache_path);
@@ -2828,15 +2894,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
} else {
struct bpos pos = iter->pos;
- k = bch2_btree_iter_peek_max(iter, end);
+ k = bch2_btree_iter_peek_max(trans, iter, end);
if (unlikely(bkey_err(k)))
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
else
iter->pos = pos;
}
if (unlikely(bkey_err(k)))
- goto out_no_locked;
+ goto out;
next = k.k ? bkey_start_pos(k.k) : POS_MAX;
@@ -2858,42 +2924,53 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter));
-out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
- bch2_btree_iter_verify(iter);
- ret = bch2_btree_iter_verify_ret(iter, k);
+ bch2_btree_iter_verify(trans, iter);
+ ret = bch2_btree_iter_verify_ret(trans, iter, k);
if (unlikely(ret))
- return bkey_s_c_err(ret);
+ k = bkey_s_c_err(ret);
+out2:
+ if (trace_btree_iter_peek_slot_enabled()) {
+ CLASS(printbuf, buf)();
+
+ int ret = bkey_err(k);
+ if (ret)
+ prt_str(&buf, bch2_err_str(ret));
+ else if (k.k)
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ else
+ prt_str(&buf, "(null)");
+ trace_btree_iter_peek_slot(trans->c, buf.buf);
+ }
return k;
}
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_advance(iter))
+ if (!bch2_btree_iter_advance(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter)
{
- if (!bch2_btree_iter_rewind(iter))
+ if (!bch2_btree_iter_rewind(trans, iter))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
/* Obsolete, but still used by rust wrapper in -tools */
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter)
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter)
{
struct bkey_s_c k;
- while (btree_trans_too_many_iters(iter->trans) ||
- (k = bch2_btree_iter_peek_type(iter, iter->flags),
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(trans, iter, iter->flags),
bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(iter->trans);
+ bch2_trans_begin(trans);
return k;
}
@@ -2926,7 +3003,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans)
struct btree_path *path, *prev = NULL;
struct trans_for_each_path_inorder_iter iter;
- if (!bch2_debug_check_iterators)
+ if (!static_branch_unlikely(&bch2_debug_check_iterators))
return;
trans_for_each_path_inorder(trans, path, iter) {
@@ -3028,7 +3105,7 @@ static inline void btree_path_list_add(struct btree_trans *trans,
void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
{
if (iter->update_path)
- bch2_path_put_nokeep(trans, iter->update_path,
+ bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
if (iter->path)
bch2_path_put(trans, iter->path,
@@ -3039,7 +3116,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
iter->path = 0;
iter->update_path = 0;
iter->key_cache_path = 0;
- iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -3079,10 +3155,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
BUG_ON(iter->min_depth != depth);
}
-void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_copy_iter(struct btree_trans *trans,
+ struct btree_iter *dst, struct btree_iter *src)
{
- struct btree_trans *trans = src->trans;
-
*dst = *src;
#ifdef TRACK_PATH_ALLOCATED
dst->ip_allocated = _RET_IP_;
@@ -3094,7 +3169,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
dst->key_cache_path = 0;
}
-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *out,
+ darray_trans_kmalloc_trace *trace)
+{
+ printbuf_tabstops_reset(out);
+ printbuf_tabstop_push(out, 60);
+
+ darray_for_each(*trace, i)
+ prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes);
+}
+#endif
+
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip)
{
struct bch_fs *c = trans->c;
unsigned new_top = trans->mem_top + size;
@@ -3104,51 +3191,66 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
void *new_mem;
void *p;
- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) {
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n",
+ BTREE_TRANS_MEM_MAX);
- struct btree_transaction_stats *s = btree_trans_stats(trans);
- s->max_mem = max(s->max_mem, new_bytes);
+ bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+#endif
+ }
- if (trans->used_mempool) {
- if (trans->mem_bytes >= new_bytes)
- goto out_change_top;
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (ret)
+ return ERR_PTR(ret);
- /* No more space from mempool item, need malloc new one */
- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
- if (unlikely(!new_mem)) {
- bch2_trans_unlock(trans);
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ if (new_bytes > s->max_mem) {
+ mutex_lock(&s->lock);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
+ s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
+ trans->trans_kmalloc_trace.nr);
+
+ memcpy(s->trans_kmalloc_trace.data,
+ trans->trans_kmalloc_trace.data,
+ sizeof(s->trans_kmalloc_trace.data[0]) *
+ s->trans_kmalloc_trace.nr);
+#endif
+ s->max_mem = new_bytes;
+ mutex_unlock(&s->lock);
+ }
- new_mem = kmalloc(new_bytes, GFP_KERNEL);
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) {
+ EBUG_ON(trans->mem_bytes >= new_bytes);
+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ }
- ret = bch2_trans_relock(trans);
- if (ret) {
- kfree(new_mem);
- return ERR_PTR(ret);
- }
- }
- memcpy(new_mem, trans->mem, trans->mem_top);
- trans->used_mempool = false;
- mempool_free(trans->mem, &c->btree_trans_mem_pool);
- goto out_new_mem;
+ if (old_bytes) {
+ trans->realloc_bytes_required = new_bytes;
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ return ERR_PTR(btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
- new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ EBUG_ON(trans->mem);
+
+ new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+ new_mem = kmalloc(new_bytes, GFP_KERNEL);
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
- memcpy(new_mem, trans->mem, trans->mem_top);
trans->used_mempool = true;
- kfree(trans->mem);
}
- if (!new_mem)
- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+ EBUG_ON(!new_mem);
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
@@ -3157,15 +3259,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
if (ret)
return ERR_PTR(ret);
}
-out_new_mem:
+
trans->mem = new_mem;
trans->mem_bytes = new_bytes;
- if (old_bytes) {
- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
- return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
- }
-out_change_top:
p = trans->mem + trans->mem_top;
trans->mem_top += size;
memset(p, 0, size);
@@ -3225,7 +3322,27 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->restart_count++;
trans->mem_top = 0;
- trans->journal_entries = NULL;
+
+ if (trans->restarted == BCH_ERR_transaction_restart_mem_realloced) {
+ EBUG_ON(!trans->mem || !trans->mem_bytes);
+ unsigned new_bytes = trans->realloc_bytes_required;
+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+ if (unlikely(!new_mem)) {
+ bch2_trans_unlock(trans);
+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+
+ EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX);
+
+ if (!new_mem) {
+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_bytes = BTREE_TRANS_MEM_MAX;
+ trans->used_mempool = true;
+ kfree(trans->mem);
+ }
+ }
+ trans->mem = new_mem;
+ trans->mem_bytes = new_bytes;
+ }
trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
@@ -3271,6 +3388,18 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->last_begin_ip = _RET_IP_;
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (trans->restarted) {
+ trans->restart_count_this_trans++;
+ } else {
+ trans->restart_count_this_trans = 0;
+ }
+#endif
+
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ trans->trans_kmalloc_trace.nr = 0;
+#endif
+
trans_set_locked(trans, false);
if (trans->restarted) {
@@ -3371,7 +3500,6 @@ got_trans:
}
trans->nr_paths_max = s->nr_max_paths;
- trans->journal_entries_size = s->journal_entries_size;
}
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
@@ -3383,29 +3511,45 @@ got_trans:
return trans;
}
-static void check_btree_paths_leaked(struct btree_trans *trans)
-{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bch_fs *c = trans->c;
+
+static bool btree_paths_leaked(struct btree_trans *trans)
+{
struct btree_path *path;
unsigned i;
trans_for_each_path(trans, path, i)
if (path->ref)
- goto leaked;
- return;
-leaked:
- bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path, i)
- if (path->ref)
- printk(KERN_ERR " btree %s %pS\n",
- bch2_btree_id_str(path->btree_id),
- (void *) path->ip_allocated);
- /* Be noisy about this: */
- bch2_fatal_error(c);
-#endif
+ return true;
+ return false;
}
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+ if (btree_paths_leaked(trans)) {
+ struct bch_fs *c = trans->c;
+ struct btree_path *path;
+ unsigned i;
+
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn);
+ trans_for_each_path(trans, path, i)
+ if (path->ref)
+ prt_printf(&buf, "btree %s %pS\n",
+ bch2_btree_id_str(path->btree_id),
+ (void *) path->ip_allocated);
+
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+#else
+static inline void check_btree_paths_leaked(struct btree_trans *trans) {}
+#endif
+
void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
@@ -3440,6 +3584,9 @@ void bch2_trans_put(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
#endif
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_exit(&trans->trans_kmalloc_trace);
+#endif
unsigned long *paths_allocated = trans->paths_allocated;
trans->paths_allocated = NULL;
@@ -3486,13 +3633,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
struct btree_bkey_cached_common *b)
{
struct six_lock_count c = six_lock_counts(&b->lock);
- struct task_struct *owner;
pid_t pid;
- rcu_read_lock();
- owner = READ_ONCE(b->lock.owner);
- pid = owner ? owner->pid : 0;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ struct task_struct *owner = READ_ONCE(b->lock.owner);
+ pid = owner ? owner->pid : 0;
+ }
prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b');
bch2_btree_id_to_text(out, b->btree_id);
@@ -3521,7 +3667,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
out->atomic++;
struct btree_path *paths = rcu_dereference(trans->paths);
@@ -3564,7 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
}
out:
--out->atomic;
- rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3594,6 +3739,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_exit(&s->trans_kmalloc_trace);
+#endif
kfree(s->max_paths_text);
bch2_time_stats_exit(&s->lock_hold_times);
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index b9538e6e6d65..09dd3e52622e 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -9,7 +9,6 @@
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
-void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
static inline int __bkey_err(const struct bkey *k)
@@ -47,9 +46,11 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path
return --path->ref == 0;
}
-static inline void btree_path_set_dirty(struct btree_path *path,
+static inline void btree_path_set_dirty(struct btree_trans *trans,
+ struct btree_path *path,
enum btree_path_uptodate u)
{
+ BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
path->uptodate = max_t(unsigned, path->uptodate, u);
}
@@ -286,14 +287,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex
: __bch2_trans_mutex_lock(trans, lock);
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_trans_verify_paths(struct btree_trans *);
-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
-#else
-static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
- struct bpos pos) {}
-#endif
+/* Debug: */
+
+void __bch2_trans_verify_paths(struct btree_trans *);
+void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_trans_verify_paths(trans);
+}
+
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree,
+ struct bpos pos)
+{
+ if (static_branch_unlikely(&bch2_debug_check_iterators))
+ __bch2_assert_pos_locked(trans, btree, pos);
+}
void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct btree *, struct bkey_packed *);
@@ -335,13 +345,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra
}
__always_inline
-static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip)
{
BUG_ON(err <= 0);
BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart));
trans->restarted = err;
trans->last_restarted_ip = ip;
+ return -err;
+}
+
+__always_inline
+static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip)
+{
+ btree_trans_restart_foreign_task(trans, err, ip);
#ifdef CONFIG_BCACHEFS_DEBUG
darray_exit(&trans->last_restarted_trace);
bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT);
@@ -355,6 +372,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err)
return btree_trans_restart_ip(trans, err, _THIS_IP_);
}
+static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
+{
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
+ trace_and_count(trans->c, trans_restart_injected, trans, ip);
+ return btree_trans_restart_ip(trans,
+ BCH_ERR_transaction_restart_fault_inject, ip);
+ }
+#endif
+ return 0;
+}
+
bool bch2_btree_node_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
@@ -375,36 +404,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct
void bch2_trans_node_drop(struct btree_trans *trans, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
-int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
-int __must_check bch2_btree_iter_traverse(struct btree_iter *);
+int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
+int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *);
-struct btree *bch2_btree_iter_next_node(struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *);
+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *);
+struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos);
-struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *);
-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans,
+ struct btree_iter *iter)
{
- return bch2_btree_iter_peek_max(iter, SPOS_MAX);
+ return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX);
}
-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos);
+struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos);
-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
+static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter)
{
- return bch2_btree_iter_peek_prev_min(iter, POS_MIN);
+ return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN);
}
-struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *);
-bool bch2_btree_iter_advance(struct btree_iter *);
-bool bch2_btree_iter_rewind(struct btree_iter *);
+bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *);
+bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *);
static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
@@ -415,10 +445,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
iter->k.size = 0;
}
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void bch2_btree_iter_set_pos(struct btree_trans *trans,
+ struct btree_iter *iter, struct bpos new_pos)
{
- struct btree_trans *trans = iter->trans;
-
if (unlikely(iter->update_path))
bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
@@ -436,13 +465,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
iter->pos = bkey_start_pos(&iter->k);
}
-static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
+static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter, u32 snapshot)
{
struct bpos pos = iter->pos;
iter->snapshot = snapshot;
pos.snapshot = snapshot;
- bch2_btree_iter_set_pos(iter, pos);
+ bch2_btree_iter_set_pos(trans, iter, pos);
}
void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
@@ -484,7 +514,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
- iter->trans = trans;
iter->update_path = 0;
iter->key_cache_path = 0;
iter->btree_id = btree_id;
@@ -521,47 +550,77 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans,
void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
-void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *);
-void bch2_set_btree_iter_dontneed(struct btree_iter *);
+void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *);
-void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+void bch2_trans_kmalloc_trace_to_text(struct printbuf *,
+ darray_trans_kmalloc_trace *);
+#endif
-/**
- * bch2_trans_kmalloc - allocate memory for use by the current transaction
- *
- * Must be called after bch2_trans_begin, which on second and further calls
- * frees all memory allocated in this transaction
- */
-static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long);
+
+static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size,
+ unsigned long ip)
+{
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_push(&trans->trans_kmalloc_trace,
+ ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size }));
+#endif
+}
+
+static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size,
+ unsigned long ip)
{
size = roundup(size, 8);
+ bch2_trans_kmalloc_trace(trans, size, ip);
+
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
trans->mem_top += size;
- memset(p, 0, size);
return p;
} else {
- return __bch2_trans_kmalloc(trans, size);
+ return __bch2_trans_kmalloc(trans, size, ip);
}
}
-static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size,
+ unsigned long ip)
{
- size = round_up(size, 8);
+ size = roundup(size, 8);
+
+ bch2_trans_kmalloc_trace(trans, size, ip);
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
trans->mem_top += size;
+ memset(p, 0, size);
return p;
} else {
- return __bch2_trans_kmalloc(trans, size);
+ return __bch2_trans_kmalloc(trans, size, ip);
}
}
+/**
+ * bch2_trans_kmalloc - allocate memory for use by the current transaction
+ *
+ * Must be called after bch2_trans_begin, which on second and further calls
+ * frees all memory allocated in this transaction
+ */
+static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+ return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_);
+}
+
+static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
+{
+ return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_);
+}
+
static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
struct btree_iter *iter,
unsigned btree_id, struct bpos pos,
@@ -570,7 +629,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans,
struct bkey_s_c k;
bch2_trans_iter_init(trans, iter, btree_id, pos, flags);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
if (!bkey_err(k) && type && k.k->type != type)
k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch);
@@ -640,14 +699,14 @@ u32 bch2_trans_begin(struct btree_trans *);
int _ret3 = 0; \
do { \
_ret3 = lockrestart_do((_trans), ({ \
- struct btree *_b = bch2_btree_iter_peek_node(&_iter); \
+ struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\
if (!_b) \
break; \
\
PTR_ERR_OR_ZERO(_b) ?: (_do); \
})) ?: \
lockrestart_do((_trans), \
- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \
+ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\
} while (!_ret3); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
@@ -659,31 +718,34 @@ u32 bch2_trans_begin(struct btree_trans *);
__for_each_btree_node(_trans, _iter, _btree_id, _start, \
0, 0, _flags, _b, _do)
-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans,
+ struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek_prev(iter);
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
+ bch2_btree_iter_peek_prev(trans, iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans,
+ struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) :
- bch2_btree_iter_peek(iter);
+ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) :
+ bch2_btree_iter_peek(trans, iter);
}
-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
+static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bpos end,
+ unsigned flags)
{
if (!(flags & BTREE_ITER_slots))
- return bch2_btree_iter_peek_max(iter, end);
+ return bch2_btree_iter_peek_max(trans, iter, end);
if (bkey_gt(iter->pos, end))
return bkey_s_c_null;
- return bch2_btree_iter_peek_slot(iter);
+ return bch2_btree_iter_peek_slot(trans, iter);
}
int __bch2_btree_trans_too_many_iters(struct btree_trans *);
@@ -739,7 +801,7 @@ transaction_restart: \
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
- _ret2 ?: trans_was_restarted(_trans, _restart_count); \
+ _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
})
#define for_each_btree_key_max_continue(_trans, _iter, \
@@ -750,14 +812,14 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \
_end, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -795,14 +857,14 @@ transaction_restart: \
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \
(_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -832,37 +894,38 @@ transaction_restart: \
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *,
+ struct btree_iter *);
#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \
_start, _end, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+ bch2_btree_iter_advance(_trans, &(_iter)))
-#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\
+#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\
for (; \
- (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \
+ (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+ bch2_btree_iter_advance(_trans, &(_iter)))
#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_rewind(&(_iter)))
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(_trans, &(_iter)))
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \
+ for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret)
/*
* This should not be used in a fastpath, without first trying _do in
@@ -900,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
_p; \
})
-#define bch2_trans_run(_c, _do) \
-({ \
- struct btree_trans *trans = bch2_trans_get(_c); \
- int _ret = (_do); \
- bch2_trans_put(trans); \
- _ret; \
-})
-
-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
-
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
@@ -927,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *);
__bch2_trans_get(_c, trans_fn_idx); \
})
+/*
+ * We don't use DEFINE_CLASS() because using a function for the constructor
+ * breaks bch2_trans_get()'s use of __func__
+ */
+typedef struct btree_trans * class_btree_trans_t;
+static inline void class_btree_trans_destructor(struct btree_trans **p)
+{
+ struct btree_trans *trans = *p;
+ bch2_trans_put(trans);
+}
+
+#define class_btree_trans_constructor(_c) bch2_trans_get(_c)
+
+#define bch2_trans_run(_c, _do) \
+({ \
+ CLASS(btree_trans, trans)(_c); \
+ (_do); \
+})
+
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
void bch2_fs_btree_iter_exit(struct bch_fs *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 6d25e3f85ce8..ea839560a136 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -137,12 +137,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b
struct journal_key *k;
BUG_ON(*idx > keys->nr);
+
+ if (!keys->nr)
+ return NULL;
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
- while (*idx &&
- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ while (*idx < keys->nr &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) {
(*idx)++;
iters++;
if (iters == 10) {
@@ -151,18 +154,23 @@ search:
}
}
+ if (*idx == keys->nr)
+ --(*idx);
+
struct bkey_i *ret = NULL;
rcu_read_lock(); /* for overwritten_ranges */
- while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+ while (true) {
+ k = idx_to_key(keys, *idx);
if (__journal_key_cmp(btree_id, level, end_pos, k) > 0)
break;
if (k->overwritten) {
if (k->overwritten_range)
- *idx = rcu_dereference(k->overwritten_range)->start - 1;
- else
- *idx -= 1;
+ *idx = rcu_dereference(k->overwritten_range)->start;
+ if (!*idx)
+ break;
+ --(*idx);
continue;
}
@@ -171,6 +179,8 @@ search:
break;
}
+ if (!*idx)
+ break;
--(*idx);
iters++;
if (iters == 10) {
@@ -288,11 +298,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
.size = max_t(size_t, keys->size, 8) * 2,
};
- new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+ new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL);
if (!new_keys.data) {
bch_err(c, "%s: error allocating new key array (size %zu)",
__func__, new_keys.size);
- return -BCH_ERR_ENOMEM_journal_key_insert;
+ return bch_err_throw(c, ENOMEM_journal_key_insert);
}
/* Since @keys was full, there was no gap: */
@@ -331,7 +341,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_journal_key_insert;
+ return bch_err_throw(c, ENOMEM_journal_key_insert);
bkey_copy(n, k);
ret = bch2_journal_key_insert_take(c, id, level, n);
@@ -457,11 +467,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct bkey_s_c ret = bkey_s_c_null;
-
journal_iter_verify(iter);
- rcu_read_lock();
+ guard(rcu)();
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
@@ -470,19 +478,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
break;
BUG_ON(cmp);
- if (!k->overwritten) {
- ret = bkey_i_to_s_c(k->k);
- break;
- }
+ if (!k->overwritten)
+ return bkey_i_to_s_c(k->k);
if (k->overwritten_range)
iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end);
else
bch2_journal_iter_advance(iter);
}
- rcu_read_unlock();
- return ret;
+ return bkey_s_c_null;
}
static void bch2_journal_iter_exit(struct journal_iter *iter)
@@ -646,10 +651,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
{
const struct journal_key *l = _l;
const struct journal_key *r = _r;
+ int rewind = l->rewind && r->rewind ? -1 : 1;
return journal_key_cmp(l, r) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
+ ((cmp_int(l->journal_seq, r->journal_seq) ?:
+ cmp_int(l->journal_offset, r->journal_offset)) * rewind);
}
void bch2_journal_keys_put(struct bch_fs *c)
@@ -687,7 +693,8 @@ void bch2_journal_keys_put(struct bch_fs *c)
static void __journal_keys_sort(struct journal_keys *keys)
{
- sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
+ sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]),
+ journal_sort_key_cmp, NULL);
cond_resched();
@@ -717,6 +724,8 @@ int bch2_journal_keys_sort(struct bch_fs *c)
struct journal_keys *keys = &c->journal_keys;
size_t nr_read = 0;
+ u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX;
+
genradix_for_each(&c->journal_entries, iter, _i) {
i = *_i;
@@ -725,28 +734,43 @@ int bch2_journal_keys_sort(struct bch_fs *c)
cond_resched();
- for_each_jset_key(k, entry, &i->j) {
- struct journal_key n = (struct journal_key) {
- .btree_id = entry->btree_id,
- .level = entry->level,
- .k = k,
- .journal_seq = le64_to_cpu(i->j.seq),
- .journal_offset = k->_data - i->j._data,
- };
-
- if (darray_push(keys, n)) {
- __journal_keys_sort(keys);
-
- if (keys->nr * 8 > keys->size * 7) {
- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
- keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
- return -BCH_ERR_ENOMEM_journal_keys_sort;
+ vstruct_for_each(&i->j, entry) {
+ bool rewind = !entry->level &&
+ !btree_id_is_alloc(entry->btree_id) &&
+ le64_to_cpu(i->j.seq) >= rewind_seq;
+
+ if (entry->type != (rewind
+ ? BCH_JSET_ENTRY_overwrite
+ : BCH_JSET_ENTRY_btree_keys))
+ continue;
+
+ if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start)
+ continue;
+
+ jset_entry_for_each_key(entry, k) {
+ struct journal_key n = (struct journal_key) {
+ .btree_id = entry->btree_id,
+ .level = entry->level,
+ .rewind = rewind,
+ .k = k,
+ .journal_seq = le64_to_cpu(i->j.seq),
+ .journal_offset = k->_data - i->j._data,
+ };
+
+ if (darray_push(keys, n)) {
+ __journal_keys_sort(keys);
+
+ if (keys->nr * 8 > keys->size * 7) {
+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+ return bch_err_throw(c, ENOMEM_journal_keys_sort);
+ }
+
+ BUG_ON(darray_push(keys, n));
}
- BUG_ON(darray_push(keys, n));
+ nr_read++;
}
-
- nr_read++;
}
}
diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h
index 8b773823704f..86aacb254fb2 100644
--- a/fs/bcachefs/btree_journal_iter_types.h
+++ b/fs/bcachefs/btree_journal_iter_types.h
@@ -11,8 +11,9 @@ struct journal_key {
u32 journal_offset;
enum btree_id btree_id:8;
unsigned level:8;
- bool allocated;
- bool overwritten;
+ bool allocated:1;
+ bool overwritten:1;
+ bool rewind:1;
struct journal_key_range_overwritten __rcu *
overwritten_range;
struct bkey_i *k;
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index c378b97ebeca..d96188b92db2 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -101,8 +101,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu
kmem_cache_free(bch2_key_cache, ck);
}
-static void bkey_cached_free(struct btree_key_cache *bc,
- struct bkey_cached *ck)
+static inline void bkey_cached_free_noassert(struct btree_key_cache *bc,
+ struct bkey_cached *ck)
{
kfree(ck->k);
ck->k = NULL;
@@ -116,6 +116,19 @@ static void bkey_cached_free(struct btree_key_cache *bc,
this_cpu_inc(*bc->nr_pending);
}
+static void bkey_cached_free(struct btree_trans *trans,
+ struct btree_key_cache *bc,
+ struct bkey_cached *ck)
+{
+ /*
+ * we'll hit strange issues in the SRCU code if we aren't holding an
+ * SRCU read lock...
+ */
+ EBUG_ON(!trans->srcu_held);
+
+ bkey_cached_free_noassert(bc, ck);
+}
+
static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp)
{
gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE;
@@ -156,7 +169,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k
}
if (ck) {
- bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0);
+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL);
ck->c.cached = true;
goto lock;
}
@@ -174,27 +187,23 @@ lock:
static struct bkey_cached *
bkey_cached_reuse(struct btree_key_cache *c)
{
- struct bucket_table *tbl;
+
+ guard(rcu)();
+ struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table);
struct rhash_head *pos;
struct bkey_cached *ck;
- unsigned i;
- rcu_read_lock();
- tbl = rht_dereference_rcu(c->table.tbl, &c->table);
- for (i = 0; i < tbl->size; i++)
+ for (unsigned i = 0; i < tbl->size; i++)
rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bkey_cached_lock_for_evict(ck)) {
if (bkey_cached_evict(c, ck))
- goto out;
+ return ck;
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
}
}
- ck = NULL;
-out:
- rcu_read_unlock();
- return ck;
+ return NULL;
}
static int btree_key_cache_create(struct btree_trans *trans,
@@ -229,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
if (unlikely(!ck)) {
bch_err(c, "error allocating memory for key cache item, btree %s",
bch2_btree_id_str(ck_path->btree_id));
- return -BCH_ERR_ENOMEM_btree_key_cache_create;
+ return bch_err_throw(c, ENOMEM_btree_key_cache_create);
}
}
@@ -247,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans,
if (unlikely(!new_k)) {
bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(ck->key.btree_id), key_u64s);
- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
+ ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill);
} else if (ret) {
kfree(new_k);
goto err;
@@ -281,16 +290,31 @@ static int btree_key_cache_create(struct btree_trans *trans,
ck_path->uptodate = BTREE_ITER_UPTODATE;
return 0;
err:
- bkey_cached_free(bc, ck);
+ bkey_cached_free(trans, bc, ck);
mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED);
return ret;
}
+static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans,
+ struct btree_path *ck_path,
+ struct bkey_s_c k)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, ck_path->pos);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, trans->c, k);
+ trace_key_cache_fill(trans, buf.buf);
+ printbuf_exit(&buf);
+}
+
static noinline int btree_key_cache_fill(struct btree_trans *trans,
- struct btree_path *ck_path,
+ btree_path_idx_t ck_path_idx,
unsigned flags)
{
+ struct btree_path *ck_path = trans->paths + ck_path_idx;
+
if (flags & BTREE_ITER_cached_nofill) {
ck_path->l[0].b = NULL;
return 0;
@@ -306,12 +330,13 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
BTREE_ITER_key_cache_fill|
BTREE_ITER_cached_nofill);
iter.flags &= ~BTREE_ITER_with_journal;
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
/* Recheck after btree lookup, before allocating: */
+ ck_path = trans->paths + ck_path_idx;
ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0;
if (unlikely(ret))
goto out;
@@ -320,28 +345,22 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans,
if (ret)
goto err;
- if (trace_key_cache_fill_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bpos_to_text(&buf, ck_path->pos);
- prt_char(&buf, ' ');
- bch2_bkey_val_to_text(&buf, trans->c, k);
- trace_key_cache_fill(trans, buf.buf);
- printbuf_exit(&buf);
- }
+ if (trace_key_cache_fill_enabled())
+ do_trace_key_cache_fill(trans, ck_path, k);
out:
/* We're not likely to need this iterator again: */
- bch2_set_btree_iter_dontneed(&iter);
+ bch2_set_btree_iter_dontneed(trans, &iter);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static inline int btree_path_traverse_cached_fast(struct btree_trans *trans,
- struct btree_path *path)
+ btree_path_idx_t path_idx)
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck;
+ struct btree_path *path = trans->paths + path_idx;
retry:
ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
if (!ck)
@@ -367,27 +386,32 @@ retry:
return 0;
}
-int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+int bch2_btree_path_traverse_cached(struct btree_trans *trans,
+ btree_path_idx_t path_idx,
unsigned flags)
{
- EBUG_ON(path->level);
-
- path->l[1].b = NULL;
+ EBUG_ON(trans->paths[path_idx].level);
int ret;
do {
- ret = btree_path_traverse_cached_fast(trans, path);
+ ret = btree_path_traverse_cached_fast(trans, path_idx);
if (unlikely(ret == -ENOENT))
- ret = btree_key_cache_fill(trans, path, flags);
+ ret = btree_key_cache_fill(trans, path_idx, flags);
} while (ret == -EEXIST);
+ struct btree_path *path = trans->paths + path_idx;
+
if (unlikely(ret)) {
path->uptodate = BTREE_ITER_NEED_TRAVERSE;
if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
btree_node_unlock(trans, path, 0);
path->l[0].b = ERR_PTR(ret);
}
+ } else {
+ BUG_ON(path->uptodate);
+ BUG_ON(!path->nodes_locked);
}
+
return ret;
}
@@ -412,7 +436,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
BTREE_ITER_intent);
b_iter.flags &= ~BTREE_ITER_with_key_cache;
- ret = bch2_btree_iter_traverse(&c_iter);
+ ret = bch2_btree_iter_traverse(trans, &c_iter);
if (ret)
goto out;
@@ -444,7 +468,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
!test_bit(JOURNAL_space_low, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter);
+ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter);
ret = bkey_err(btree_k);
if (ret)
goto err;
@@ -496,7 +520,7 @@ evict:
mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
if (bkey_cached_evict(&c->btree_key_cache, ck)) {
- bkey_cached_free(&c->btree_key_cache, ck);
+ bkey_cached_free(trans, &c->btree_key_cache, ck);
} else {
six_unlock_write(&ck->c.lock);
six_unlock_intent(&ck->c.lock);
@@ -610,7 +634,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
}
bkey_cached_evict(bc, ck);
- bkey_cached_free(bc, ck);
+ bkey_cached_free(trans, bc, ck);
mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
@@ -618,10 +642,17 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
unsigned i;
trans_for_each_path(trans, path2, i)
if (path2->l[0].b == (void *) ck) {
+ /*
+ * It's safe to clear should_be_locked here because
+ * we're evicting from the key cache, and we still have
+ * the underlying btree locked: filling into the key
+ * cache would require taking a write lock on the btree
+ * node
+ */
+ path2->should_be_locked = false;
__bch2_btree_path_unlock(trans, path2);
path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop);
- path2->should_be_locked = false;
- btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE);
}
bch2_trans_verify_locks(trans);
@@ -678,7 +709,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
} else if (!bkey_cached_lock_for_evict(ck)) {
bc->skipped_lock_fail++;
} else if (bkey_cached_evict(bc, ck)) {
- bkey_cached_free(bc, ck);
+ bkey_cached_free_noassert(bc, ck);
bc->freed++;
freed++;
} else {
@@ -748,7 +779,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
rcu_read_unlock();
mutex_lock(&bc->table.mutex);
mutex_unlock(&bc->table.mutex);
- rcu_read_lock();
continue;
}
for (i = 0; i < tbl->size; i++)
@@ -792,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->nr_pending = alloc_percpu(size_t);
if (!bc->nr_pending)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) ||
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
bc->table_init_done = true;
shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
if (!shrink)
- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
bc->shrink = shrink;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index 51d6289b8dee..82d8c72512a9 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -40,8 +40,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *,
struct bkey_cached *
bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
-int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
- unsigned);
+int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned);
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
struct btree_insert_entry *);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 10b805a60f52..bed2b4b6ffb9 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -1,15 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_cache.h"
#include "btree_locking.h"
#include "btree_types.h"
static struct lock_class_key bch2_btree_node_lock_key;
void bch2_btree_lock_init(struct btree_bkey_cached_common *b,
- enum six_lock_init_flags flags)
+ enum six_lock_init_flags flags,
+ gfp_t gfp)
{
- __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags);
+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp);
lockdep_set_notrack_class(&b->lock);
}
@@ -90,10 +92,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
- struct task_struct *task = i->trans->locking_wait.task;
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", task ?task->pid : 0);
+ prt_printf(out, "%u ", task ? task->pid : 0);
}
prt_newline(out);
}
@@ -171,7 +173,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
trace_would_deadlock(g, i->trans);
- return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ return btree_trans_restart_foreign_task(i->trans,
+ BCH_ERR_transaction_restart_would_deadlock,
+ _THIS_IP_);
} else {
i->trans->lock_must_abort = true;
wake_up_process(i->trans->locking_wait.task);
@@ -190,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans)
return 3;
}
+static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
+{
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+
+ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
+ struct btree_trans *trans = i->trans;
+
+ bch2_btree_trans_to_text(&buf, trans);
+
+ prt_printf(&buf, "backtrace:\n");
+ printbuf_indent_add(&buf, 2);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
+ printbuf_indent_sub(&buf, 2);
+ prt_newline(&buf);
+ }
+
+ bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ BUG();
+}
+
static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
struct trans_waiting_for_lock *from)
{
@@ -215,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
}
}
- if (unlikely(!best)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
-
- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
-
- for (i = g->g; i < g->g + g->nr; i++) {
- struct btree_trans *trans = i->trans;
-
- bch2_btree_trans_to_text(&buf, trans);
-
- prt_printf(&buf, "backtrace:\n");
- printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
- printbuf_indent_sub(&buf, 2);
- prt_newline(&buf);
- }
-
- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- BUG();
- }
+ if (unlikely(!best))
+ break_cycle_fail(g);
ret = abort_lock(g, abort);
out:
@@ -251,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
struct printbuf *cycle)
{
struct btree_trans *orig_trans = g->g->trans;
- struct trans_waiting_for_lock *i;
- for (i = g->g; i < g->g + g->nr; i++)
+ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
if (i->trans == trans) {
closure_put(&trans->ref);
return break_cycle(g, cycle, i);
}
- if (g->nr == ARRAY_SIZE(g->g)) {
+ if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
closure_put(&trans->ref);
if (orig_trans->lock_may_not_fail)
@@ -304,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
lock_graph_down(&g, trans);
/* trans->paths is rcu protected vs. freeing */
- rcu_read_lock();
+ guard(rcu)();
if (cycle)
cycle->atomic++;
next:
@@ -402,7 +409,6 @@ up:
out:
if (cycle)
--cycle->atomic;
- rcu_read_unlock();
return ret;
}
@@ -447,13 +453,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
/* relock */
-static inline bool btree_path_get_locks(struct btree_trans *trans,
- struct btree_path *path,
- bool upgrade,
- struct get_locks_fail *f)
+static int btree_path_get_locks(struct btree_trans *trans,
+ struct btree_path *path,
+ bool upgrade,
+ struct get_locks_fail *f,
+ int restart_err)
{
unsigned l = path->level;
- int fail_idx = -1;
do {
if (!btree_path_node(path, l))
@@ -461,39 +467,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
if (!(upgrade
? bch2_btree_node_upgrade(trans, path, l)
- : bch2_btree_node_relock(trans, path, l))) {
- fail_idx = l;
-
- if (f) {
- f->l = l;
- f->b = path->l[l].b;
- }
- }
+ : bch2_btree_node_relock(trans, path, l)))
+ goto err;
l++;
} while (l < path->locks_want);
+ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+ path->uptodate = BTREE_ITER_UPTODATE;
+
+ return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1;
+err:
+ if (f) {
+ f->l = l;
+ f->b = path->l[l].b;
+ }
+
+ /*
+ * Do transaction restart before unlocking, so we don't pop
+ * should_be_locked asserts
+ */
+ if (restart_err) {
+ btree_trans_restart(trans, restart_err);
+ } else if (path->should_be_locked && !trans->restarted) {
+ if (upgrade)
+ path->locks_want = l;
+ return -1;
+ }
+
+ __bch2_btree_path_unlock(trans, path);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
+
/*
* When we fail to get a lock, we have to ensure that any child nodes
* can't be relocked so bch2_btree_path_traverse has to walk back up to
* the node that we failed to relock:
*/
- if (fail_idx >= 0) {
- __bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
-
- do {
- path->l[fail_idx].b = upgrade
- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
- --fail_idx;
- } while (fail_idx >= 0);
- }
-
- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
- path->uptodate = BTREE_ITER_UPTODATE;
+ do {
+ path->l[l].b = upgrade
+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+ } while (l--);
- return path->uptodate < BTREE_ITER_NEED_RELOCK;
+ return -restart_err ?: -1;
}
bool __bch2_btree_node_relock(struct btree_trans *trans,
@@ -580,7 +596,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
l++) {
if (!bch2_btree_node_relock(trans, path, l)) {
__bch2_btree_path_unlock(trans, path);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
}
@@ -592,9 +608,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
__flatten
bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
- struct get_locks_fail f;
-
- bool ret = btree_path_get_locks(trans, path, false, &f);
+ bool ret = !btree_path_get_locks(trans, path, false, NULL, 0);
bch2_trans_verify_locks(trans);
return ret;
}
@@ -610,27 +624,37 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
return 0;
}
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want,
- struct get_locks_fail *f)
+bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
{
- EBUG_ON(path->locks_want >= new_locks_want);
-
path->locks_want = new_locks_want;
- bool ret = btree_path_get_locks(trans, path, true, f);
- bch2_trans_verify_locks(trans);
+ /*
+ * If we need it locked, we can't touch it. Otherwise, we can return
+ * success - bch2_path_get() will use this path, and it'll just be
+ * retraversed:
+ */
+ bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) ||
+ !path->should_be_locked;
+
+ bch2_btree_path_verify_locks(trans, path);
return ret;
}
-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
- struct btree_path *path,
- unsigned new_locks_want,
- struct get_locks_fail *f)
+int __bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
{
- bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
- if (ret)
+ unsigned old_locks = path->nodes_locked;
+ unsigned old_locks_want = path->locks_want;
+
+ path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
+
+ struct get_locks_fail f = {};
+ int ret = btree_path_get_locks(trans, path, true, &f,
+ BCH_ERR_transaction_restart_upgrade);
+ if (!ret)
goto out;
/*
@@ -662,9 +686,30 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->btree_id == path->btree_id &&
linked->locks_want < new_locks_want) {
linked->locks_want = new_locks_want;
- btree_path_get_locks(trans, linked, true, NULL);
+ btree_path_get_locks(trans, linked, true, NULL, 0);
}
}
+
+ count_event(trans->c, trans_restart_upgrade);
+ if (trace_trans_restart_upgrade_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
+ prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, "locks want %u -> %u level %u\n",
+ old_locks_want, new_locks_want, f.l);
+ prt_printf(&buf, "nodes_locked %x -> %x\n",
+ old_locks, path->nodes_locked);
+ prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
+ !f.b ? "(null)" : "(node)");
+ prt_printf(&buf, "path seq %u node seq %u\n",
+ IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
+ path->l[f.l].lock_seq);
+
+ trace_trans_restart_upgrade(trans->c, buf.buf);
+ printbuf_exit(&buf);
+ }
out:
bch2_trans_verify_locks(trans);
return ret;
@@ -696,7 +741,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
}
}
- bch2_btree_path_verify_locks(path);
+ bch2_btree_path_verify_locks(trans, path);
trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
@@ -725,8 +770,8 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans)
__bch2_btree_path_unlock(trans, path);
}
-static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
- struct get_locks_fail *f, bool trace)
+static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+ struct get_locks_fail *f, bool trace, ulong ip)
{
if (!trace)
goto out;
@@ -735,7 +780,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
struct printbuf buf = PRINTBUF;
bch2_bpos_to_text(&buf, path->pos);
- prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
+ prt_printf(&buf, " %s l=%u seq=%u node seq=",
+ bch2_btree_id_str(path->btree_id),
+ f->l, path->l[f->l].lock_seq);
if (IS_ERR_OR_NULL(f->b)) {
prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
} else {
@@ -749,7 +796,7 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
}
- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ trace_trans_restart_relock(trans, ip, buf.buf);
printbuf_exit(&buf);
}
@@ -757,10 +804,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
out:
__bch2_trans_unlock(trans);
bch2_trans_verify_locks(trans);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
+static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip)
{
bch2_trans_verify_locks(trans);
@@ -774,10 +820,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
trans_for_each_path(trans, path, i) {
struct get_locks_fail f;
+ int ret;
if (path->should_be_locked &&
- !btree_path_get_locks(trans, path, false, &f))
- return bch2_trans_relock_fail(trans, path, &f, trace);
+ (ret = btree_path_get_locks(trans, path, false, &f,
+ BCH_ERR_transaction_restart_relock))) {
+ bch2_trans_relock_fail(trans, path, &f, trace, ip);
+ return ret;
+ }
}
trans_set_locked(trans, true);
@@ -788,26 +838,19 @@ out:
int bch2_trans_relock(struct btree_trans *trans)
{
- return __bch2_trans_relock(trans, true);
+ return __bch2_trans_relock(trans, true, _RET_IP_);
}
int bch2_trans_relock_notrace(struct btree_trans *trans)
{
- return __bch2_trans_relock(trans, false);
+ return __bch2_trans_relock(trans, false, _RET_IP_);
}
-void bch2_trans_unlock_noassert(struct btree_trans *trans)
+void bch2_trans_unlock(struct btree_trans *trans)
{
- __bch2_trans_unlock(trans);
-
trans_set_unlocked(trans);
-}
-void bch2_trans_unlock(struct btree_trans *trans)
-{
__bch2_trans_unlock(trans);
-
- trans_set_unlocked(trans);
}
void bch2_trans_unlock_long(struct btree_trans *trans)
@@ -839,32 +882,28 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
/* Debug */
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void bch2_btree_path_verify_locks(struct btree_path *path)
+void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path)
{
- /*
- * A path may be uptodate and yet have nothing locked if and only if
- * there is no node at path->level, which generally means we were
- * iterating over all nodes and got to the end of the btree
- */
- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
- btree_path_node(path, path->level) &&
- !path->nodes_locked);
+ if (!path->nodes_locked && btree_path_node(path, path->level)) {
+ /*
+ * A path may be uptodate and yet have nothing locked if and only if
+ * there is no node at path->level, which generally means we were
+ * iterating over all nodes and got to the end of the btree
+ */
+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
+ BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
+ }
if (!path->nodes_locked)
return;
for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
int want = btree_lock_want(path, l);
- int have = btree_node_locked_type(path, l);
+ int have = btree_node_locked_type_nowrite(path, l);
BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
- BUG_ON(is_btree_node(path, l) &&
- (want == BTREE_NODE_UNLOCKED ||
- have != BTREE_NODE_WRITE_LOCKED) &&
- want != have);
+ BUG_ON(is_btree_node(path, l) && want != have);
BUG_ON(btree_node_locked(path, l) &&
path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
@@ -882,7 +921,7 @@ static bool bch2_trans_locked(struct btree_trans *trans)
return false;
}
-void bch2_trans_verify_locks(struct btree_trans *trans)
+void __bch2_trans_verify_locks(struct btree_trans *trans)
{
if (!trans->locked) {
BUG_ON(bch2_trans_locked(trans));
@@ -893,7 +932,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans)
unsigned i;
trans_for_each_path(trans, path, i)
- bch2_btree_path_verify_locks(path);
+ __bch2_btree_path_verify_locks(trans, path);
}
-
-#endif
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index b54ef48eb8cc..f2173a3316f4 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -13,9 +13,8 @@
#include "btree_iter.h"
#include "six.h"
-void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
-void bch2_trans_unlock_noassert(struct btree_trans *);
void bch2_trans_unlock_write(struct btree_trans *);
static inline bool is_btree_node(struct btree_path *path, unsigned l)
@@ -44,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path,
return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
}
+static inline int btree_node_locked_type_nowrite(struct btree_path *path,
+ unsigned level)
+{
+ int have = btree_node_locked_type(path, level);
+ return have == BTREE_NODE_WRITE_LOCKED
+ ? BTREE_NODE_INTENT_LOCKED
+ : have;
+}
+
static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
{
return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
@@ -152,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path)
static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
struct btree_path *path)
{
- btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK);
while (path->nodes_locked)
btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
@@ -367,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
EBUG_ON(btree_node_locked(path, level) &&
- !btree_node_write_locked(path, level) &&
- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
+ btree_node_locked_type_nowrite(path, level) !=
+ __btree_lock_want(path, level));
return likely(btree_node_locked(path, level)) ||
(!IS_ERR_OR_NULL(path->l[level].b) &&
@@ -377,31 +385,29 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
- struct btree_path *, unsigned,
- struct get_locks_fail *);
+bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned);
-bool __bch2_btree_path_upgrade(struct btree_trans *,
- struct btree_path *, unsigned,
- struct get_locks_fail *);
+static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+{
+ return new_locks_want > path->locks_want
+ ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want)
+ : true;
+}
+
+int __bch2_btree_path_upgrade(struct btree_trans *,
+ struct btree_path *, unsigned);
static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- struct get_locks_fail f = {};
- unsigned old_locks_want = path->locks_want;
-
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
- if (path->locks_want < new_locks_want
- ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
- : path->nodes_locked)
- return 0;
-
- trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
- old_locks_want, new_locks_want, &f);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
+ return likely(path->locks_want >= new_locks_want && path->nodes_locked)
+ ? 0
+ : __bch2_btree_path_upgrade(trans, path, new_locks_want);
}
/* misc: */
@@ -411,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st
EBUG_ON(!btree_node_locked(path, path->level));
EBUG_ON(path->uptodate);
- path->should_be_locked = true;
- trace_btree_path_should_be_locked(trans, path);
+ if (!path->should_be_locked) {
+ path->should_be_locked = true;
+ trace_btree_path_should_be_locked(trans, path);
+ }
}
static inline void __btree_path_set_level_up(struct btree_trans *trans,
@@ -427,7 +435,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans,
struct btree_path *path)
{
__btree_path_set_level_up(trans, path, path->level++);
- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
}
/* debug */
@@ -439,12 +447,20 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_path_verify_locks(struct btree_path *);
-void bch2_trans_verify_locks(struct btree_trans *);
-#else
-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
-#endif
+void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *);
+void __bch2_trans_verify_locks(struct btree_trans *);
+
+static inline void bch2_btree_path_verify_locks(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
+ __bch2_btree_path_verify_locks(trans, path);
+}
+
+static inline void bch2_trans_verify_locks(struct btree_trans *trans)
+{
+ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
+ __bch2_trans_verify_locks(trans);
+}
#endif /* _BCACHEFS_BTREE_LOCKING_H */
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index a7f06deee13c..a3fb07c60e25 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -13,6 +13,7 @@
#include <linux/kthread.h>
#include <linux/min_heap.h>
+#include <linux/sched/sysctl.h>
#include <linux/sort.h>
struct find_btree_nodes_worker {
@@ -74,39 +75,6 @@ static inline u64 bkey_journal_seq(struct bkey_s_c k)
}
}
-static bool found_btree_node_is_readable(struct btree_trans *trans,
- struct found_btree_node *f)
-{
- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
-
- found_btree_node_to_key(&tmp.k, f);
-
- struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
- bool ret = !IS_ERR_OR_NULL(b);
- if (!ret)
- return ret;
-
- f->sectors_written = b->written;
- f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
-
- struct bkey_s_c k;
- struct bkey unpacked;
- struct btree_node_iter iter;
- for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
- f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
-
- six_unlock_read(&b->c.lock);
-
- /*
- * We might update this node's range; if that happens, we need the node
- * to be re-read so the read path can trim keys that are no longer in
- * this node
- */
- if (b != btree_node_root(trans->c, b))
- bch2_btree_node_evict(trans, &tmp.k);
- return ret;
-}
-
static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
{
const struct found_btree_node *l = _l;
@@ -158,25 +126,31 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
};
static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
- struct bio *bio, struct btree_node *bn, u64 offset)
+ struct btree *b, struct bio *bio, u64 offset)
{
struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+ struct btree_node *bn = b->data;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
- bch2_bio_map(bio, bn, PAGE_SIZE);
+ bch2_bio_map(bio, b->data, c->opts.block_size);
+ u64 submit_time = local_clock();
submit_bio_wait(bio);
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
- "IO error in try_read_btree_node() at %llu: %s",
- offset, bch2_blk_status_to_str(bio->bi_status)))
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
+ "IO error in try_read_btree_node() at %llu: %s",
+ offset, bch2_blk_status_to_str(bio->bi_status));
return;
+ }
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
- if (!c->chacha20)
+ if (!c->chacha20_key_set)
return;
struct nonce nonce = btree_nonce(&bn->keys, 0);
@@ -210,7 +184,28 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
};
rcu_read_unlock();
- if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+ bch2_bio_map(bio, b->data, c->opts.btree_node_size);
+
+ submit_time = local_clock();
+ submit_bio_wait(bio);
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ found_btree_node_to_key(&b->key, &n);
+
+ CLASS(printbuf, buf)();
+ if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
+ /* read_done will swap out b->data for another buffer */
+ bn = b->data;
+ /*
+ * Grab journal_seq here because we want the max journal_seq of
+ * any bset; read_done sorts down to a single set and picks the
+ * max journal_seq
+ */
+ n.journal_seq = le64_to_cpu(bn->keys.journal_seq),
+ n.sectors_written = b->written;
+
mutex_lock(&f->lock);
if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
bch_err(c, "try_read_btree_node() can't handle endian conversion");
@@ -230,12 +225,20 @@ static int read_btree_nodes_worker(void *p)
struct find_btree_nodes_worker *w = p;
struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
struct bch_dev *ca = w->ca;
- void *buf = (void *) __get_free_page(GFP_KERNEL);
- struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
unsigned long last_print = jiffies;
+ struct btree *b = NULL;
+ struct bio *bio = NULL;
+
+ b = __bch2_btree_node_mem_alloc(c);
+ if (!b) {
+ bch_err(c, "read_btree_nodes_worker: error allocating buf");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
- if (!buf || !bio) {
- bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+ bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
+ if (!bio) {
+ bch_err(c, "read_btree_nodes_worker: error allocating bio");
w->f->ret = -ENOMEM;
goto err;
}
@@ -259,12 +262,14 @@ static int read_btree_nodes_worker(void *p)
!bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
continue;
- try_read_btree_node(w->f, ca, bio, buf, sector);
+ try_read_btree_node(w->f, ca, b, bio, sector);
}
err:
+ if (b)
+ __btree_node_data_free(b);
+ kfree(b);
bio_put(bio);
- free_page((unsigned long) buf);
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
closure_put(w->cl);
kfree(w);
return 0;
@@ -278,37 +283,37 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
- struct task_struct *t;
-
if (!w) {
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
ret = -ENOMEM;
goto err;
}
- percpu_ref_get(&ca->io_ref);
- closure_get(&cl);
w->cl = &cl;
w->f = f;
w->ca = ca;
- t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
ret = PTR_ERR_OR_ZERO(t);
if (ret) {
- percpu_ref_put(&ca->io_ref);
- closure_put(&cl);
- f->ret = ret;
- bch_err(c, "error starting kthread: %i", ret);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
break;
}
+
+ closure_get(&cl);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ wake_up_process(t);
}
err:
- closure_sync(&cl);
+ while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
+ ;
return f->ret ?: ret;
}
@@ -356,6 +361,8 @@ static int handle_overwrites(struct bch_fs *c,
min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
}
}
+
+ cond_resched();
}
return 0;
@@ -388,10 +395,10 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
}
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
dst = 0;
darray_for_each(f->nodes, i) {
@@ -411,13 +418,13 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
}
f->nodes.nr = dst;
- sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
if (0 && c->opts.verbose) {
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
}
swap(nodes_heap, f->nodes);
@@ -463,7 +470,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
printbuf_reset(&buf);
prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
found_btree_nodes_to_text(&buf, c, f->nodes);
- bch2_print_string_as_lines(KERN_INFO, buf.buf);
+ bch2_print_str(c, KERN_INFO, buf.buf);
} else {
bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
}
@@ -512,8 +519,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
return false;
}
-bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
{
+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
struct found_btree_node search = {
.btree_id = btree,
.level = 0,
@@ -534,7 +545,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
struct find_btree_nodes *f = &c->found_btree_nodes;
- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
return ret;
@@ -572,10 +583,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
found_btree_node_to_key(&tmp.k, &n);
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
- bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
- printbuf_exit(&buf);
+ if (c->opts.verbose) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
(struct bkey_validate_context) {
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
index 08687b209787..66e6f9ed19d0 100644
--- a/fs/bcachefs/btree_node_scan.h
+++ b/fs/bcachefs/btree_node_scan.h
@@ -4,7 +4,7 @@
int bch2_scan_for_btree_nodes(struct bch_fs *);
bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
-bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 2760dd9569ed..639ef75b3dbd 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -11,6 +11,7 @@
#include "btree_write_buffer.h"
#include "buckets.h"
#include "disk_accounting.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "journal.h"
@@ -20,6 +21,7 @@
#include "snapshot.h"
#include <linux/prefetch.h>
+#include <linux/string_helpers.h>
static const char * const trans_commit_flags_strs[] = {
#define x(n, ...) #n,
@@ -164,6 +166,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+ kmsan_check_memory(insert, bkey_bytes(&insert->k));
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
@@ -336,6 +339,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
BUG_ON(i->cached != path->cached);
BUG_ON(i->level != path->level);
BUG_ON(i->btree_id != path->btree_id);
+ BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_internal_snapshot_node) &&
@@ -364,14 +368,15 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
struct jset_entry_log *l =
container_of(entry, struct jset_entry_log, entry);
- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+ memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
+ trans->fn, strlen(trans->fn), 0);
}
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
if (!bch2_btree_node_insert_fits(b, u64s))
- return -BCH_ERR_btree_insert_btree_node_full;
+ return bch_err_throw(trans->c, btree_insert_btree_node_full);
return 0;
}
@@ -389,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+ struct bch_fs *c = trans->c;
+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+ return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
}
ret = bch2_trans_relock(trans) ?:
@@ -427,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (watermark < BCH_WATERMARK_reclaim &&
!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c))
- return -BCH_ERR_btree_insert_need_journal_reclaim;
+ return bch_err_throw(c, btree_insert_need_journal_reclaim);
/*
* bch2_varint_decode can read past the end of the buffer by at most 7
@@ -517,69 +523,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
}
}
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
- unsigned *btree_id_updates_start)
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- bool trans_trigger_run;
+ unsigned sort_id_start = 0;
- /*
- * Running triggers will append more updates to the list of updates as
- * we're walking it:
- */
- do {
- trans_trigger_run = false;
+ while (sort_id_start < trans->nr_updates) {
+ unsigned i, sort_id = trans->updates[sort_id_start].sort_order;
+ bool trans_trigger_run;
- for (unsigned i = *btree_id_updates_start;
- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
- i++) {
- if (trans->updates[i].btree_id < btree_id) {
- *btree_id_updates_start = i;
- continue;
+ /*
+ * For a given btree, this algorithm runs insert triggers before
+ * overwrite triggers: this is so that when extents are being
+ * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop
+ * references before they are re-added.
+ *
+ * Running triggers will append more updates to the list of
+ * updates as we're walking it:
+ */
+ do {
+ trans_trigger_run = false;
+
+ for (i = sort_id_start;
+ i < trans->nr_updates && trans->updates[i].sort_order <= sort_id;
+ i++) {
+ if (trans->updates[i].sort_order < sort_id) {
+ sort_id_start = i;
+ continue;
+ }
+
+ int ret = run_one_trans_trigger(trans, trans->updates + i);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ trans_trigger_run = true;
}
+ } while (trans_trigger_run);
- int ret = run_one_trans_trigger(trans, trans->updates + i);
- if (ret < 0)
- return ret;
- if (ret)
- trans_trigger_run = true;
- }
- } while (trans_trigger_run);
-
- trans_for_each_update(trans, i)
- BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
- i->btree_id == btree_id &&
- btree_node_type_has_trans_triggers(i->bkey_type) &&
- (!i->insert_trigger_run || !i->overwrite_trigger_run));
-
- return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
- unsigned btree_id = 0, btree_id_updates_start = 0;
- int ret = 0;
-
- /*
- *
- * For a given btree, this algorithm runs insert triggers before
- * overwrite triggers: this is so that when extents are being moved
- * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
- * they are re-added.
- */
- for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
- if (btree_id == BTREE_ID_alloc)
- continue;
-
- ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start);
- if (ret)
- return ret;
+ sort_id_start = i;
}
- btree_id_updates_start = 0;
- ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start);
- if (ret)
- return ret;
-
#ifdef CONFIG_BCACHEFS_DEBUG
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_norun) &&
@@ -613,12 +595,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
int ret = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
-
+#if 0
+ /* todo: bring back dynamic fault injection */
if (race_fault()) {
trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
}
-
+#endif
/*
* Check if the insert will fit in the leaf node with the write lock
* held, otherwise another thread could write the node changing the
@@ -666,10 +649,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
!(flags & BCH_TRANS_COMMIT_no_journal_res)) {
- if (bch2_journal_seq_verify)
+ if (static_branch_unlikely(&bch2_journal_seq_verify))
trans_for_each_update(trans, i)
i->k->k.bversion.lo = trans->journal_res.seq;
- else if (bch2_inject_invalid_keys)
+ else if (static_branch_unlikely(&bch2_inject_invalid_keys))
trans_for_each_update(trans, i)
i->k->k.bversion = MAX_VERSION;
}
@@ -682,18 +665,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
h = h->next;
}
- struct jset_entry *entry = trans->journal_entries;
+ struct bkey_i *accounting;
percpu_down_read(&c->mark_lock);
- for (entry = trans->journal_entries;
- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- entry = vstruct_next(entry))
- if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry->start->k.type == KEY_TYPE_accounting) {
- ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
- if (ret)
- goto revert_fs_usage;
- }
+ for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
+ accounting != btree_trans_subbuf_top(trans, &trans->accounting);
+ accounting = bkey_next(accounting)) {
+ ret = bch2_accounting_trans_commit_hook(trans,
+ bkey_i_to_accounting(accounting), flags);
+ if (ret)
+ goto revert_fs_usage;
+ }
percpu_up_read(&c->mark_lock);
/* XXX: we only want to run this if deltas are nonzero */
@@ -717,8 +699,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
+ i != btree_trans_journal_entries_top(trans);
i = vstruct_next(i)) {
ret = bch2_journal_entry_validate(c, NULL, i,
bcachefs_metadata_version_current,
@@ -773,11 +755,20 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->journal_entries,
- trans->journal_entries_u64s);
+ btree_trans_journal_entries_start(trans),
+ trans->journal_entries.u64s);
+
+ EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s);
+
+ trans->journal_res.offset += trans->journal_entries.u64s;
+ trans->journal_res.u64s -= trans->journal_entries.u64s;
- trans->journal_res.offset += trans->journal_entries_u64s;
- trans->journal_res.u64s -= trans->journal_entries_u64s;
+ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_write_buffer_keys,
+ BTREE_ID_accounting, 0,
+ trans->accounting.u64s)->_data,
+ btree_trans_subbuf_base(trans, &trans->accounting),
+ trans->accounting.u64s);
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
@@ -799,13 +790,10 @@ fatal_err:
bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
percpu_down_read(&c->mark_lock);
revert_fs_usage:
- for (struct jset_entry *entry2 = trans->journal_entries;
- entry2 != entry;
- entry2 = vstruct_next(entry2))
- if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
- entry2->start->k.type == KEY_TYPE_accounting)
- bch2_accounting_trans_commit_revert(trans,
- bkey_i_to_accounting(entry2->start), flags);
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
+ i != accounting;
+ i = bkey_next(i))
+ bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
percpu_up_read(&c->mark_lock);
return ret;
}
@@ -903,32 +891,34 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
struct bch_fs *c = trans->c;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
- switch (ret) {
- case -BCH_ERR_btree_insert_btree_node_full:
- ret = bch2_btree_split_leaf(trans, i->path, flags);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans,
- trace_ip, trans->paths + i->path);
- break;
- case -BCH_ERR_btree_insert_need_mark_replicas:
- ret = drop_locks_do(trans,
- bch2_accounting_update_sb(trans));
- break;
- case -BCH_ERR_journal_res_get_blocked:
+ if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) {
/*
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
*/
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
- break;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
+ goto out;
}
ret = drop_locks_do(trans,
bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_CHECK));
+ goto out;
+ }
+
+ switch (ret) {
+ case -BCH_ERR_btree_insert_btree_node_full:
+ ret = bch2_btree_split_leaf(trans, i->path, flags);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
+ break;
+ case -BCH_ERR_btree_insert_need_mark_replicas:
+ ret = drop_locks_do(trans,
+ bch2_accounting_update_sb(trans));
break;
case -BCH_ERR_btree_insert_need_journal_reclaim:
bch2_trans_unlock(trans);
@@ -950,7 +940,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(ret >= 0);
break;
}
-
+out:
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
@@ -978,15 +968,36 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
return ret;
}
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i))
+ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
+ i != btree_trans_journal_entries_top(trans);
+ i = vstruct_next(i)) {
if (i->type == BCH_JSET_ENTRY_btree_keys ||
i->type == BCH_JSET_ENTRY_write_buffer_keys) {
- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
- if (ret)
- return ret;
+ jset_entry_for_each_key(i, k) {
+ int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (i->type == BCH_JSET_ENTRY_btree_root) {
+ guard(mutex)(&c->btree_root_lock);
+
+ struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
+
+ bkey_copy(&r->key, i->start);
+ r->level = i->level;
+ r->alive = true;
}
+ }
+
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
+ i != btree_trans_subbuf_top(trans, &trans->accounting);
+ i = bkey_next(i)) {
+ int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -995,12 +1006,18 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
struct btree_insert_entry *errored_at = NULL;
struct bch_fs *c = trans->c;
+ unsigned journal_u64s = 0;
int ret = 0;
bch2_trans_verify_not_unlocked_or_in_restart(trans);
+ ret = trans_maybe_inject_restart(trans, _RET_IP_);
+ if (unlikely(ret))
+ goto out_reset;
+
if (!trans->nr_updates &&
- !trans->journal_entries_u64s)
+ !trans->journal_entries.u64s &&
+ !trans->accounting.u64s)
goto out_reset;
ret = bch2_trans_commit_run_triggers(trans);
@@ -1008,20 +1025,20 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+ unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
ret = do_bch2_trans_commit_to_journal_replay(trans);
else
- ret = -BCH_ERR_erofs_trans_commit;
+ ret = bch_err_throw(c, erofs_trans_commit);
goto out_reset;
}
EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- trans->journal_u64s = trans->journal_entries_u64s;
+ journal_u64s = jset_u64s(trans->accounting.u64s);
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+ journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
struct btree_path *path = trans->paths + i->path;
@@ -1041,11 +1058,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
continue;
/* we're going to journal the key being updated: */
- trans->journal_u64s += jset_u64s(i->k->k.u64s);
+ journal_u64s += jset_u64s(i->k->k.u64s);
/* and we're also going to log the overwrite: */
if (trans->journal_transaction_names)
- trans->journal_u64s += jset_u64s(i->old_k.u64s);
+ journal_u64s += jset_u64s(i->old_k.u64s);
}
if (trans->extra_disk_res) {
@@ -1063,6 +1080,8 @@ retry:
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+ trans->journal_u64s = journal_u64s + trans->journal_entries.u64s;
+
ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
@@ -1074,7 +1093,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
- bch2_write_ref_put(c, BCH_WRITE_REF_trans);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
bch2_trans_downgrade(trans);
@@ -1094,7 +1113,7 @@ err:
* restart:
*/
if (flags & BCH_TRANS_COMMIT_no_journal_res) {
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
goto out;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index a6f251eb4164..112170fd9c8f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -139,6 +139,7 @@ struct btree {
};
#define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
+ x(cache_reserve) \
x(lock_intent) \
x(lock_write) \
x(dirty) \
@@ -257,9 +258,6 @@ struct btree_node_iter {
*
* BTREE_TRIGGER_insert - @new is entering the btree
* BTREE_TRIGGER_overwrite - @old is leaving the btree
- *
- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
- * trigger
*/
#define BTREE_TRIGGER_FLAGS() \
x(norun) \
@@ -269,8 +267,7 @@ struct btree_node_iter {
x(gc) \
x(insert) \
x(overwrite) \
- x(is_root) \
- x(bucket_invalidate)
+ x(is_root)
enum {
#define x(n) BTREE_ITER_FLAG_BIT_##n,
@@ -367,7 +364,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
- struct btree_trans *trans;
btree_path_idx_t path;
btree_path_idx_t update_path;
btree_path_idx_t key_cache_path;
@@ -423,6 +419,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b)
struct btree_insert_entry {
unsigned flags;
+ u8 sort_order;
u8 bkey_type;
enum btree_id btree_id:8;
u8 level:4;
@@ -477,6 +474,18 @@ struct btree_trans_paths {
struct btree_path paths[];
};
+struct trans_kmalloc_trace {
+ unsigned long ip;
+ size_t bytes;
+};
+typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
+
+struct btree_trans_subbuf {
+ u16 base;
+ u16 u64s;
+ u16 size;;
+};
+
struct btree_trans {
struct bch_fs *c;
@@ -488,6 +497,10 @@ struct btree_trans {
void *mem;
unsigned mem_top;
unsigned mem_bytes;
+ unsigned realloc_bytes_required;
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ darray_trans_kmalloc_trace trans_kmalloc_trace;
+#endif
btree_path_idx_t nr_sorted;
btree_path_idx_t nr_paths;
@@ -509,6 +522,9 @@ struct btree_trans {
bool notrace_relock_fail:1;
enum bch_errcode restarted:16;
u32 restart_count;
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+ u32 restart_count_this_trans;
+#endif
u64 last_begin_time;
unsigned long last_begin_ip;
@@ -525,9 +541,8 @@ struct btree_trans {
int srcu_idx;
/* update path: */
- u16 journal_entries_u64s;
- u16 journal_entries_size;
- struct jset_entry *journal_entries;
+ struct btree_trans_subbuf journal_entries;
+ struct btree_trans_subbuf accounting;
struct btree_trans_commit_hook *hooks;
struct journal_entry_pin *journal_pin;
@@ -541,6 +556,8 @@ struct btree_trans {
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
+ __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -601,6 +618,9 @@ enum btree_write_type {
x(dying) \
x(fake) \
x(need_rewrite) \
+ x(need_rewrite_error) \
+ x(need_rewrite_degraded) \
+ x(need_rewrite_ptr_written_zero) \
x(never_write) \
x(pinned)
@@ -625,6 +645,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \
BTREE_FLAGS()
#undef x
+#define BTREE_NODE_REWRITE_REASON() \
+ x(none) \
+ x(unknown) \
+ x(error) \
+ x(degraded) \
+ x(ptr_written_zero)
+
+enum btree_node_rewrite_reason {
+#define x(n) BTREE_NODE_REWRITE_##n,
+ BTREE_NODE_REWRITE_REASON()
+#undef x
+};
+
+static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b)
+{
+ if (btree_node_need_rewrite_ptr_written_zero(b))
+ return BTREE_NODE_REWRITE_ptr_written_zero;
+ if (btree_node_need_rewrite_degraded(b))
+ return BTREE_NODE_REWRITE_degraded;
+ if (btree_node_need_rewrite_error(b))
+ return BTREE_NODE_REWRITE_error;
+ if (btree_node_need_rewrite(b))
+ return BTREE_NODE_REWRITE_unknown;
+ return BTREE_NODE_REWRITE_none;
+}
+
static inline struct btree_write *btree_current_write(struct btree *b)
{
return b->writes + btree_node_write_idx(b);
@@ -644,13 +690,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b)
static inline void *
__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
{
- return (void *) ((u64 *) b->data + 1 + offset);
+ return (void *) ((u64 *) b->data + offset);
}
static inline u16
__btree_node_ptr_to_offset(const struct btree *b, const void *p)
{
- u16 ret = (u64 *) p - 1 - (u64 *) b->data;
+ u16 ret = (u64 *) p - (u64 *) b->data;
EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
return ret;
@@ -850,6 +896,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree)
return BIT_ULL(btree) & mask;
}
+static inline u8 btree_trigger_order(enum btree_id btree)
+{
+ switch (btree) {
+ case BTREE_ID_alloc:
+ return U8_MAX;
+ case BTREE_ID_stripes:
+ return U8_MAX - 1;
+ default:
+ return btree;
+ }
+}
+
struct btree_root {
struct btree *b;
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 13d794f201a5..ee657b9f4b96 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -14,10 +14,12 @@
#include "snapshot.h"
#include "trace.h"
+#include <linux/string_helpers.h>
+
static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
const struct btree_insert_entry *r)
{
- return cmp_int(l->btree_id, r->btree_id) ?:
+ return cmp_int(l->sort_order, r->sort_order) ?:
cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
@@ -121,65 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
- enum btree_id id,
- struct bpos old_pos,
- struct bpos new_pos)
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
{
- struct bch_fs *c = trans->c;
- struct btree_iter old_iter, new_iter = { NULL };
- struct bkey_s_c old_k, new_k;
- snapshot_id_list s;
- struct bkey_i *update;
int ret = 0;
- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
- return 0;
-
- darray_init(&s);
+ darray_for_each(*s, id) {
+ pos.snapshot = *id;
- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_all_snapshots);
- while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
- !(ret = bkey_err(old_k)) &&
- bkey_eq(old_pos, old_k.k->p)) {
- struct bpos whiteout_pos =
- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
- continue;
-
- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
- BTREE_ITER_not_extents|
- BTREE_ITER_intent);
- ret = bkey_err(new_k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos,
+ BTREE_ITER_not_extents|
+ BTREE_ITER_intent);
+ ret = bkey_err(k);
if (ret)
break;
- if (new_k.k->type == KEY_TYPE_deleted) {
- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ if (k.k->type == KEY_TYPE_deleted) {
+ struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
ret = PTR_ERR_OR_ZERO(update);
- if (ret)
+ if (ret) {
+ bch2_trans_iter_exit(trans, &iter);
break;
+ }
bkey_init(&update->k);
- update->k.p = whiteout_pos;
+ update->k.p = pos;
update->k.type = KEY_TYPE_whiteout;
- ret = bch2_trans_update(trans, &new_iter, update,
+ ret = bch2_trans_update(trans, &iter, update,
BTREE_UPDATE_internal_snapshot_node);
}
- bch2_trans_iter_exit(trans, &new_iter);
+ bch2_trans_iter_exit(trans, &iter);
- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
if (ret)
break;
}
- bch2_trans_iter_exit(trans, &new_iter);
- bch2_trans_iter_exit(trans, &old_iter);
- darray_exit(&s);
+ darray_exit(s);
return ret;
}
@@ -296,7 +277,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
BTREE_ITER_intent|
BTREE_ITER_with_updates|
BTREE_ITER_not_extents);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -322,8 +303,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
if (done)
goto out;
next:
- bch2_btree_iter_advance(&iter);
- k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ bch2_btree_iter_advance(trans, &iter);
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
if ((ret = bkey_err(k)))
goto err;
if (!k.k)
@@ -397,6 +378,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
n = (struct btree_insert_entry) {
.flags = flags,
+ .sort_order = btree_trigger_order(path->btree_id),
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,
@@ -508,9 +490,12 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return 0;
}
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
+ unsigned long ip)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
@@ -543,7 +528,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
path_idx = iter->key_cache_path;
}
- return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
}
int bch2_btree_insert_clone_trans(struct btree_trans *trans,
@@ -559,43 +544,48 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans,
return bch2_btree_insert_trans(trans, btree, n, 0);
}
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf,
+ unsigned u64s)
{
- unsigned new_top = trans->journal_entries_u64s + u64s;
- unsigned old_size = trans->journal_entries_size;
+ unsigned new_top = buf->u64s + u64s;
+ unsigned new_size = buf->size;
- if (new_top > trans->journal_entries_size) {
- trans->journal_entries_size = roundup_pow_of_two(new_top);
+ BUG_ON(roundup_pow_of_two(new_top) > U16_MAX);
- btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
- }
+ if (new_top > new_size)
+ new_size = roundup_pow_of_two(new_top);
- struct jset_entry *n =
- bch2_trans_kmalloc_nomemzero(trans,
- trans->journal_entries_size * sizeof(u64));
+ void *n = bch2_trans_kmalloc_nomemzero(trans, new_size * sizeof(u64));
if (IS_ERR(n))
- return ERR_CAST(n);
+ return n;
- if (trans->journal_entries)
- memcpy(n, trans->journal_entries, old_size * sizeof(u64));
- trans->journal_entries = n;
+ unsigned offset = (u64 *) n - (u64 *) trans->mem;
+ BUG_ON(offset > U16_MAX);
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
- trans->journal_entries_u64s = new_top;
- return e;
+ if (buf->u64s)
+ memcpy(n,
+ btree_trans_subbuf_base(trans, buf),
+ buf->size * sizeof(u64));
+ buf->base = (u64 *) n - (u64 *) trans->mem;
+ buf->size = new_size;
+
+ void *p = btree_trans_subbuf_top(trans, buf);
+ buf->u64s = new_top;
+ return p;
}
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos end)
{
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
- struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
int ret = bkey_err(k);
if (ret)
goto err;
- bch2_btree_iter_advance(iter);
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_advance(trans, iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -603,7 +593,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
BUG_ON(k.k->type != KEY_TYPE_deleted);
if (bkey_gt(k.k->p, end)) {
- ret = -BCH_ERR_ENOSPC_btree_slot;
+ ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
goto err;
}
@@ -631,7 +621,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
BTREE_ITER_cached|
BTREE_ITER_not_extents|
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -643,7 +633,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
BTREE_ITER_intent|flags);
- int ret = bch2_btree_iter_traverse(&iter) ?:
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, k, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -692,7 +682,7 @@ int bch2_btree_delete(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, btree, pos,
BTREE_ITER_cached|
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, update_flags);
bch2_trans_iter_exit(trans, &iter);
@@ -710,7 +700,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
int ret = 0;
bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent);
- while ((k = bch2_btree_iter_peek_max(&iter, end)).k) {
+ while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) {
struct disk_reservation disk_res =
bch2_disk_reservation_init(trans->c, 0);
struct bkey_i delete;
@@ -805,7 +795,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter) ?:
+ int ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_bit_mod_iter(trans, &iter, set);
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -823,23 +813,45 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
return bch2_trans_update_buffered(trans, btree, &k);
}
-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
+static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len)
{
- unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));
- prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);
+ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64));
+
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
+ struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
+ journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0);
+ return 0;
+}
+
+int bch2_trans_log_str(struct btree_trans *trans, const char *str)
+{
+ return __bch2_trans_log_str(trans, str, strlen(str));
+}
+
+int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf)
+{
int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
if (ret)
return ret;
- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
- ret = PTR_ERR_OR_ZERO(e);
+ return __bch2_trans_log_str(trans, buf->buf, buf->pos);
+}
+
+int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree,
+ unsigned level, struct bkey_i *k)
+{
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
- journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy(l->d, buf->buf, buf->pos);
+ journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s);
+ bkey_copy(e->start, k);
return 0;
}
@@ -852,7 +864,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
prt_vprintf(&buf, fmt, args);
unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
- prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
if (ret)
@@ -865,7 +876,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
- memcpy(l->d, buf.buf, buf.pos);
+ memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags,
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 8f22ef9a7651..0b98ab959719 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "journal.h"
+#include "snapshot.h"
struct bch_fs;
struct btree;
@@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
}
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
- struct bpos, struct bpos);
+ struct bpos, snapshot_id_list *);
/*
* For use when splitting extents in existing snapshots:
@@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
struct bpos old_pos,
struct bpos new_pos)
{
+ BUG_ON(old_pos.snapshot != new_pos.snapshot);
+
if (!btree_type_has_snapshots(btree) ||
bkey_eq(old_pos, new_pos))
return 0;
- return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos);
+ snapshot_id_list s;
+ int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s);
+ if (ret)
+ return ret;
+
+ return s.nr
+ ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s)
+ : 0;
}
int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *,
@@ -102,34 +112,79 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos);
-int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, enum btree_iter_update_trigger_flags);
+int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
+ struct bkey_i *, enum btree_iter_update_trigger_flags,
+ unsigned long);
+
+static inline int __must_check
+bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+{
+ return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
+}
+
+static inline void *btree_trans_subbuf_base(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf)
+{
+ return (u64 *) trans->mem + buf->base;
+}
-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
+static inline void *btree_trans_subbuf_top(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf)
+{
+ return (u64 *) trans->mem + buf->base + buf->u64s;
+}
+
+void *__bch2_trans_subbuf_alloc(struct btree_trans *,
+ struct btree_trans_subbuf *,
+ unsigned);
+
+static inline void *
+bch2_trans_subbuf_alloc(struct btree_trans *trans,
+ struct btree_trans_subbuf *buf,
+ unsigned u64s)
+{
+ if (buf->u64s + u64s > buf->size)
+ return __bch2_trans_subbuf_alloc(trans, buf, u64s);
+
+ void *p = btree_trans_subbuf_top(trans, buf);
+ buf->u64s += u64s;
+ return p;
+}
+
+static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans)
+{
+ return btree_trans_subbuf_base(trans, &trans->journal_entries);
+}
static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
{
- return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ return btree_trans_subbuf_top(trans, &trans->journal_entries);
}
static inline struct jset_entry *
bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
{
- if (!trans->journal_entries ||
- trans->journal_entries_u64s + u64s > trans->journal_entries_size)
- return __bch2_trans_jset_entry_alloc(trans, u64s);
-
- struct jset_entry *e = btree_trans_journal_entries_top(trans);
- trans->journal_entries_u64s += u64s;
- return e;
+ return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s);
}
int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bkey_i *);
+
static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
{
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+ if (unlikely(!btree_type_uses_write_buffer(btree))) {
+ int ret = bch2_btree_write_buffer_insert_err(trans->c, btree, k);
+ dump_stack();
+ return ret;
+ }
/*
* Most updates skip the btree write buffer until journal replay is
* finished because synchronization with journal replay relies on having
@@ -159,7 +214,10 @@ void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
int __bch2_trans_commit(struct btree_trans *, unsigned);
+int bch2_trans_log_str(struct btree_trans *, const char *);
int bch2_trans_log_msg(struct btree_trans *, struct printbuf *);
+int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *);
+
__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
@@ -205,12 +263,15 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
bch2_path_put(trans, i->path, true);
trans->nr_updates = 0;
- trans->journal_entries_u64s = 0;
+ trans->journal_entries.u64s = 0;
+ trans->journal_entries.size = 0;
+ trans->accounting.u64s = 0;
+ trans->accounting.size = 0;
trans->hooks = NULL;
trans->extra_disk_res = 0;
}
-static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
+static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k,
unsigned type, unsigned min_bytes)
{
unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k));
@@ -233,7 +294,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t
return mut;
}
-static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
+static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k)
{
return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0);
}
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index f4aeadbe53c1..553059b33bfd 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -14,6 +14,7 @@
#include "btree_locking.h"
#include "buckets.h"
#include "clock.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "io_write.h"
@@ -35,6 +36,8 @@ static const char * const bch2_btree_update_modes[] = {
NULL
};
+static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *);
+
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
btree_path_idx_t, struct btree *, struct keylist *);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
@@ -64,19 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
if (b == btree_node_root(c, b)) {
if (!bpos_eq(b->data->min_key, POS_MIN)) {
- printbuf_reset(&buf);
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "btree root with incorrect min_key: ");
bch2_bpos_to_text(&buf, b->data->min_key);
- log_fsck_err(trans, btree_root_bad_min_key,
- "btree root with incorrect min_key: %s", buf.buf);
- goto topology_repair;
+ prt_newline(&buf);
+
+ bch2_count_fsck_err(c, btree_root_bad_min_key, &buf);
+ goto err;
}
if (!bpos_eq(b->data->max_key, SPOS_MAX)) {
- printbuf_reset(&buf);
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "btree root with incorrect max_key: ");
bch2_bpos_to_text(&buf, b->data->max_key);
- log_fsck_err(trans, btree_root_bad_max_key,
- "btree root with incorrect max_key: %s", buf.buf);
- goto topology_repair;
+ prt_newline(&buf);
+
+ bch2_count_fsck_err(c, btree_root_bad_max_key, &buf);
+ goto err;
}
}
@@ -94,20 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
: bpos_successor(prev.k->k.p);
if (!bpos_eq(expected_min, bp.v->min_key)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf);
- prt_str(&buf, "end of prev node doesn't match start of next node\n in ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\n prev ");
+ prt_str(&buf, "end of prev node doesn't match start of next node");
+ prt_str(&buf, "\nprev ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
- prt_str(&buf, "\n next ");
+ prt_str(&buf, "\nnext ");
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
- log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf);
- goto topology_repair;
+ bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf);
+ goto err;
}
bch2_bkey_buf_reassemble(&prev, c, k);
@@ -115,38 +117,33 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
}
if (bkey_deleted(&prev.k->k)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf);
- prt_str(&buf, "empty interior node\n in ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-
- log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf);
- goto topology_repair;
- } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
- bch2_topology_error(c);
-
- printbuf_reset(&buf);
- prt_str(&buf, "last child node doesn't end at end of parent node\n in ");
- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
- prt_str(&buf, " node ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- prt_str(&buf, "\n last key ");
+ prt_printf(&buf, "empty interior node\n");
+ bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf);
+ goto err;
+ }
+
+ if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+ prt_str(&buf, "last child node doesn't end at end of parent node\nchild: ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+ prt_newline(&buf);
- log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf);
- goto topology_repair;
+ bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf);
+ goto err;
}
out:
-fsck_err:
bch2_btree_and_journal_iter_exit(&iter);
bch2_bkey_buf_exit(&prev, c);
printbuf_exit(&buf);
return ret;
-topology_repair:
- ret = bch2_topology_error(c);
+err:
+ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+ prt_char(&buf, ' ');
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+ prt_newline(&buf);
+
+ ret = __bch2_topology_error(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ BUG_ON(!ret);
goto out;
}
@@ -287,6 +284,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
bool interior_node,
+ unsigned target,
unsigned flags)
{
struct bch_fs *c = trans->c;
@@ -320,6 +318,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
ret = bch2_alloc_sectors_start_trans(trans,
+ target ?:
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@@ -328,7 +327,9 @@ retry:
res->nr_replicas,
min(res->nr_replicas,
c->opts.metadata_replicas_required),
- watermark, 0, cl, &wp);
+ watermark,
+ target ? BCH_WRITE_only_specified_devs : 0,
+ cl, &wp);
if (unlikely(ret))
goto err;
@@ -508,6 +509,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
static int bch2_btree_reserve_get(struct btree_trans *trans,
struct btree_update *as,
unsigned nr_nodes[2],
+ unsigned target,
unsigned flags,
struct closure *cl)
{
@@ -530,7 +532,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
while (p->nr < nr_nodes[interior]) {
b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
- interior, flags);
+ interior, target, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
@@ -649,6 +651,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
return 0;
}
+/* If the node has been reused, we might be reading uninitialized memory - that's fine: */
+static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq)
+{
+ struct btree_node *b_data = READ_ONCE(b->data);
+
+ return (b_data ? b_data->keys.seq : 0) == seq;
+}
+
static void btree_update_nodes_written(struct btree_update *as)
{
struct bch_fs *c = as->c;
@@ -674,18 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as)
/*
* Wait for any in flight writes to finish before we free the old nodes
- * on disk:
+ * on disk. But we haven't pinned those old nodes in the btree cache,
+ * they might have already been evicted.
+ *
+ * The update we're completing deleted references to those nodes from the
+ * btree, so we know if they've been evicted they can't be pulled back in.
+ * We just have to check if the nodes we have pointers to are still those
+ * old nodes, and haven't been reused.
+ *
+ * This can't be done locklessly because the data buffer might have been
+ * vmalloc allocated, and they're not RCU freed. We also need the
+ * __no_kmsan_checks annotation because even with the btree node read
+ * lock, nothing tells us that the data buffer has been initialized (if
+ * the btree node has been reused for a different node, and the data
+ * buffer swapped for a new data buffer).
*/
for (i = 0; i < as->nr_old_nodes; i++) {
- __le64 seq;
-
b = as->old_nodes[i];
+ bch2_trans_begin(trans);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- seq = b->data ? b->data->keys.seq : 0;
+ bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]);
six_unlock_read(&b->c.lock);
+ bch2_trans_unlock_long(trans);
- if (seq == as->old_nodes_seq[i])
+ if (seq_matches)
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner,
TASK_UNINTERRUPTIBLE);
}
@@ -1115,9 +1138,17 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
start_time);
}
+static const char * const btree_node_reawrite_reason_strs[] = {
+#define x(n) #n,
+ BTREE_NODE_REWRITE_REASON()
+#undef x
+ NULL,
+};
+
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
- unsigned level_start, bool split, unsigned flags)
+ unsigned level_start, bool split,
+ unsigned target, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
@@ -1208,6 +1239,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
list_add_tail(&as->list, &c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
+ struct btree *b = btree_path_node(path, path->level);
+ as->node_start = b->data->min_key;
+ as->node_end = b->data->max_key;
+ as->node_needed_rewrite = btree_node_rewrite_reason(b);
+ as->node_written = b->written;
+ as->node_sectors = btree_buf_bytes(b) >> 9;
+ as->node_remaining = __bch2_btree_u64s_remaining(b,
+ btree_bkey_last(b, bset_tree_last(b)));
+
/*
* We don't want to allocate if we're in an error state, that can cause
* deadlock on emergency shutdown due to open buckets getting stuck in
@@ -1222,12 +1262,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
ret = bch2_disk_reservation_get(c, &as->disk_res,
(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
- c->opts.metadata_replicas,
+ READ_ONCE(c->opts.metadata_replicas),
disk_res_flags);
if (ret)
goto err;
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL);
if (bch2_err_matches(ret, ENOSPC) ||
bch2_err_matches(ret, ENOMEM)) {
struct closure cl;
@@ -1239,18 +1279,19 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (bch2_err_matches(ret, ENOSPC) &&
(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
goto err;
}
closure_init_stack(&cl);
do {
- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
-
+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl);
+ if (!bch2_err_matches(ret, BCH_ERR_operation_blocked))
+ break;
bch2_trans_unlock(trans);
bch2_wait_on_allocator(c, &cl);
- } while (bch2_err_matches(ret, BCH_ERR_operation_blocked));
+ } while (1);
}
if (ret) {
@@ -1269,7 +1310,8 @@ err:
bch2_btree_update_free(as, trans);
if (!bch2_err_matches(ret, ENOSPC) &&
!bch2_err_matches(ret, EROFS) &&
- ret != -BCH_ERR_journal_reclaim_would_deadlock)
+ ret != -BCH_ERR_journal_reclaim_would_deadlock &&
+ ret != -BCH_ERR_journal_shutdown)
bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1389,7 +1431,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
printbuf_exit(&buf);
}
-static void
+static int
bch2_btree_insert_keys_interior(struct btree_update *as,
struct btree_trans *trans,
struct btree_path *path,
@@ -1411,7 +1453,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
insert = bkey_next(insert))
bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
- if (bch2_btree_node_check_topology(trans, b)) {
+ int ret = bch2_btree_node_check_topology(trans, b);
+ if (ret) {
struct printbuf buf = PRINTBUF;
for (struct bkey_i *k = keys->keys;
@@ -1421,11 +1464,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
prt_newline(&buf);
}
- panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf);
+ bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s",
+ (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf);
+ dump_stack();
+ return ret;
}
memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data);
keys->top_p -= insert->_data - keys->keys_p;
+ return 0;
}
static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
@@ -1559,11 +1606,11 @@ static void __btree_split_node(struct btree_update *as,
* nodes that were coalesced, and thus in the middle of a child node post
* coalescing:
*/
-static void btree_split_insert_keys(struct btree_update *as,
- struct btree_trans *trans,
- btree_path_idx_t path_idx,
- struct btree *b,
- struct keylist *keys)
+static int btree_split_insert_keys(struct btree_update *as,
+ struct btree_trans *trans,
+ btree_path_idx_t path_idx,
+ struct btree *b,
+ struct keylist *keys)
{
struct btree_path *path = trans->paths + path_idx;
@@ -1573,8 +1620,12 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
- bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+ if (ret)
+ return ret;
}
+
+ return 0;
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
@@ -1607,8 +1658,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
__btree_split_node(as, trans, b, n, keys);
if (keys) {
- btree_split_insert_keys(as, trans, path, n1, keys);
- btree_split_insert_keys(as, trans, path, n2, keys);
+ ret = btree_split_insert_keys(as, trans, path, n1, keys) ?:
+ btree_split_insert_keys(as, trans, path, n2, keys);
+ if (ret)
+ goto err;
BUG_ON(!bch2_keylist_empty(keys));
}
@@ -1654,7 +1707,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
- btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
+ ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
+ if (ret)
+ goto err;
}
} else {
trace_and_count(c, btree_node_compact, trans, b);
@@ -1662,7 +1717,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
if (keys) {
- btree_split_insert_keys(as, trans, path, n1, keys);
+ ret = btree_split_insert_keys(as, trans, path, n1, keys);
+ if (ret)
+ goto err;
BUG_ON(!bch2_keylist_empty(keys));
}
@@ -1780,11 +1837,24 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
int ret;
lockdep_assert_held(&c->gc_lock);
- BUG_ON(!btree_node_intent_locked(path, b->c.level));
BUG_ON(!b->c.level);
BUG_ON(!as || as->b);
bch2_verify_keylist_sorted(keys);
+ if (!btree_node_intent_locked(path, b->c.level)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "%s(): node not locked at level %u\n",
+ __func__, b->c.level);
+ bch2_btree_update_to_text(&buf, as);
+ bch2_btree_path_to_text(&buf, trans, path_idx);
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return -EIO;
+ }
+
ret = bch2_btree_node_lock_write(trans, path, &b->c);
if (ret)
return ret;
@@ -1796,15 +1866,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
goto split;
}
- ret = bch2_btree_node_check_topology(trans, b);
+
+ ret = bch2_btree_node_check_topology(trans, b) ?:
+ bch2_btree_insert_keys_interior(as, trans, path, b,
+ path->l[b->c.level].iter, keys);
if (ret) {
bch2_btree_node_unlock_write(trans, path, b);
return ret;
}
- bch2_btree_insert_keys_interior(as, trans, path, b,
- path->l[b->c.level].iter, keys);
-
trans_for_each_path_with_node(trans, b, linked, i)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
@@ -1850,7 +1920,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
as = bch2_btree_update_start(trans, trans->paths + path,
trans->paths[path].level,
- true, flags);
+ true, 0, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1920,7 +1990,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path,
return bch2_btree_split_leaf(trans, path, flags);
struct btree_update *as =
- bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
+ bch2_btree_update_start(trans, trans->paths + path, b->c.level,
+ true, 0, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -2005,18 +2076,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
}
if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) {
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- bch2_bpos_to_text(&buf1, prev->data->max_key);
- bch2_bpos_to_text(&buf2, next->data->min_key);
- bch_err(c,
- "%s(): btree topology error:\n"
- " prev ends at %s\n"
- " next starts at %s",
- __func__, buf1.buf, buf2.buf);
- printbuf_exit(&buf1);
- printbuf_exit(&buf2);
- ret = bch2_topology_error(c);
+ struct printbuf buf = PRINTBUF;
+
+ printbuf_indent_add_nextline(&buf, 2);
+ prt_printf(&buf, "%s(): ", __func__);
+ ret = __bch2_topology_error(c, &buf);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "prev ends at ");
+ bch2_bpos_to_text(&buf, prev->data->max_key);
+ prt_newline(&buf);
+
+ prt_printf(&buf, "next starts at ");
+ bch2_bpos_to_text(&buf, next->data->min_key);
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
goto err;
}
@@ -2045,11 +2120,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
parent = btree_node_parent(trans->paths + path, b);
as = bch2_btree_update_start(trans, trans->paths + path, level, false,
- BCH_TRANS_COMMIT_no_enospc|flags);
+ 0, BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
+ as->node_start = prev->data->min_key;
+ as->node_end = next->data->max_key;
+
trace_and_count(c, btree_node_merge, trans, b);
n = bch2_btree_node_alloc(as, trans, b->c.level);
@@ -2124,9 +2202,35 @@ err_free_update:
goto out;
}
+static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b)
+{
+ bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
+ BTREE_MAX_DEPTH, b->c.level,
+ BTREE_ITER_intent);
+ int ret = bch2_btree_iter_traverse(trans, iter);
+ if (ret)
+ goto err;
+
+ /* has node been freed? */
+ if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
+ /* node has been freed: */
+ BUG_ON(!btree_node_dying(b));
+ ret = bch_err_throw(trans->c, btree_node_dying);
+ goto err;
+ }
+
+ BUG_ON(!btree_node_hashed(b));
+ return 0;
+err:
+ bch2_trans_iter_exit(trans, iter);
+ return ret;
+}
+
int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_iter *iter,
struct btree *b,
+ unsigned target,
unsigned flags)
{
struct bch_fs *c = trans->c;
@@ -2139,7 +2243,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
struct btree_path *path = btree_iter_path(trans, iter);
parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
+ as = bch2_btree_update_start(trans, path, b->c.level,
+ false, target, flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto out;
@@ -2189,67 +2294,82 @@ err:
goto out;
}
-struct async_btree_rewrite {
- struct bch_fs *c;
- struct work_struct work;
- struct list_head list;
- enum btree_id btree_id;
- unsigned level;
- struct bkey_buf key;
-};
-
-static int async_btree_node_rewrite_trans(struct btree_trans *trans,
- struct async_btree_rewrite *a)
+int bch2_btree_node_rewrite_key(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_i *k, unsigned flags)
{
struct btree_iter iter;
bch2_trans_node_iter_init(trans, &iter,
- a->btree_id, a->key.k->k.p,
- BTREE_MAX_DEPTH, a->level, 0);
- struct btree *b = bch2_btree_iter_peek_node(&iter);
+ btree, k->k.p,
+ BTREE_MAX_DEPTH, level, 0);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto out;
- bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k);
+ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k);
ret = found
- ? bch2_btree_node_rewrite(trans, &iter, b, 0)
+ ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags)
: -ENOENT;
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
-#if 0
- /* Tracepoint... */
- if (!ret || ret == -ENOENT) {
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
+int bch2_btree_node_rewrite_pos(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bpos pos,
+ unsigned target,
+ unsigned flags)
+{
+ BUG_ON(!level);
- if (!ret) {
- prt_printf(&buf, "rewrite node:\n ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- } else {
- prt_printf(&buf, "node to rewrite not found:\n want: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k));
- prt_printf(&buf, "\n got: ");
- if (b)
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
- else
- prt_str(&buf, "(null)");
- }
- bch_info(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-#endif
-out:
+ /* Traverse one depth lower to get a pointer to the node itself: */
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans,
+ struct btree *b, unsigned flags)
+{
+ struct btree_iter iter;
+ int ret = get_iter_to_node(trans, &iter, b);
+ if (ret)
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
+
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+struct async_btree_rewrite {
+ struct bch_fs *c;
+ struct work_struct work;
+ struct list_head list;
+ enum btree_id btree_id;
+ unsigned level;
+ struct bkey_buf key;
+};
+
static void async_btree_node_rewrite_work(struct work_struct *work)
{
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
- if (ret != -ENOENT)
+ int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans,
+ a->btree_id, a->level, a->key.k, 0));
+ if (!bch2_err_matches(ret, ENOENT) &&
+ !bch2_err_matches(ret, EROFS))
bch_err_fn_ratelimited(c, ret);
spin_lock(&c->btree_node_rewrites_lock);
@@ -2259,7 +2379,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
closure_wake_up(&c->btree_node_rewrites_wait);
bch2_bkey_buf_exit(&a->key, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2280,8 +2400,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
bool now = false, pending = false;
spin_lock(&c->btree_node_rewrites_lock);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay &&
- bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
+ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) &&
+ enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) {
list_add(&a->list, &c->btree_node_rewrites);
now = true;
} else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
@@ -2320,7 +2440,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
if (!a)
break;
- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite);
queue_work(c->btree_node_rewrite_worker, &a->work);
}
}
@@ -2350,7 +2470,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bool skip_triggers)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter2 = { NULL };
+ struct btree_iter iter2 = {};
struct btree *parent;
int ret;
@@ -2374,7 +2494,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
- bch2_trans_copy_iter(&iter2, iter);
+ bch2_trans_copy_iter(trans, &iter2, iter);
iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
iter2.flags & BTREE_ITER_intent,
@@ -2388,7 +2508,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
trans->paths_sorted = false;
- ret = bch2_btree_iter_traverse(&iter2) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter2) ?:
bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun);
if (ret)
goto err;
@@ -2492,30 +2612,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
unsigned commit_flags, bool skip_triggers)
{
struct btree_iter iter;
- int ret;
-
- bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
- BTREE_MAX_DEPTH, b->c.level,
- BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&iter);
+ int ret = get_iter_to_node(trans, &iter, b);
if (ret)
- goto out;
-
- /* has node been freed? */
- if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
- /* node has been freed: */
- BUG_ON(!btree_node_dying(b));
- goto out;
- }
-
- BUG_ON(!btree_node_hashed(b));
+ return ret == -BCH_ERR_btree_node_dying ? 0 : ret;
bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr,
!bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev));
ret = bch2_btree_node_update_key(trans, &iter, b, new_key,
commit_flags, skip_triggers);
-out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2595,9 +2700,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update
prt_str(out, " ");
bch2_btree_id_to_text(out, as->btree_id);
- prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+ prt_printf(out, " l=%u-%u ",
as->update_level_start,
- as->update_level_end,
+ as->update_level_end);
+ bch2_bpos_to_text(out, as->node_start);
+ prt_char(out, ' ');
+ bch2_bpos_to_text(out, as->node_end);
+ prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s",
+ as->node_written,
+ as->node_sectors,
+ as->node_remaining,
+ btree_node_reawrite_reason_strs[as->node_needed_rewrite]);
+
+ prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
bch2_btree_update_modes[as->mode],
as->nodes_written,
closure_nr_remaining(&as->cl),
@@ -2724,16 +2839,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c)
c->btree_interior_update_worker =
alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
if (!c->btree_interior_update_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
c->btree_node_rewrite_worker =
alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
if (!c->btree_node_rewrite_worker)
- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init);
if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_update)))
- return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
+ return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init);
return 0;
}
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index 7930ffea3075..ac04e45a8515 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -57,6 +57,13 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
+ struct bpos node_start;
+ struct bpos node_end;
+ enum btree_node_rewrite_reason node_needed_rewrite;
+ u16 node_written;
+ u16 node_sectors;
+ u16 node_remaining;
+
unsigned update_level_start;
unsigned update_level_end;
@@ -144,7 +151,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, level));
- if (bch2_btree_node_merging_disabled)
+ if (static_branch_unlikely(&bch2_btree_node_merging_disabled))
return 0;
b = path->l[level].b;
@@ -168,8 +175,18 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
}
int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
- struct btree *, unsigned);
+ struct btree *, unsigned, unsigned);
+int bch2_btree_node_rewrite_key(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bkey_i *, unsigned);
+int bch2_btree_node_rewrite_pos(struct btree_trans *,
+ enum btree_id, unsigned,
+ struct bpos, unsigned, unsigned);
+int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *,
+ struct btree *, unsigned);
+
void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
+
int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
struct btree *, struct bkey_i *,
unsigned, bool);
@@ -278,12 +295,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
- (void *) btree_bkey_last(b, bset_tree_last(b)));
+ (void *) btree_bkey_last(b, t));
ssize_t remaining_space =
__bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
- if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+ if (b->written + block_sectors(c) <= btree_sectors(c))
return bne;
} else {
if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index b56c4987b8c9..4b095235a0d2 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -7,6 +7,7 @@
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "disk_accounting.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
@@ -144,7 +145,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
- ret = bch2_btree_iter_traverse(iter);
+ ret = bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
return wb_flush_one_slowpath(trans, iter, wb);
}
+ EBUG_ON(!bpos_eq(wb->k.k.p, path->pos));
+
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
return 0;
@@ -208,7 +211,7 @@ btree_write_buffered_insert(struct btree_trans *trans,
trans->journal_res.seq = wb->journal_seq;
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &wb->k,
BTREE_UPDATE_internal_snapshot_node);
bch2_trans_iter_exit(trans, &iter);
@@ -264,12 +267,27 @@ out:
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
+int bch2_btree_write_buffer_insert_err(struct bch_fs *c,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "attempting to do write buffer update on non wb btree=");
+ bch2_btree_id_to_text(&buf, btree);
+ prt_str(&buf, "\n");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -EROFS;
+}
+
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0;
bool write_locked = false;
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
@@ -312,7 +330,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
- BUG_ON(!btree_type_uses_write_buffer(k->btree));
+ if (unlikely(!btree_type_uses_write_buffer(k->btree))) {
+ ret = bch2_btree_write_buffer_insert_err(trans->c, k->btree, &k->k);
+ goto err;
+ }
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
@@ -349,7 +370,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
write_locked = false;
ret = lockrestart_do(trans,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_foreground_maybe_merge(trans, iter.path, 0,
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_journal_reclaim|
@@ -366,13 +387,13 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
BTREE_ITER_intent|BTREE_ITER_all_snapshots);
}
- bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ bch2_btree_iter_set_pos(trans, &iter, k->k.k.p);
btree_iter_path(trans, &iter)->preserve = false;
bool accounting_accumulated = false;
do {
if (race_fault()) {
- ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
break;
}
@@ -409,10 +430,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
- sort(wb->flushing.keys.data,
- wb->flushing.keys.nr,
- sizeof(wb->flushing.keys.data[0]),
- wb_key_seq_cmp, NULL);
+ sort_nonatomic(wb->flushing.keys.data,
+ wb->flushing.keys.nr,
+ sizeof(wb->flushing.keys.data[0]),
+ wb_key_seq_cmp, NULL);
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
@@ -610,11 +631,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
- return -BCH_ERR_erofs_no_writes;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
+ return bch_err_throw(c, erofs_no_writes);
int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
return ret;
}
@@ -654,7 +675,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans,
goto err;
bch2_bkey_buf_copy(last_flushed, c, tmp.k);
- ret = -BCH_ERR_transaction_restart_write_buffer_flush;
+
+ /* can we avoid the unconditional restart? */
+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_);
+ ret = bch_err_throw(c, transaction_restart_write_buffer_flush);
}
err:
bch2_bkey_buf_exit(&tmp, c);
@@ -673,7 +697,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
} while (!ret && bch2_btree_write_buffer_should_flush(c));
mutex_unlock(&wb->flushing.lock);
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
}
static void wb_accounting_sort(struct btree_write_buffer *wb)
@@ -802,9 +826,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
if (bch2_btree_write_buffer_should_flush(c) &&
- __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) &&
!queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
@@ -847,13 +871,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
darray_exit(&wb->inc.keys);
}
-int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
mutex_init(&wb->inc.lock);
mutex_init(&wb->flushing.lock);
INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
+}
+
+int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
/* Will be resized by journal as needed: */
unsigned initial_size = 1 << 16;
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index d535cea28bde..c351d21aca0b 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -89,6 +89,12 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
+ if (unlikely(!btree_type_uses_write_buffer(btree))) {
+ int ret = bch2_btree_write_buffer_insert_err(c, btree, k);
+ dump_stack();
+ return ret;
+ }
+
EBUG_ON(!dst->seq);
return k->k.type == KEY_TYPE_accounting
@@ -101,6 +107,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
+void bch2_fs_btree_write_buffer_init_early(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 345b117a4a4a..f25903c10e8a 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -30,8 +30,15 @@
void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage)
{
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets);
+}
+
+void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage)
+{
memset(usage, 0, sizeof(*usage));
- acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s());
+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage,
+ sizeof(struct bch_dev_usage_full) / sizeof(u64));
}
static u64 reserve_factor(u64 r)
@@ -75,7 +82,7 @@ bch2_fs_usage_read_short(struct bch_fs *c)
void bch2_dev_usage_to_text(struct printbuf *out,
struct bch_dev *ca,
- struct bch_dev_usage *usage)
+ struct bch_dev_usage_full *usage)
{
if (out->nr_tabstops < 5) {
printbuf_tabstops_reset(out);
@@ -149,10 +156,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
g->gen_valid = true;
g->gen = p.ptr.gen;
} else {
+ /* this pointer will be dropped */
*do_update = true;
+ goto out;
}
}
+ /* g->gen_valid == true */
+
if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
trans, ptr_gen_newer_than_bucket_gen,
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
@@ -165,15 +176,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
if (!p.ptr.cached &&
(g->data_type != BCH_DATA_btree ||
data_type == BCH_DATA_btree)) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
- g->data_type = 0;
+ g->data_type = data_type;
g->stripe_sectors = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
- } else {
- *do_update = true;
}
+
+ *do_update = true;
}
if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
@@ -210,9 +219,22 @@ static int bch2_check_fix_ptr(struct btree_trans *trans,
bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- if (data_type == BCH_DATA_btree) {
- g->gen_valid = true;
- g->gen = p.ptr.gen;
+ if (!p.ptr.cached &&
+ data_type == BCH_DATA_btree) {
+ switch (g->data_type) {
+ case BCH_DATA_sb:
+ bch_err(c, "btree and superblock in the same bucket - cannot repair");
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
+ goto out;
+ case BCH_DATA_journal:
+ ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr));
+ bch_err_msg(c, ret, "error deleting journal bucket %zu",
+ PTR_BUCKET_NR(ca, &p.ptr));
+ if (ret)
+ goto out;
+ break;
+ }
+
g->data_type = data_type;
g->stripe_sectors = 0;
g->dirty_sectors = 0;
@@ -262,6 +284,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
struct printbuf buf = PRINTBUF;
int ret = 0;
+ /* We don't yet do btree key updates correctly for when we're RW */
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) {
ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
if (ret)
@@ -269,20 +294,13 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
}
if (do_update) {
- if (flags & BTREE_TRIGGER_is_root) {
- bch_err(c, "cannot update btree roots yet");
- ret = -EINVAL;
- goto err;
- }
-
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto err;
- rcu_read_lock();
- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
if (level) {
/*
@@ -291,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
* sort it out:
*/
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
- rcu_read_lock();
- bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-
- ptr->gen = g->gen;
- }
- rcu_read_unlock();
+ scoped_guard(rcu)
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
+ }
} else {
struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
@@ -362,25 +377,80 @@ found:
bch_info(c, "new key %s", buf.buf);
}
- struct btree_iter iter;
- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update(trans, &iter, new,
- BTREE_UPDATE_internal_snapshot_node|
- BTREE_TRIGGER_norun);
- bch2_trans_iter_exit(trans, &iter);
- if (ret)
- goto err;
+ if (!(flags & BTREE_TRIGGER_is_root)) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
+ bch2_trans_update(trans, &iter, new,
+ BTREE_UPDATE_internal_snapshot_node|
+ BTREE_TRIGGER_norun);
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
- if (level)
- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ if (level)
+ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
+ } else {
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
+ jset_u64s(new->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ goto err;
+
+ journal_entry_set(e,
+ BCH_JSET_ENTRY_btree_root,
+ btree, level - 1,
+ new, new->k.u64s);
+
+ /*
+ * no locking, we're single threaded and not rw yet, see
+ * the big assertino above that we repeat here:
+ */
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
+
+ struct btree *b = bch2_btree_id_root(c, btree)->b;
+ bkey_copy(&b->key, new);
+ }
}
err:
printbuf_exit(&buf);
return ret;
}
+static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf,
+ struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
+{
+ struct bch_fs *c = trans->c;
+
+ prt_printf(buf, "\nwhile marking ");
+ bch2_bkey_val_to_text(buf, c, k);
+ prt_newline(buf);
+
+ bool print = __bch2_count_fsck_err(c, id, buf);
+
+ int ret = bch2_run_explicit_recovery_pass(c, buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+
+ if (insert) {
+ bch2_trans_updates_to_text(buf, trans);
+ __bch2_inconsistent_error(c, buf);
+ /*
+ * If we're in recovery, run_explicit_recovery_pass might give
+ * us an error code for rewinding recovery
+ */
+ if (!ret)
+ ret = bch_err_throw(c, bucket_ref_update);
+ } else {
+ /* Always ignore overwrite errors, so that deletion works */
+ ret = 0;
+ }
+
+ if (print || insert)
+ bch2_print_str(c, KERN_ERR, buf->buf);
+ return ret;
+}
+
int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
@@ -396,32 +466,29 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
BUG_ON(!sectors);
- if (gen_after(ptr->gen, b_gen)) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, ptr_gen_newer_than_bucket_gen,
- "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
- "while marking %s",
+ if (unlikely(gen_after(ptr->gen, b_gen))) {
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
+ ptr->gen);
+
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
+ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen);
goto out;
}
- if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, ptr_too_stale,
- "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
- "while marking %s",
+ if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) {
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf,
+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
+ ptr->gen);
+
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
+ BCH_FSCK_ERR_ptr_too_stale);
goto out;
}
@@ -430,62 +497,50 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca,
goto out;
}
- if (b_gen != ptr->gen) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, stale_dirty_ptr,
- "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
- "while marking %s",
+ if (unlikely(b_gen != ptr->gen)) {
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf,
+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)",
ptr->dev, bucket_nr, b_gen,
bucket_gen_get(ca, bucket_nr),
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- ptr->gen,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
+ ptr->gen);
+
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
+ BCH_FSCK_ERR_stale_dirty_ptr);
goto out;
}
- if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, ptr_bucket_data_type_mismatch,
- "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- ptr->dev, bucket_nr, b_gen,
- bch2_data_type_str(bucket_data_type),
- bch2_data_type_str(ptr_data_type),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
+ if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) {
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s",
+ ptr->dev, bucket_nr, b_gen,
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type));
+
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
+ BCH_FSCK_ERR_ptr_bucket_data_type_mismatch);
goto out;
}
- if ((u64) *bucket_sectors + sectors > U32_MAX) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, bucket_sector_count_overflow,
- "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
- "while marking %s",
+ if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) {
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf,
+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX",
ptr->dev, bucket_nr, b_gen,
bch2_data_type_str(bucket_data_type ?: ptr_data_type),
- *bucket_sectors, sectors,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- if (inserting)
- goto err;
+ *bucket_sectors, sectors);
+
+ ret = bucket_ref_update_err(trans, &buf, k, inserting,
+ BCH_FSCK_ERR_bucket_sector_count_overflow);
sectors = -*bucket_sectors;
+ goto out;
}
*bucket_sectors += sectors;
out:
printbuf_exit(&buf);
return ret;
-err:
-fsck_err:
- bch2_dump_trans_updates(trans);
- bch2_inconsistent_error(c);
- ret = -BCH_ERR_bucket_ref_update;
- goto out;
}
void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
@@ -577,11 +632,18 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev);
if (unlikely(!ca)) {
if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID)
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
goto err;
}
struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
+ if (!bucket_valid(ca, bucket.offset)) {
+ if (insert) {
+ bch2_dev_bucket_missing(ca, bucket.offset);
+ ret = bch_err_throw(c, trigger_pointer);
+ }
+ goto err;
+ }
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
@@ -590,11 +652,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (ret)
goto err;
- if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
- if (ret)
- goto err;
- }
+ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert);
+ if (ret)
+ goto err;
}
if (flags & BTREE_TRIGGER_gc) {
@@ -602,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
p.ptr.dev,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -BCH_ERR_trigger_pointer;
+ ret = bch_err_throw(c, trigger_pointer);
goto err;
}
@@ -628,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
s64 sectors,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+
if (flags & BTREE_TRIGGER_transactional) {
struct btree_iter iter;
struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
@@ -645,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"stripe pointer doesn't match stripe %llu",
(u64) p.ec.idx);
- ret = -BCH_ERR_trigger_stripe_pointer;
+ ret = bch_err_throw(c, trigger_stripe_pointer);
goto err;
}
@@ -653,9 +715,9 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
stripe_blockcount_get(&s->v, p.ec.block) +
sectors);
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = data_type;
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
@@ -665,35 +727,35 @@ err:
}
if (flags & BTREE_TRIGGER_gc) {
- struct bch_fs *c = trans->c;
-
struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
if (!m) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu",
(u64) p.ec.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ return bch_err_throw(c, ENOMEM_mark_stripe_ptr);
}
- mutex_lock(&c->ec_stripes_heap_lock);
+ gc_stripe_lock(m);
if (!m || !m->alive) {
- mutex_unlock(&c->ec_stripes_heap_lock);
+ gc_stripe_unlock(m);
struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ",
+ (u64) p.ec.idx);
bch2_bkey_val_to_text(&buf, c, k);
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
- (u64) p.ec.idx, buf.buf);
+ __bch2_inconsistent_error(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
- bch2_inconsistent_error(c);
- return -BCH_ERR_trigger_stripe_pointer;
+ return bch_err_throw(c, trigger_stripe_pointer);
}
m->block_sectors[p.ec.block] += sectors;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
- memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e));
- mutex_unlock(&c->ec_stripes_heap_lock);
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
+ unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA");
+ gc_stripe_unlock(m);
acc.replicas.data_type = data_type;
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true);
@@ -707,8 +769,7 @@ err:
static int __trigger_extent(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
struct bkey_s_c k,
- enum btree_iter_update_trigger_flags flags,
- s64 *replicas_sectors)
+ enum btree_iter_update_trigger_flags flags)
{
bool gc = flags & BTREE_TRIGGER_gc;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -719,16 +780,16 @@ static int __trigger_extent(struct btree_trans *trans,
: BCH_DATA_user;
int ret = 0;
- struct disk_accounting_pos acc_replicas_key = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- .replicas.data_type = data_type,
- .replicas.nr_devs = 0,
- .replicas.nr_required = 1,
- };
+ s64 replicas_sectors = 0;
+
+ struct disk_accounting_pos acc_replicas_key;
+ memset(&acc_replicas_key, 0, sizeof(acc_replicas_key));
+ acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas;
+ acc_replicas_key.replicas.data_type = data_type;
+ acc_replicas_key.replicas.nr_devs = 0;
+ acc_replicas_key.replicas.nr_required = 1;
- struct disk_accounting_pos acct_compression_key = {
- .type = BCH_DISK_ACCOUNTING_compression,
- };
+ unsigned cur_compression_type = 0;
u64 compression_acct[3] = { 1, 0, 0 };
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@@ -747,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans,
if (ret)
return ret;
} else if (!p.has_ec) {
- *replicas_sectors += disk_sectors;
+ replicas_sectors += disk_sectors;
replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev);
} else {
ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
@@ -762,13 +823,13 @@ static int __trigger_extent(struct btree_trans *trans,
acc_replicas_key.replicas.nr_required = 0;
}
- if (acct_compression_key.compression.type &&
- acct_compression_key.compression.type != p.crc.compression_type) {
+ if (cur_compression_type &&
+ cur_compression_type != p.crc.compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
@@ -777,7 +838,7 @@ static int __trigger_extent(struct btree_trans *trans,
compression_acct[2] = 0;
}
- acct_compression_key.compression.type = p.crc.compression_type;
+ cur_compression_type = p.crc.compression_type;
if (p.crc.compression_type) {
compression_acct[1] += p.crc.uncompressed_size;
compression_acct[2] += p.crc.compressed_size;
@@ -785,51 +846,40 @@ static int __trigger_extent(struct btree_trans *trans,
}
if (acc_replicas_key.replicas.nr_devs) {
- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc);
if (ret)
return ret;
}
if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) {
- struct disk_accounting_pos acc_snapshot_key = {
- .type = BCH_DISK_ACCOUNTING_snapshot,
- .snapshot.id = k.k->p.snapshot,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot);
if (ret)
return ret;
}
- if (acct_compression_key.compression.type) {
+ if (cur_compression_type) {
if (flags & BTREE_TRIGGER_overwrite)
bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct));
- ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct,
- ARRAY_SIZE(compression_acct), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, compression_acct,
+ compression, cur_compression_type);
if (ret)
return ret;
}
if (level) {
- struct disk_accounting_pos acc_btree_key = {
- .type = BCH_DISK_ACCOUNTING_btree,
- .btree.id = btree_id,
- };
- ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc);
+ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id);
if (ret)
return ret;
} else {
bool insert = !(flags & BTREE_TRIGGER_overwrite);
- struct disk_accounting_pos acc_inum_key = {
- .type = BCH_DISK_ACCOUNTING_inum,
- .inum.inum = k.k->p.inode,
- };
+
s64 v[3] = {
insert ? 1 : -1,
insert ? k.k->size : -((s64) k.k->size),
- *replicas_sectors,
+ replicas_sectors,
};
- ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc);
+ ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode);
if (ret)
return ret;
}
@@ -859,34 +909,30 @@ int bch2_trigger_extent(struct btree_trans *trans,
return 0;
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 old_replicas_sectors = 0, new_replicas_sectors = 0;
-
if (old.k->type) {
int ret = __trigger_extent(trans, btree, level, old,
- flags & ~BTREE_TRIGGER_insert,
- &old_replicas_sectors);
+ flags & ~BTREE_TRIGGER_insert);
if (ret)
return ret;
}
if (new.k->type) {
int ret = __trigger_extent(trans, btree, level, new.s_c,
- flags & ~BTREE_TRIGGER_overwrite,
- &new_replicas_sectors);
+ flags & ~BTREE_TRIGGER_overwrite);
if (ret)
return ret;
}
int need_rebalance_delta = 0;
- s64 need_rebalance_sectors_delta = 0;
+ s64 need_rebalance_sectors_delta[1] = { 0 };
s64 s = bch2_bkey_sectors_need_rebalance(c, old);
need_rebalance_delta -= s != 0;
- need_rebalance_sectors_delta -= s;
+ need_rebalance_sectors_delta[0] -= s;
s = bch2_bkey_sectors_need_rebalance(c, new.s_c);
need_rebalance_delta += s != 0;
- need_rebalance_sectors_delta += s;
+ need_rebalance_sectors_delta[0] += s;
if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) {
int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
@@ -895,12 +941,9 @@ int bch2_trigger_extent(struct btree_trans *trans,
return ret;
}
- if (need_rebalance_sectors_delta) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_rebalance_work,
- };
- int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1,
- flags & BTREE_TRIGGER_gc);
+ if (need_rebalance_sectors_delta[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc,
+ need_rebalance_sectors_delta, rebalance_work);
if (ret)
return ret;
}
@@ -916,17 +959,13 @@ static int __trigger_reservation(struct btree_trans *trans,
enum btree_iter_update_trigger_flags flags)
{
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
- s64 sectors = k.k->size;
+ s64 sectors[1] = { k.k->size };
if (flags & BTREE_TRIGGER_overwrite)
- sectors = -sectors;
+ sectors[0] = -sectors[0];
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
- .persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas,
- };
-
- return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc);
+ return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors,
+ persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas);
}
return 0;
@@ -957,15 +996,25 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
return PTR_ERR(a);
if (a->v.data_type && type && a->v.data_type != type) {
- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
- log_fsck_err(trans, bucket_metadata_type_mismatch,
- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
- "while marking %s",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_type_str(a->v.data_type),
- bch2_data_type_str(type),
- bch2_data_type_str(type));
- ret = -BCH_ERR_metadata_bucket_inconsistency;
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+ "while marking %s\n",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
+
+ bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
+
+ ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+
+ /* Always print, this is always fatal */
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ if (!ret)
+ ret = bch_err_throw(c, metadata_bucket_inconsistency);
goto err;
}
@@ -976,7 +1025,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
}
err:
-fsck_err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -1019,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *
err_unlock:
bucket_unlock(g);
err:
- return -BCH_ERR_metadata_bucket_inconsistency;
+ return bch_err_throw(c, metadata_bucket_inconsistency);
}
int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
@@ -1134,10 +1182,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca,
int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c,
enum btree_iter_update_trigger_flags flags)
{
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) {
int ret = bch2_trans_mark_dev_sb(c, ca, flags);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs);
return ret;
}
}
@@ -1234,7 +1282,7 @@ recalculate:
ret = 0;
} else {
atomic64_set(&c->sectors_available, sectors_available);
- ret = -BCH_ERR_ENOSPC_disk_reservation;
+ ret = bch_err_throw(c, ENOSPC_disk_reservation);
}
mutex_unlock(&c->sectors_available_lock);
@@ -1263,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c)
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets_nouse) {
bch2_dev_put(ca);
- return -BCH_ERR_ENOMEM_buckets_nouse;
+ return bch_err_throw(c, ENOMEM_buckets_nouse);
}
}
@@ -1288,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
lockdep_assert_held(&c->state_lock);
if (resize && ca->buckets_nouse)
- return -BCH_ERR_no_resize_with_buckets_nouse;
+ return bch_err_throw(c, no_resize_with_buckets_nouse);
bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets),
GFP_KERNEL|__GFP_ZERO);
if (!bucket_gens) {
- ret = -BCH_ERR_ENOMEM_bucket_gens;
+ ret = bch_err_throw(c, ENOMEM_bucket_gens);
goto err;
}
@@ -1305,15 +1353,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
- bucket_gens->nbuckets = min(bucket_gens->nbuckets,
- old_bucket_gens->nbuckets);
- bucket_gens->nbuckets_minus_first =
- bucket_gens->nbuckets - bucket_gens->first_bucket;
+ u64 copy = min(bucket_gens->nbuckets,
+ old_bucket_gens->nbuckets);
memcpy(bucket_gens->b,
old_bucket_gens->b,
- bucket_gens->nbuckets);
+ sizeof(bucket_gens->b[0]) * copy);
}
+ ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch,
+ ca->mi.nbuckets, nbuckets) ?:
+ bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty,
+ ca->mi.nbuckets, nbuckets);
+
rcu_assign_pointer(ca->bucket_gens, bucket_gens);
bucket_gens = old_bucket_gens;
@@ -1336,9 +1387,9 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
- ca->usage = alloc_percpu(struct bch_dev_usage);
+ ca->usage = alloc_percpu(struct bch_dev_usage_full);
if (!ca->usage)
- return -BCH_ERR_ENOMEM_usage_init;
+ return bch_err_throw(c, ENOMEM_usage_init);
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index a9acdd6c0c86..49a3807a5eab 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -39,38 +39,12 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
-/*
- * Ugly hack alert:
- *
- * We need to cram a spinlock in a single byte, because that's what we have left
- * in struct bucket, and we care about the size of these - during fsck, we need
- * in memory state for every single bucket on every device.
- *
- * We used to do
- * while (xchg(&b->lock, 1) cpu_relax();
- * but, it turns out not all architectures support xchg on a single byte.
- *
- * So now we use bit_spin_lock(), with fun games since we can't burn a whole
- * ulong for this - we just need to make sure the lock bit always ends up in the
- * first byte.
- */
-
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define BUCKET_LOCK_BITNR 0
-#else
-#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
-#endif
-
-union ulong_byte_assert {
- ulong ulong;
- u8 byte;
-};
-
static inline void bucket_unlock(struct bucket *b)
{
BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+ smp_mb__after_atomic();
wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
}
@@ -110,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
{
- rcu_read_lock();
- int ret = bucket_gen_get_rcu(ca, b);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return bucket_gen_get_rcu(ca, b);
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -167,9 +139,7 @@ static inline int gen_cmp(u8 a, u8 b)
static inline int gen_after(u8 a, u8 b)
{
- int r = gen_cmp(a, b);
-
- return r > 0 ? r : 0;
+ return max(0, gen_cmp(a, b));
}
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
@@ -184,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_
*/
static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- rcu_read_lock();
- int ret = dev_ptr_stale_rcu(ca, ptr);
- rcu_read_unlock();
- return ret;
+ guard(rcu)();
+ return dev_ptr_stale_rcu(ca, ptr);
}
/* Device usage: */
@@ -201,7 +169,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
return ret;
}
-void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *);
+void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *);
+static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca)
+{
+ struct bch_dev_usage_full ret;
+
+ bch2_dev_usage_full_read_fast(ca, &ret);
+ return ret;
+}
+
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -236,7 +213,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca,
enum bch_watermark watermark)
{
return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets -
+ usage.buckets[BCH_DATA_free]-
ca->nr_open_buckets -
bch2_dev_buckets_reserved(ca, watermark));
}
@@ -246,10 +223,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
enum bch_watermark watermark)
{
return max_t(s64, 0,
- usage.d[BCH_DATA_free].buckets
- + usage.d[BCH_DATA_cached].buckets
- + usage.d[BCH_DATA_need_gc_gens].buckets
- + usage.d[BCH_DATA_need_discard].buckets
+ usage.buckets[BCH_DATA_free]
+ + usage.buckets[BCH_DATA_cached]
+ + usage.buckets[BCH_DATA_need_gc_gens]
+ + usage.buckets[BCH_DATA_need_discard]
- ca->nr_open_buckets
- bch2_dev_buckets_reserved(ca, watermark));
}
@@ -262,11 +239,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca,
/* Filesystem usage: */
-static inline unsigned dev_usage_u64s(void)
-{
- return sizeof(struct bch_dev_usage) / sizeof(u64);
-}
-
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 7174047b8e92..0aed2500ade3 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -7,6 +7,33 @@
#define BUCKET_JOURNAL_SEQ_BITS 16
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ * while (xchg(&b->lock, 1) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR 0
+#else
+#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+ ulong ulong;
+ u8 byte;
+};
+
struct bucket {
u8 lock;
u8 gen_valid:1;
@@ -27,7 +54,12 @@ struct bucket_gens {
u8 b[] __counted_by(nbuckets);
};
+/* Only info on bucket countns: */
struct bch_dev_usage {
+ u64 buckets[BCH_DATA_NR];
+};
+
+struct bch_dev_usage_full {
struct bch_dev_usage_type {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c
index f9fb150eda70..832eff93acb6 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.c
+++ b/fs/bcachefs/buckets_waiting_for_journal.c
@@ -22,23 +22,21 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_
memset(t->d, 0, sizeof(t->d[0]) << t->bits);
}
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
- u64 flushed_seq,
- unsigned dev, u64 bucket)
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b,
+ unsigned dev, u64 bucket)
{
struct buckets_waiting_for_journal_table *t;
u64 dev_bucket = (u64) dev << 56 | bucket;
- bool ret = false;
- unsigned i;
+ u64 ret = 0;
mutex_lock(&b->lock);
t = b->t;
- for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+ for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
if (h->dev_bucket == dev_bucket) {
- ret = h->journal_seq > flushed_seq;
+ ret = h->journal_seq;
break;
}
}
@@ -110,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
realloc:
n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL);
if (!n) {
- ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set;
+ struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal);
+ ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set);
goto out;
}
diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h
index d2ae19cbe18c..365619ca44c8 100644
--- a/fs/bcachefs/buckets_waiting_for_journal.h
+++ b/fs/bcachefs/buckets_waiting_for_journal.h
@@ -4,8 +4,8 @@
#include "buckets_waiting_for_journal_types.h"
-bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
- u64, unsigned, u64);
+u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *,
+ unsigned, u64);
int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
u64, unsigned, u64, u64);
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 46e9e32105a9..5ea89aa2b0c4 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -11,6 +11,7 @@
#include "move.h"
#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-counters.h"
#include "super-io.h"
#include "thread_with_file.h"
@@ -312,7 +313,13 @@ static int bch2_data_thread(void *arg)
struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
- ctx->stats.data_type = U8_MAX;
+ if (ctx->thr.ret == -BCH_ERR_device_offline)
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline;
+ else {
+ ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done;
+ ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done;
+ }
+ enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data);
return 0;
}
@@ -331,14 +338,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
- .type = BCH_DATA_EVENT_PROGRESS,
- .p.data_type = ctx->stats.data_type,
- .p.btree_id = ctx->stats.pos.btree,
- .p.pos = ctx->stats.pos.pos,
- .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
- .p.sectors_total = bch2_fs_usage_read_short(c).used,
+ .type = BCH_DATA_EVENT_PROGRESS,
+ .ret = ctx->stats.ret,
+ .p.data_type = ctx->stats.data_type,
+ .p.btree_id = ctx->stats.pos.btree,
+ .p.pos = ctx->stats.pos.pos,
+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
+ .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected),
+ .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected),
};
+ if (ctx->arg.op == BCH_DATA_OP_scrub) {
+ struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev);
+ if (ca) {
+ struct bch_dev_usage_full u;
+ bch2_dev_usage_full_read_fast(ca, &u);
+ for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++)
+ if (ctx->arg.scrub.data_types & BIT(i))
+ e.p.sectors_total += u.d[i].sectors;
+ bch2_dev_put(ca);
+ }
+ } else {
+ e.p.sectors_total = bch2_fs_usage_read_short(c).used;
+ }
+
if (len < sizeof(e))
return -EINVAL;
@@ -356,15 +379,24 @@ static long bch2_ioctl_data(struct bch_fs *c,
struct bch_data_ctx *ctx;
int ret;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data))
+ return -EROFS;
- if (arg.op >= BCH_DATA_OP_NR || arg.flags)
- return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto put_ref;
+ }
+
+ if (arg.op >= BCH_DATA_OP_NR || arg.flags) {
+ ret = -EINVAL;
+ goto put_ref;
+ }
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto put_ref;
+ }
ctx->c = c;
ctx->arg = arg;
@@ -373,11 +405,16 @@ static long bch2_ioctl_data(struct bch_fs *c,
&bcachefs_data_ops,
bch2_data_thread);
if (ret < 0)
- kfree(ctx);
+ goto cleanup;
+ return ret;
+cleanup:
+ kfree(ctx);
+put_ref:
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data);
return ret;
}
-static long bch2_ioctl_fs_usage(struct bch_fs *c,
+static noinline_for_stack long bch2_ioctl_fs_usage(struct bch_fs *c,
struct bch_ioctl_fs_usage __user *user_arg)
{
struct bch_ioctl_fs_usage arg = {};
@@ -404,10 +441,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
arg.replica_entries_bytes = replicas.nr;
for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
- struct disk_accounting_pos k = {
- .type = BCH_DISK_ACCOUNTING_persistent_reserved,
- .persistent_reserved.nr_replicas = i,
- };
+ struct disk_accounting_pos k;
+ disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i);
bch2_accounting_mem_read(c,
disk_accounting_pos_to_bpos(&k),
@@ -449,11 +484,11 @@ err:
}
/* obsolete, didn't allow for new data types: */
-static long bch2_ioctl_dev_usage(struct bch_fs *c,
+static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
struct bch_ioctl_dev_usage arg;
- struct bch_dev_usage src;
+ struct bch_dev_usage_full src;
struct bch_dev *ca;
unsigned i;
@@ -473,7 +508,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(ca);
+ src = bch2_dev_usage_full_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
@@ -494,7 +529,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
struct bch_ioctl_dev_usage_v2 __user *user_arg)
{
struct bch_ioctl_dev_usage_v2 arg;
- struct bch_dev_usage src;
+ struct bch_dev_usage_full src;
struct bch_dev *ca;
int ret = 0;
@@ -514,7 +549,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
if (IS_ERR(ca))
return PTR_ERR(ca);
- src = bch2_dev_usage_read(ca);
+ src = bch2_dev_usage_full_read(ca);
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
@@ -593,13 +628,12 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!dev)
return -EINVAL;
- for_each_online_member(c, ca)
- if (ca->dev == dev) {
- percpu_ref_put(&ca->io_ref);
+ guard(rcu)();
+ for_each_online_member_rcu(c, ca)
+ if (ca->dev == dev)
return ca->dev_idx;
- }
- return -BCH_ERR_ENOENT_dev_idx_not_found;
+ return bch_err_throw(c, ENOENT_dev_idx_not_found);
}
static long bch2_ioctl_disk_resize(struct bch_fs *c,
@@ -710,6 +744,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
case BCH_IOCTL_QUERY_ACCOUNTING:
return bch2_ioctl_query_accounting(c, arg);
+ case BCH_IOCTL_QUERY_COUNTERS:
+ return bch2_ioctl_query_counters(c, arg);
default:
return -ENOTTY;
}
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 23a383577d4c..a6795e73f0b9 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -7,17 +7,12 @@
#include "super-io.h"
#include <linux/crc32c.h>
-#include <linux/crypto.h>
#include <linux/xxhash.h>
#include <linux/key.h>
#include <linux/random.h>
#include <linux/ratelimit.h>
-#include <linux/scatterlist.h>
-#include <crypto/algapi.h>
#include <crypto/chacha.h>
-#include <crypto/hash.h>
#include <crypto/poly1305.h>
-#include <crypto/skcipher.h>
#include <keys/user-type.h>
/*
@@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void *
}
}
-static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- struct scatterlist *sg, size_t len)
+static void bch2_chacha20_init(struct chacha_state *state,
+ const struct bch_key *key, struct nonce nonce)
{
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)];
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+ BUILD_BUG_ON(sizeof(key_words) != sizeof(*key));
+ memcpy(key_words, key, sizeof(key_words));
+ le32_to_cpu_array(key_words, ARRAY_SIZE(key_words));
- int ret = crypto_skcipher_encrypt(req);
- if (ret)
- pr_err("got error %i from crypto_skcipher_encrypt()", ret);
-
- return ret;
-}
-
-static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
- struct nonce nonce,
- void *buf, size_t len)
-{
- if (!is_vmalloc_addr(buf)) {
- struct scatterlist sg = {};
-
- sg_mark_end(&sg);
- sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf));
- return do_encrypt_sg(tfm, nonce, &sg, len);
- } else {
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
- int ret;
-
- darray_init(&sgl);
-
- while (len) {
- unsigned offset = offset_in_page(buf);
- struct scatterlist sg = {
- .page_link = (unsigned long) vmalloc_to_page(buf),
- .offset = offset,
- .length = min(len, PAGE_SIZE - offset),
- };
-
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
- BUG_ON(darray_push(&sgl, sg));
- }
-
- buf += sg.length;
- len -= sg.length;
- sgl_len += sg.length;
- }
+ BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE);
+ chacha_init(state, key_words, (const u8 *)nonce.d);
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
- return ret;
- }
+ memzero_explicit(key_words, sizeof(key_words));
}
-int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
- void *buf, size_t len)
+void bch2_chacha20(const struct bch_key *key, struct nonce nonce,
+ void *data, size_t len)
{
- struct crypto_sync_skcipher *chacha20 =
- crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret;
-
- ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
- return ret;
- }
-
- ret = crypto_skcipher_setkey(&chacha20->base,
- (void *) key, sizeof(*key));
- if (ret) {
- pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
- goto err;
- }
+ struct chacha_state state;
- ret = do_encrypt(chacha20, nonce, buf, len);
-err:
- crypto_free_sync_skcipher(chacha20);
- return ret;
+ bch2_chacha20_init(&state, key, nonce);
+ chacha20_crypt(&state, data, data, len);
+ chacha_zeroize_state(&state);
}
-static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
- struct nonce nonce)
+static void bch2_poly1305_init(struct poly1305_desc_ctx *desc,
+ struct bch_fs *c, struct nonce nonce)
{
- u8 key[POLY1305_KEY_SIZE];
- int ret;
+ u8 key[POLY1305_KEY_SIZE] = { 0 };
nonce.d[3] ^= BCH_NONCE_POLY;
- memset(key, 0, sizeof(key));
- ret = do_encrypt(c->chacha20, nonce, key, sizeof(key));
- if (ret)
- return ret;
-
- desc->tfm = c->poly1305;
- crypto_shash_init(desc);
- crypto_shash_update(desc, key, sizeof(key));
- return 0;
+ bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key));
+ poly1305_init(desc, key);
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
@@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
case BCH_CSUM_chacha20_poly1305_80:
case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
+ struct poly1305_desc_ctx dctx;
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
- gen_poly_key(c, desc, nonce);
-
- crypto_shash_update(desc, data, len);
- crypto_shash_final(desc, digest);
+ bch2_poly1305_init(&dctx, c, nonce);
+ poly1305_update(&dctx, data, len);
+ poly1305_final(&dctx, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
@@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type,
if (!bch2_csum_type_is_encryption(type))
return 0;
- if (bch2_fs_inconsistent_on(!c->chacha20,
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
+ return bch_err_throw(c, no_encryption_key);
- return do_encrypt(c->chacha20, nonce, data, len);
+ bch2_chacha20(&c->chacha20_key, nonce, data, len);
+ return 0;
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
@@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
case BCH_CSUM_chacha20_poly1305_80:
case BCH_CSUM_chacha20_poly1305_128: {
- SHASH_DESC_ON_STACK(desc, c->poly1305);
+ struct poly1305_desc_ctx dctx;
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
- gen_poly_key(c, desc, nonce);
+ bch2_poly1305_init(&dctx, c, nonce);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
- crypto_shash_update(desc, p, bv.bv_len);
+ poly1305_update(&dctx, p, bv.bv_len);
kunmap_local(p);
}
#else
__bio_for_each_bvec(bv, bio, *iter, *iter)
- crypto_shash_update(desc,
+ poly1305_update(&dctx,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
- crypto_shash_final(desc, digest);
+ poly1305_final(&dctx, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
@@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type,
{
struct bio_vec bv;
struct bvec_iter iter;
- DARRAY_PREALLOCATED(struct scatterlist, 4) sgl;
- size_t sgl_len = 0;
+ struct chacha_state chacha_state;
int ret = 0;
- if (bch2_fs_inconsistent_on(!c->chacha20,
+ if (bch2_fs_inconsistent_on(!c->chacha20_key_set,
c, "attempting to encrypt without encryption key"))
- return -BCH_ERR_no_encryption_key;
+ return bch_err_throw(c, no_encryption_key);
- darray_init(&sgl);
+ bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce);
bio_for_each_segment(bv, bio, iter) {
- struct scatterlist sg = {
- .page_link = (unsigned long) bv.bv_page,
- .offset = bv.bv_offset,
- .length = bv.bv_len,
- };
-
- if (darray_push(&sgl, sg)) {
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
- if (ret)
- goto err;
-
- nonce = nonce_add(nonce, sgl_len);
- sgl_len = 0;
- sgl.nr = 0;
-
- BUG_ON(darray_push(&sgl, sg));
+ void *p;
+
+ /*
+ * chacha_crypt() assumes that the length is a multiple of
+ * CHACHA_BLOCK_SIZE on any non-final call.
+ */
+ if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) {
+ bch_err_ratelimited(c, "bio not aligned for encryption");
+ ret = -EIO;
+ break;
}
- sgl_len += sg.length;
+ p = bvec_kmap_local(&bv);
+ chacha20_crypt(&chacha_state, p, p, bv.bv_len);
+ kunmap_local(p);
}
-
- sg_mark_end(&darray_last(sgl));
- ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len);
-err:
- darray_exit(&sgl);
+ chacha_zeroize_state(&chacha_state);
return ret;
}
@@ -466,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
prt_str(&buf, ")");
WARN_RATELIMIT(1, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return bch_err_throw(c, recompute_checksum);
}
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
@@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
}
/* decrypt real key: */
- ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
- &sb_key, sizeof(sb_key));
- if (ret)
- goto err;
+ bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key));
if (bch2_key_is_encrypted(&sb_key)) {
bch_err(c, "incorrect encryption key");
@@ -668,31 +574,14 @@ err:
return ret;
}
-static int bch2_alloc_ciphers(struct bch_fs *c)
-{
- if (c->chacha20)
- return 0;
-
- struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
- int ret = PTR_ERR_OR_ZERO(chacha20);
- if (ret) {
- bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret));
- return ret;
- }
-
- struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- ret = PTR_ERR_OR_ZERO(poly1305);
- if (ret) {
- bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret));
- crypto_free_sync_skcipher(chacha20);
- return ret;
- }
-
- c->chacha20 = chacha20;
- c->poly1305 = poly1305;
- return 0;
-}
+#if 0
+/*
+ * This seems to be duplicating code in cmd_remove_passphrase() in
+ * bcachefs-tools, but we might want to switch userspace to use this - and
+ * perhaps add an ioctl for calling this at runtime, so we can take the
+ * passphrase off of a mounted filesystem (which has come up).
+ */
int bch2_disable_encryption(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
@@ -725,6 +614,10 @@ out:
return ret;
}
+/*
+ * For enabling encryption on an existing filesystem: not hooked up yet, but it
+ * should be
+ */
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
{
struct bch_encrypted_key key;
@@ -766,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
sizeof(*crypt) / sizeof(u64));
if (!crypt) {
- ret = -BCH_ERR_ENOSPC_sb_crypt;
+ ret = bch_err_throw(c, ENOSPC_sb_crypt);
goto err;
}
@@ -781,48 +674,25 @@ err:
memzero_explicit(&key, sizeof(key));
return ret;
}
+#endif
void bch2_fs_encryption_exit(struct bch_fs *c)
{
- if (c->poly1305)
- crypto_free_shash(c->poly1305);
- if (c->chacha20)
- crypto_free_sync_skcipher(c->chacha20);
- if (c->sha256)
- crypto_free_shash(c->sha256);
+ memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key));
}
int bch2_fs_encryption_init(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
- struct bch_key key;
- int ret = 0;
-
- c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- ret = PTR_ERR_OR_ZERO(c->sha256);
- if (ret) {
- c->sha256 = NULL;
- bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret));
- goto out;
- }
+ int ret;
crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
if (!crypt)
- goto out;
-
- ret = bch2_alloc_ciphers(c);
- if (ret)
- goto out;
-
- ret = bch2_decrypt_sb_key(c, crypt, &key);
- if (ret)
- goto out;
+ return 0;
- ret = crypto_skcipher_setkey(&c->chacha20->base,
- (void *) &key.key, sizeof(key.key));
+ ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key);
if (ret)
- goto out;
-out:
- memzero_explicit(&key, sizeof(key));
- return ret;
+ return ret;
+ c->chacha20_key_set = true;
+ return 0;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 43b9d71f2f2b..7bd9cf6104ca 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -69,7 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
bch2_csum_to_text(out, type, expected);
}
-int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
+void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
+
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
int bch2_revoke_key(struct bch_sb *);
@@ -103,8 +104,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
+#if 0
int bch2_disable_encryption(struct bch_fs *);
int bch2_enable_encryption(struct bch_fs *, bool);
+#endif
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
@@ -154,7 +157,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
if (type >= BCH_CSUM_NR)
return false;
- if (bch2_csum_type_is_encryption(type) && !c->chacha20)
+ if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set)
return false;
return true;
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index 1f8e035d7119..8e9264b5a84e 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
struct io_clock_wait {
struct io_timer io_timer;
- struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
@@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer)
wake_up_process(wait->task);
}
-static void io_clock_cpu_timeout(struct timer_list *timer)
-{
- struct io_clock_wait *wait = container_of(timer,
- struct io_clock_wait, cpu_timer);
-
- wait->expired = 1;
- wake_up_process(wait->task);
-}
-
void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
{
struct io_clock_wait wait = {
@@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until)
bch2_io_timer_del(clock, &wait.io_timer);
}
-void bch2_kthread_io_clock_wait(struct io_clock *clock,
- u64 io_until, unsigned long cpu_timeout)
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock,
+ u64 io_until, unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait = {
@@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
bch2_io_timer_add(clock, &wait.io_timer);
- timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
-
- if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
- mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
-
- do {
- set_current_state(TASK_INTERRUPTIBLE);
- if (kthread && kthread_should_stop())
- break;
-
- if (wait.expired)
- break;
-
- schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!(kthread && kthread_should_stop())) {
+ cpu_timeout = schedule_timeout(cpu_timeout);
try_to_freeze();
- } while (0);
+ }
__set_current_state(TASK_RUNNING);
- del_timer_sync(&wait.cpu_timer);
- destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
+ return cpu_timeout;
+}
+
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+ u64 io_until, unsigned long cpu_timeout)
+{
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
+
+ while (!(kthread && kthread_should_stop()) &&
+ cpu_timeout &&
+ atomic64_read(&clock->now) < io_until)
+ cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout);
}
static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now)
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
index 82c79c8baf92..8769be2aa21e 100644
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
@@ -4,6 +4,7 @@
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long);
void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long);
void __bch2_increment_clock(struct io_clock *, u64);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 114bf2f3879f..b37b1f325f0a 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc.uncompressed_size << 9;
void *workspace;
- int ret;
+ int ret = 0, ret2;
enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type);
mempool_t *workspace_pool = &c->compress_workspace[opt];
@@ -187,9 +187,9 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
__bch2_compression_types[crc.compression_type]))
ret = bch2_check_set_has_compressed_data(c, opt);
else
- ret = -BCH_ERR_compression_workspace_not_initialized;
+ ret = bch_err_throw(c, compression_workspace_not_initialized);
if (ret)
- goto out;
+ goto err;
}
src_data = bio_map_or_bounce(c, src, READ);
@@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
switch (crc.compression_type) {
case BCH_COMPRESSION_TYPE_lz4_old:
case BCH_COMPRESSION_TYPE_lz4:
- ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
- src_len, dst_len, dst_len);
- if (ret != dst_len)
- goto err;
+ ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data,
+ src_len, dst_len, dst_len);
+ if (ret2 != dst_len)
+ ret = bch_err_throw(c, decompress_lz4);
break;
case BCH_COMPRESSION_TYPE_gzip: {
z_stream strm = {
@@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
- ret = zlib_inflate(&strm, Z_FINISH);
+ ret2 = zlib_inflate(&strm, Z_FINISH);
mempool_free(workspace, workspace_pool);
- if (ret != Z_STREAM_END)
- goto err;
+ if (ret2 != Z_STREAM_END)
+ ret = bch_err_throw(c, decompress_gzip);
break;
}
case BCH_COMPRESSION_TYPE_zstd: {
ZSTD_DCtx *ctx;
size_t real_src_len = le32_to_cpup(src_data.b);
- if (real_src_len > src_len - 4)
+ if (real_src_len > src_len - 4) {
+ ret = bch_err_throw(c, decompress_zstd_src_len_bad);
goto err;
+ }
workspace = mempool_alloc(workspace_pool, GFP_NOFS);
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
- ret = zstd_decompress_dctx(ctx,
+ ret2 = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, real_src_len);
mempool_free(workspace, workspace_pool);
- if (ret != dst_len)
- goto err;
+ if (ret2 != dst_len)
+ ret = bch_err_throw(c, decompress_zstd);
break;
}
default:
BUG();
}
- ret = 0;
+err:
fsck_err:
-out:
bio_unmap_or_unbounce(c, src_data);
return ret;
-err:
- ret = -EIO;
- goto out;
}
int bch2_bio_uncompress_inplace(struct bch_write_op *op,
@@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
- if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
- crc->compressed_size << 9 > c->opts.encoded_extent_max) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error rewriting existing data: extent too big");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- return -EIO;
+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) {
+ bch2_write_op_error(op, op->pos.offset,
+ "extent too big to decompress (%u > %u)",
+ crc->uncompressed_size << 9, c->opts.encoded_extent_max);
+ return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
}
data = __bounce_alloc(c, dst_len, WRITE);
- if (__bio_uncompress(c, bio, data.b, *crc)) {
- if (!c->opts.no_data_io) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error rewriting existing data: decompression error");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
- ret = -EIO;
+ ret = __bio_uncompress(c, bio, data.b, *crc);
+
+ if (c->opts.no_data_io)
+ ret = 0;
+
+ if (ret) {
+ bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret));
goto err;
}
@@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max ||
crc.compressed_size << 9 > c->opts.encoded_extent_max)
- return -EIO;
+ return bch_err_throw(c, decompress_exceeded_max_encoded_extent);
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
@@ -378,13 +371,14 @@ static int attempt_compress(struct bch_fs *c,
};
zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm,
+ if (zlib_deflateInit2(&strm,
compression.level
? clamp_t(unsigned, compression.level,
Z_BEST_SPEED, Z_BEST_COMPRESSION)
: Z_DEFAULT_COMPRESSION,
Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
+ Z_DEFAULT_STRATEGY) != Z_OK)
+ return 0;
if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
return 0;
@@ -662,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (!mempool_initialized(&c->compression_bounce[READ]) &&
mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_read_init;
+ return bch_err_throw(c, ENOMEM_compression_bounce_read_init);
if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
1, c->opts.encoded_extent_max))
- return -BCH_ERR_ENOMEM_compression_bounce_write_init;
+ return bch_err_throw(c, ENOMEM_compression_bounce_write_init);
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
@@ -681,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
if (mempool_init_kvmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace))
- return -BCH_ERR_ENOMEM_compression_workspace_init;
+ return bch_err_throw(c, ENOMEM_compression_workspace_init);
}
return 0;
@@ -720,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
ret = match_string(bch2_compression_opts, -1, type_str);
if (ret < 0 && err)
- prt_str(err, "invalid compression type");
+ prt_printf(err, "invalid compression type\n");
if (ret < 0)
goto err;
@@ -735,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res,
if (!ret && level > 15)
ret = -EINVAL;
if (ret < 0 && err)
- prt_str(err, "invalid compression level");
+ prt_printf(err, "invalid compression level\n");
if (ret < 0)
goto err;
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index c6151495985f..4080ee99aadd 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -8,6 +8,7 @@
* Inspired by CCAN's darray
*/
+#include <linux/cleanup.h>
#include <linux/slab.h>
#define DARRAY_PREALLOCATED(_type, _nr) \
@@ -20,7 +21,18 @@ struct { \
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
-typedef DARRAY(char *) darray_str;
+typedef DARRAY(char *) darray_str;
+typedef DARRAY(const char *) darray_const_str;
+
+typedef DARRAY(u8) darray_u8;
+typedef DARRAY(u16) darray_u16;
+typedef DARRAY(u32) darray_u32;
+typedef DARRAY(u64) darray_u64;
+
+typedef DARRAY(s8) darray_s8;
+typedef DARRAY(s16) darray_s16;
+typedef DARRAY(s32) darray_s32;
+typedef DARRAY(s64) darray_s64;
int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
@@ -76,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
-#define __darray_for_each(_d, _i) \
+#define darray_find_p(_d, _i, cond) \
+({ \
+ typeof((_d).data) _ret = NULL; \
+ \
+ darray_for_each(_d, _i) \
+ if (cond) { \
+ _ret = _i; \
+ break; \
+ } \
+ _ret; \
+})
+
+#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item)
+
+/* Iteration: */
+
+#define __darray_for_each(_d, _i) \
for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each(_d, _i) \
@@ -85,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t);
#define darray_for_each_reverse(_d, _i) \
for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i)
+/* Init/exit */
+
#define darray_init(_d) \
do { \
(_d)->nr = 0; \
@@ -100,4 +130,29 @@ do { \
darray_init(_d); \
} while (0)
+#define DEFINE_DARRAY_CLASS(_type) \
+DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void)
+
+#define DEFINE_DARRAY(_type) \
+typedef DARRAY(_type) darray_##_type; \
+DEFINE_DARRAY_CLASS(darray_##_type)
+
+#define DEFINE_DARRAY_NAMED(_name, _type) \
+typedef DARRAY(_type) _name; \
+DEFINE_DARRAY_CLASS(_name)
+
+DEFINE_DARRAY_CLASS(darray_char);
+DEFINE_DARRAY_CLASS(darray_str)
+DEFINE_DARRAY_CLASS(darray_const_str)
+
+DEFINE_DARRAY_CLASS(darray_u8)
+DEFINE_DARRAY_CLASS(darray_u16)
+DEFINE_DARRAY_CLASS(darray_u32)
+DEFINE_DARRAY_CLASS(darray_u64)
+
+DEFINE_DARRAY_CLASS(darray_s8)
+DEFINE_DARRAY_CLASS(darray_s16)
+DEFINE_DARRAY_CLASS(darray_s32)
+DEFINE_DARRAY_CLASS(darray_s64)
+
#endif /* _BCACHEFS_DARRAY_H */
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 337494facac6..e848e210a9bf 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -20,6 +20,15 @@
#include "subvolume.h"
#include "trace.h"
+#include <linux/ioprio.h>
+
+static const char * const bch2_data_update_type_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DATA_UPDATE_TYPES()
+#undef x
+ NULL
+};
+
static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -33,7 +42,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- if (!bch2_dev_tryget(c, ptr->dev)) {
+ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
bkey_for_each_ptr(ptrs, ptr2) {
if (ptr2 == ptr)
break;
@@ -57,43 +66,53 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k)
}
}
-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k)
+static noinline_for_stack
+bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs,
+ const struct bch_extent_ptr *start)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (!ctxt) {
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr == start)
+ break;
+
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
+ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
+ }
+ return false;
+ }
- bkey_for_each_ptr(ptrs, ptr) {
+ __bkey_for_each_ptr(start, ptrs.end, ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- if (ctxt) {
- bool locked;
-
- move_ctxt_wait_event(ctxt,
- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
- list_empty(&ctxt->ios));
+ bool locked;
+ move_ctxt_wait_event(ctxt,
+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ||
+ list_empty(&ctxt->ios));
+ if (!locked)
+ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
+ }
+ return true;
+}
- if (!locked)
- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0);
- } else {
- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) {
- bkey_for_each_ptr(ptrs, ptr2) {
- if (ptr2 == ptr)
- break;
+static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs)
+{
+ bkey_for_each_ptr(ptrs, ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
- ca = bch2_dev_have_ref(c, ptr2->dev);
- bucket = PTR_BUCKET_POS(ca, ptr2);
- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
- }
- return false;
- }
- }
+ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0))
+ return __bkey_nocow_lock(c, ctxt, ptrs, ptr);
}
+
return true;
}
-static noinline void trace_move_extent_finish2(struct data_update *u,
- struct bkey_i *new,
- struct bkey_i *insert)
+noinline_for_stack
+static void trace_io_move_finish2(struct data_update *u,
+ struct bkey_i *new,
+ struct bkey_i *insert)
{
struct bch_fs *c = u->op.c;
struct printbuf buf = PRINTBUF;
@@ -111,11 +130,12 @@ static noinline void trace_move_extent_finish2(struct data_update *u,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
prt_newline(&buf);
- trace_move_extent_finish(c, buf.buf);
+ trace_io_move_finish(c, buf.buf);
printbuf_exit(&buf);
}
-static void trace_move_extent_fail2(struct data_update *m,
+noinline_for_stack
+static void trace_io_move_fail2(struct data_update *m,
struct bkey_s_c new,
struct bkey_s_c wrote,
struct bkey_i *insert,
@@ -126,7 +146,7 @@ static void trace_move_extent_fail2(struct data_update *m,
struct printbuf buf = PRINTBUF;
unsigned rewrites_found = 0;
- if (!trace_move_extent_fail_enabled())
+ if (!trace_io_move_fail_enabled())
return;
prt_str(&buf, msg);
@@ -166,8 +186,77 @@ static void trace_move_extent_fail2(struct data_update *m,
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
}
- trace_move_extent_fail(c, buf.buf);
+ trace_io_move_fail(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static void trace_data_update2(struct data_update *m,
+ struct bkey_s_c old, struct bkey_s_c k,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = m->op.c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static void trace_io_move_created_rebalance2(struct data_update *m,
+ struct bkey_s_c old, struct bkey_s_c k,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = m->op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_io_move_created_rebalance(c, buf.buf);
+ printbuf_exit(&buf);
+
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]);
+}
+
+noinline_for_stack
+static int data_update_invalid_bkey(struct data_update *m,
+ struct bkey_s_c old, struct bkey_s_c k,
+ struct bkey_i *insert)
+{
+ struct bch_fs *c = m->op.c;
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_str(&buf, "about to insert invalid key in data update path");
+ prt_printf(&buf, "\nop.nonce: %u", m->op.nonce);
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+ prt_newline(&buf);
+
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
+
+ return bch_err_throw(c, invalid_bkey);
}
static int __bch2_data_update_index_update(struct btree_trans *trans,
@@ -175,18 +264,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
{
struct bch_fs *c = op->c;
struct btree_iter iter;
- struct data_update *m =
- container_of(op, struct data_update, op);
- struct keylist *keys = &op->insert_keys;
- struct bkey_buf _new, _insert;
+ struct data_update *m = container_of(op, struct data_update, op);
int ret = 0;
- bch2_bkey_buf_init(&_new);
- bch2_bkey_buf_init(&_insert);
- bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
-
bch2_trans_iter_init(trans, &iter, m->btree_id,
- bkey_start_pos(&bch2_keylist_front(keys)->k),
+ bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k),
BTREE_ITER_slots|BTREE_ITER_intent);
while (1) {
@@ -206,24 +288,35 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
- new = bkey_i_to_extent(bch2_keylist_front(keys));
+ new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys));
if (!bch2_extents_match(k, old)) {
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i),
- NULL, "no match:");
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i),
+ NULL, "no match:");
goto nowork;
}
- bkey_reassemble(_insert.k, k);
- insert = _insert.k;
+ insert = bch2_trans_kmalloc(trans,
+ bkey_bytes(k.k) +
+ bkey_val_bytes(&new->k) +
+ sizeof(struct bch_extent_rebalance));
+ ret = PTR_ERR_OR_ZERO(insert);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(insert, k);
- bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
- new = bkey_i_to_extent(_new.k);
+ new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k));
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto err;
+
+ bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys));
bch2_cut_front(iter.pos, &new->k_i);
bch2_cut_front(iter.pos, insert);
@@ -254,7 +347,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (m->data_opts.rewrite_ptrs &&
!rewrites_found &&
bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) {
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:");
goto nowork;
}
@@ -271,7 +364,7 @@ restart_drop_conflicting_replicas:
}
if (!bkey_val_u64s(&new->k)) {
- trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+ trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
goto nowork;
}
@@ -284,21 +377,21 @@ restart_drop_conflicting_replicas:
bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i));
/* Now, drop excess replicas: */
- rcu_read_lock();
+ scoped_guard(rcu) {
restart_drop_extra_replicas:
- bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) {
+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p);
- if (!p.ptr.cached &&
- durability - ptr_durability >= m->op.opts.data_replicas) {
- durability -= ptr_durability;
+ if (!p.ptr.cached &&
+ durability - ptr_durability >= m->op.opts.data_replicas) {
+ durability -= ptr_durability;
- bch2_extent_ptr_set_cached(c, &m->op.opts,
- bkey_i_to_s(insert), &entry->ptr);
- goto restart_drop_extra_replicas;
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
+ goto restart_drop_extra_replicas;
+ }
}
}
- rcu_read_unlock();
/* Finally, add the pointers we just wrote: */
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
@@ -336,67 +429,52 @@ restart_drop_extra_replicas:
.btree = m->btree_id,
.flags = BCH_VALIDATE_commit,
});
- if (invalid) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "about to insert invalid key in data update path");
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
-
- bch2_fatal_error(c);
- ret = -EIO;
+ if (unlikely(invalid)) {
+ ret = data_update_invalid_bkey(m, old, k, insert);
goto out;
}
- if (trace_data_update_enabled()) {
- struct printbuf buf = PRINTBUF;
-
- prt_str(&buf, "\nold: ");
- bch2_bkey_val_to_text(&buf, c, old);
- prt_str(&buf, "\nk: ");
- bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\nnew: ");
- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
-
- trace_data_update(c, buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
+ ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?:
+ bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?:
+ bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?:
bch2_trans_update(trans, &iter, insert,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, &op->res,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto err;
+
+ if (trace_data_update_enabled())
+ trace_data_update2(m, old, k, insert);
+
+ if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size >
+ bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size)
+ trace_io_move_created_rebalance2(m, old, k, insert);
+
+ ret = bch2_trans_commit(trans, &op->res,
NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
- if (!ret) {
- bch2_btree_iter_set_pos(&iter, next_pos);
+ if (ret)
+ goto err;
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size);
- if (trace_move_extent_finish_enabled())
- trace_move_extent_finish2(m, &new->k_i, insert);
- }
+ bch2_btree_iter_set_pos(trans, &iter, next_pos);
+
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size);
+ if (trace_io_move_finish_enabled())
+ trace_io_move_finish2(m, &new->k_i, insert);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
if (ret)
break;
next:
- while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) {
- bch2_keylist_pop_front(keys);
- if (bch2_keylist_empty(keys))
+ while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) {
+ bch2_keylist_pop_front(&op->insert_keys);
+ if (bch2_keylist_empty(&op->insert_keys))
goto out;
}
continue;
@@ -408,15 +486,13 @@ nowork:
&m->stats->sectors_raced);
}
- count_event(c, move_extent_fail);
+ count_event(c, io_move_fail);
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
goto next;
}
out:
bch2_trans_iter_exit(trans, &iter);
- bch2_bkey_buf_exit(&_insert, c);
- bch2_bkey_buf_exit(&_new, c);
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
return ret;
}
@@ -426,14 +502,17 @@ int bch2_data_update_index_update(struct bch_write_op *op)
return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
}
-void bch2_data_update_read_done(struct data_update *m,
- struct bch_extent_crc_unpacked crc)
+void bch2_data_update_read_done(struct data_update *m)
{
+ m->read_done = true;
+
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
- m->op.crc = crc;
- m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9;
+ m->op.crc = m->rbio.pick.crc;
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
+
+ this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size);
closure_call(&m->op.cl, bch2_write, NULL, NULL);
}
@@ -443,38 +522,42 @@ void bch2_data_update_exit(struct data_update *update)
struct bch_fs *c = update->op.c;
struct bkey_s_c k = bkey_i_to_s_c(update->k.k);
+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+ kfree(update->bvecs);
+ update->bvecs = NULL;
+
if (c->opts.nocow_enabled)
bkey_nocow_unlock(c, k);
bkey_put_dev_refs(c, k);
- bch2_bkey_buf_exit(&update->k, c);
bch2_disk_reservation_put(c, &update->op.res);
- bch2_bio_free_pages_pool(c, &update->op.wbio.bio);
+ bch2_bkey_buf_exit(&update->k, c);
}
-static void bch2_update_unwritten_extent(struct btree_trans *trans,
- struct data_update *update)
+static noinline_for_stack
+int bch2_update_unwritten_extent(struct btree_trans *trans,
+ struct data_update *update)
{
struct bch_fs *c = update->op.c;
- struct bio *bio = &update->op.wbio.bio;
struct bkey_i_extent *e;
struct write_point *wp;
struct closure cl;
struct btree_iter iter;
struct bkey_s_c k;
- int ret;
+ int ret = 0;
closure_init_stack(&cl);
bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys);
- while (bio_sectors(bio)) {
- unsigned sectors = bio_sectors(bio);
+ while (bpos_lt(update->op.pos, update->k.k->k.p)) {
+ unsigned sectors = update->k.k->k.p.offset -
+ update->op.pos.offset;
bch2_trans_begin(trans);
bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos,
BTREE_ITER_slots);
ret = lockrestart_do(trans, ({
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
bkey_err(k);
}));
bch2_trans_iter_exit(trans, &iter);
@@ -503,7 +586,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
bch_err_fn_ratelimited(c, ret);
if (ret)
- return;
+ break;
sectors = min(sectors, wp->sectors_free);
@@ -513,7 +596,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
bch2_alloc_sectors_done(c, wp);
- bio_advance(bio, sectors << 9);
update->op.pos.offset += sectors;
extent_for_each_ptr(extent_i_to_s(e), ptr)
@@ -532,13 +614,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
bch2_trans_unlock(trans);
closure_sync(&cl);
}
+
+ return ret;
}
void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- printbuf_tabstop_push(out, 20);
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 20);
prt_str_indented(out, "rewrite ptrs:\t");
bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
@@ -562,10 +647,17 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
prt_str_indented(out, "extra replicas:\t");
prt_u64(out, data_opts->extra_replicas);
+ prt_newline(out);
+
+ prt_str_indented(out, "scrub:\t");
+ prt_u64(out, data_opts->scrub);
}
void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
{
+ prt_str(out, bch2_data_update_type_strs[m->type]);
+ prt_newline(out);
+
bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
prt_newline(out);
@@ -573,6 +665,25 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
}
+void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m)
+{
+ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k));
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts);
+
+ if (!m->read_done) {
+ prt_printf(out, "read:\n");
+ printbuf_indent_add(out, 2);
+ bch2_read_bio_to_text(out, &m->rbio);
+ } else {
+ prt_printf(out, "write:\n");
+ printbuf_indent_add(out, 2);
+ bch2_write_op_to_text(out, &m->op);
+ }
+ printbuf_indent_sub(out, 4);
+}
+
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -616,67 +727,152 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
+static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ unsigned buf_bytes)
+{
+ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
+
+ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
+ if (!m->bvecs)
+ return -ENOMEM;
+
+ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
+ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
+
+ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
+ kfree(m->bvecs);
+ m->bvecs = NULL;
+ return -ENOMEM;
+ }
+
+ rbio_init(&m->rbio.bio, c, *io_opts, NULL);
+ m->rbio.data_update = true;
+ m->rbio.bio.bi_iter.bi_size = buf_bytes;
+ m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k);
+ m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+ return 0;
+}
+
+int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
+ struct bch_io_opts *io_opts)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+
+ /* write path might have to decompress data: */
+ unsigned buf_bytes = 0;
+ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry)
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+
+ return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
+}
+
+static int can_write_extent(struct bch_fs *c, struct data_update *m)
+{
+ if ((m->op.flags & BCH_WRITE_alloc_nowait) &&
+ unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark)))
+ return bch_err_throw(c, data_update_done_would_block);
+
+ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs
+ ? m->op.target
+ : 0;
+ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
+
+ darray_for_each(m->op.devs_have, i)
+ __clear_bit(*i, devs.d);
+
+ guard(rcu)();
+
+ unsigned nr_replicas = 0, i;
+ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i);
+ if (!ca)
+ continue;
+
+ struct bch_dev_usage usage;
+ bch2_dev_usage_read_fast(ca, &usage);
+
+ if (!dev_buckets_free(ca, usage, m->op.watermark))
+ continue;
+
+ nr_replicas += ca->mi.durability;
+ if (nr_replicas >= m->op.nr_replicas)
+ break;
+ }
+
+ if (!nr_replicas)
+ return bch_err_throw(c, data_update_done_no_rw_devs);
+ if (nr_replicas < m->op.nr_replicas)
+ return bch_err_throw(c, insufficient_devices);
+ return 0;
+}
+
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
struct data_update *m,
struct write_point_specifier wp,
- struct bch_io_opts io_opts,
+ struct bch_io_opts *io_opts,
struct data_update_opts data_opts,
enum btree_id btree_id,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
int ret = 0;
- /*
- * fs is corrupt we have a key for a snapshot node that doesn't exist,
- * and we have to check for this because we go rw before repairing the
- * snapshots table - just skip it, we can move it later.
- */
- if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot)))
- return -BCH_ERR_data_update_done;
-
- if (!bkey_get_dev_refs(c, k))
- return -BCH_ERR_data_update_done;
-
- if (c->opts.nocow_enabled &&
- !bkey_nocow_lock(c, ctxt, k)) {
- bkey_put_dev_refs(c, k);
- return -BCH_ERR_nocow_lock_blocked;
+ if (k.k->p.snapshot) {
+ ret = bch2_check_key_has_snapshot(trans, iter, k);
+ if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) {
+ /* Can't repair yet, waiting on other recovery passes */
+ return bch_err_throw(c, data_update_done_no_snapshot);
+ }
+ if (ret < 0)
+ return ret;
+ if (ret) /* key was deleted */
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ bch_err_throw(c, data_update_done_no_snapshot);
+ ret = 0;
}
bch2_bkey_buf_init(&m->k);
bch2_bkey_buf_reassemble(&m->k, c, k);
+ m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc
+ ? BCH_DATA_UPDATE_copygc
+ : BCH_DATA_UPDATE_rebalance;
m->btree_id = btree_id;
m->data_opts = data_opts;
m->ctxt = ctxt;
m->stats = ctxt ? ctxt->stats : NULL;
- bch2_write_op_init(&m->op, c, io_opts);
+ bch2_write_op_init(&m->op, c, *io_opts);
m->op.pos = bkey_start_pos(k.k);
m->op.version = k.k->bversion;
m->op.target = data_opts.target;
m->op.write_point = wp;
m->op.nr_replicas = 0;
- m->op.flags |= BCH_WRITE_PAGES_STABLE|
- BCH_WRITE_PAGES_OWNED|
- BCH_WRITE_DATA_ENCODED|
- BCH_WRITE_MOVE|
+ m->op.flags |= BCH_WRITE_pages_stable|
+ BCH_WRITE_pages_owned|
+ BCH_WRITE_data_encoded|
+ BCH_WRITE_move|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression;
+ m->op.compression_opt = io_opts->background_compression;
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
unsigned durability_have = 0, durability_removing = 0;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k));
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas;
+ unsigned buf_bytes = 0;
+ bool unwritten = false;
+
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (!p.ptr.cached) {
- rcu_read_lock();
+ guard(rcu)();
if (ptr_bit & m->data_opts.rewrite_ptrs) {
if (crc_is_compressed(p.crc))
reserve_sectors += k.k->size;
@@ -687,7 +883,6 @@ int bch2_data_update_init(struct btree_trans *trans,
bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev);
durability_have += bch2_extent_ptr_durability(c, &p);
}
- rcu_read_unlock();
}
/*
@@ -703,10 +898,13 @@ int bch2_data_update_init(struct btree_trans *trans,
if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
m->op.incompressible = true;
+ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9);
+ unwritten |= p.ptr.unwritten;
+
ptr_bit <<= 1;
}
- unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+ unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have));
/*
* If current extent durability is less than io_opts.data_replicas,
@@ -739,28 +937,72 @@ int bch2_data_update_init(struct btree_trans *trans,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
- goto out;
+ ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts);
+ if (!ret)
+ ret = bch_err_throw(c, data_update_done_no_writes_needed);
+ goto out_bkey_buf_exit;
}
+ /*
+ * Check if the allocation will succeed, to avoid getting an error later
+ * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless
+ * read:
+ *
+ * This guards against
+ * - BCH_WRITE_alloc_nowait allocations failing (promotes)
+ * - Destination target full
+ * - Device(s) in destination target offline
+ * - Insufficient durability available in destination target
+ * (i.e. trying to move a durability=2 replica to a target with a
+ * single durability=2 device)
+ */
+ ret = can_write_extent(c, m);
+ if (ret)
+ goto out_bkey_buf_exit;
+
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
m->data_opts.extra_replicas
? 0
: BCH_DISK_RESERVATION_NOFAIL);
if (ret)
- goto out;
+ goto out_bkey_buf_exit;
}
- if (bkey_extent_is_unwritten(k)) {
- bch2_update_unwritten_extent(trans, m);
- goto out;
+ if (!bkey_get_dev_refs(c, k)) {
+ ret = bch_err_throw(c, data_update_done_no_dev_refs);
+ goto out_put_disk_res;
+ }
+
+ if (c->opts.nocow_enabled &&
+ !bkey_nocow_lock(c, ctxt, ptrs)) {
+ ret = bch_err_throw(c, nocow_lock_blocked);
+ goto out_put_dev_refs;
}
+ if (unwritten) {
+ ret = bch2_update_unwritten_extent(trans, m) ?:
+ bch_err_throw(c, data_update_done_unwritten);
+ goto out_nocow_unlock;
+ }
+
+ bch2_trans_unlock(trans);
+
+ ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes);
+ if (ret)
+ goto out_nocow_unlock;
+
return 0;
-out:
- bch2_data_update_exit(m);
- return ret ?: -BCH_ERR_data_update_done;
+out_nocow_unlock:
+ if (c->opts.nocow_enabled)
+ bkey_nocow_unlock(c, k);
+out_put_dev_refs:
+ bkey_put_dev_refs(c, k);
+out_put_disk_res:
+ bch2_disk_reservation_put(c, &m->op.res);
+out_bkey_buf_exit:
+ bch2_bkey_buf_exit(&m->k, c);
+ return ret;
}
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index e4b50723428e..5e14d13568de 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -4,6 +4,7 @@
#define _BCACHEFS_DATA_UPDATE_H
#include "bkey_buf.h"
+#include "io_read.h"
#include "io_write_types.h"
struct moving_context;
@@ -15,27 +16,61 @@ struct data_update_opts {
u8 extra_replicas;
unsigned btree_insert_flags;
unsigned write_flags;
+
+ int read_dev;
+ bool scrub;
};
void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *,
struct bch_io_opts *, struct data_update_opts *);
+#define BCH_DATA_UPDATE_TYPES() \
+ x(copygc, 0) \
+ x(rebalance, 1) \
+ x(promote, 2)
+
+enum bch_data_update_types {
+#define x(n, id) BCH_DATA_UPDATE_##n = id,
+ BCH_DATA_UPDATE_TYPES()
+#undef x
+};
+
struct data_update {
+ enum bch_data_update_types type;
/* extent being updated: */
+ bool read_done;
enum btree_id btree_id;
struct bkey_buf k;
struct data_update_opts data_opts;
struct moving_context *ctxt;
struct bch_move_stats *stats;
+
+ struct bch_read_bio rbio;
struct bch_write_op op;
+ struct bio_vec *bvecs;
+};
+
+struct promote_op {
+ struct rcu_head rcu;
+ u64 start_time;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
+
+ struct rhash_head hash;
+ struct bpos pos;
+
+ struct work_struct work;
+ struct data_update write;
+ struct bio_vec bi_inline_vecs[]; /* must be last */
};
void bch2_data_update_to_text(struct printbuf *, struct data_update *);
+void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *);
int bch2_data_update_index_update(struct bch_write_op *);
-void bch2_data_update_read_done(struct data_update *,
- struct bch_extent_crc_unpacked);
+void bch2_data_update_read_done(struct data_update *);
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
@@ -43,12 +78,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *,
struct bch_io_opts *,
struct data_update_opts *);
+int bch2_data_update_bios_init(struct data_update *, struct bch_fs *,
+ struct bch_io_opts *);
+
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
struct moving_context *,
struct data_update *,
struct write_point_specifier,
- struct bch_io_opts, struct data_update_opts,
+ struct bch_io_opts *, struct data_update_opts,
enum btree_id, struct bkey_s_c);
void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *);
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 55333e82d1fe..07c2a0f73cc2 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -7,6 +7,8 @@
*/
#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "async_objs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_io.h"
@@ -15,6 +17,7 @@
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
+#include "data_update.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
@@ -39,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct btree_node *n_sorted = c->verify_data->data;
struct bset *sorted, *inmemory = &b->data->keys;
struct bio *bio;
- bool failed = false, saw_error = false;
+ bool failed = false;
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_verify_replicas);
if (!ca)
return false;
@@ -56,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
submit_bio_wait(bio);
bio_put(bio);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_verify_replicas);
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
- if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
+ if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
return false;
n_sorted = c->verify_data->data;
@@ -148,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
c->verify_data = __bch2_btree_node_mem_alloc(c);
if (!c->verify_data)
goto out;
-
- list_del_init(&c->verify_data->list);
}
BUG_ON(b->nsets != 1);
@@ -190,12 +193,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
unsigned offset = 0;
int ret;
- if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) {
+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) {
prt_printf(out, "error getting device to read from: invalid device\n");
return;
}
- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_btree_node_ondisk_to_text);
if (!ca) {
prt_printf(out, "error getting device to read from: not online\n");
return;
@@ -296,28 +300,13 @@ out:
if (bio)
bio_put(bio);
kvfree(n_ondisk);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_btree_node_ondisk_to_text);
}
#ifdef CONFIG_DEBUG_FS
-/* XXX: bch_fs refcounting */
-
-struct dump_iter {
- struct bch_fs *c;
- enum btree_id id;
- struct bpos from;
- struct bpos prev_node;
- u64 iter;
-
- struct printbuf buf;
-
- char __user *ubuf; /* destination user buffer */
- size_t size; /* size of requested read */
- ssize_t ret; /* bytes read so far */
-};
-
-static ssize_t flush_buf(struct dump_iter *i)
+ssize_t bch2_debugfs_flush_buf(struct dump_iter *i)
{
if (i->buf.pos) {
size_t bytes = min_t(size_t, i->buf.pos, i->size);
@@ -329,6 +318,11 @@ static ssize_t flush_buf(struct dump_iter *i)
i->buf.pos -= copied;
memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
+ if (i->buf.last_newline >= copied)
+ i->buf.last_newline -= copied;
+ if (i->buf.last_field >= copied)
+ i->buf.last_field -= copied;
+
if (copied != bytes)
return -EFAULT;
}
@@ -355,7 +349,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file)
return 0;
}
-static int bch2_dump_release(struct inode *inode, struct file *file)
+int bch2_dump_release(struct inode *inode, struct file *file)
{
struct dump_iter *i = file->private_data;
@@ -373,7 +367,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- return flush_buf(i) ?:
+ return bch2_debugfs_flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
BTREE_ITER_prefetch|
@@ -382,7 +376,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
prt_newline(&i->buf);
bch2_trans_unlock(trans);
i->from = bpos_successor(iter.pos);
- flush_buf(i);
+ bch2_debugfs_flush_buf(i);
}))) ?:
i->ret;
}
@@ -403,7 +397,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- ssize_t ret = flush_buf(i);
+ ssize_t ret = bch2_debugfs_flush_buf(i);
if (ret)
return ret;
@@ -417,7 +411,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
? bpos_successor(b->key.k.p)
: b->key.k.p;
- drop_locks_do(trans, flush_buf(i));
+ drop_locks_do(trans, bch2_debugfs_flush_buf(i));
}))) ?: i->ret;
}
@@ -437,7 +431,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
- return flush_buf(i) ?:
+ return bch2_debugfs_flush_buf(i) ?:
bch2_trans_run(i->c,
for_each_btree_key(trans, iter, i->id, i->from,
BTREE_ITER_prefetch|
@@ -455,7 +449,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
bch2_bfloat_to_text(&i->buf, l->b, _k);
bch2_trans_unlock(trans);
i->from = bpos_successor(iter.pos);
- flush_buf(i);
+ bch2_debugfs_flush_buf(i);
}))) ?:
i->ret;
}
@@ -496,6 +490,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
prt_printf(out, "journal pin %px:\t%llu\n",
&b->writes[1].journal, b->writes[1].journal.seq);
+ prt_printf(out, "ob:\t%u\n", b->ob.nr);
+
printbuf_indent_sub(out, 2);
}
@@ -512,34 +508,34 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf,
i->ret = 0;
do {
- struct bucket_table *tbl;
- struct rhash_head *pos;
- struct btree *b;
-
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
if (ret)
return ret;
- rcu_read_lock();
i->buf.atomic++;
- tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
- if (i->iter < tbl->size) {
- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
- bch2_cached_btree_node_to_text(&i->buf, c, b);
- i->iter++;
- } else {
- done = true;
+ scoped_guard(rcu) {
+ struct bucket_table *tbl =
+ rht_dereference_rcu(c->btree_cache.table.tbl,
+ &c->btree_cache.table);
+ if (i->iter < tbl->size) {
+ struct rhash_head *pos;
+ struct btree *b;
+
+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash)
+ bch2_cached_btree_node_to_text(&i->buf, c, b);
+ i->iter++;
+ } else {
+ done = true;
+ }
}
--i->buf.atomic;
- rcu_read_unlock();
} while (!done);
if (i->buf.allocation_failure)
ret = -ENOMEM;
if (!ret)
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
return ret ?: i->ret;
}
@@ -588,6 +584,8 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
i->ubuf = buf;
i->size = size;
i->ret = 0;
+
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
restart:
seqmutex_lock(&c->btree_trans_lock);
list_sort(&c->btree_trans_list, list_ptr_order_cmp);
@@ -601,6 +599,11 @@ restart:
if (!closure_get_not_zero(&trans->ref))
continue;
+ if (!trans->srcu_held) {
+ closure_put(&trans->ref);
+ continue;
+ }
+
u32 seq = seqmutex_unlock(&c->btree_trans_lock);
bch2_btree_trans_to_text(&i->buf, trans);
@@ -613,7 +616,7 @@ restart:
closure_put(&trans->ref);
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
if (ret)
goto unlocked;
@@ -622,11 +625,13 @@ restart:
}
seqmutex_unlock(&c->btree_trans_lock);
unlocked:
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
if (i->buf.allocation_failure)
ret = -ENOMEM;
if (!ret)
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
return ret ?: i->ret;
}
@@ -651,7 +656,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->ret = 0;
while (1) {
- err = flush_buf(i);
+ err = bch2_debugfs_flush_buf(i);
if (err)
return err;
@@ -694,7 +699,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
i->iter++;
}
- err = flush_buf(i);
+ err = bch2_debugfs_flush_buf(i);
if (err)
return err;
@@ -752,7 +757,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
while (1) {
struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter];
- err = flush_buf(i);
+ err = bch2_debugfs_flush_buf(i);
if (err)
return err;
@@ -769,6 +774,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
mutex_lock(&s->lock);
prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem);
+#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
+ printbuf_indent_add(&i->buf, 2);
+ bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace);
+ printbuf_indent_sub(&i->buf, 2);
+#endif
+
prt_printf(&i->buf, "Transaction duration:\n");
printbuf_indent_add(&i->buf, 2);
@@ -844,8 +855,11 @@ restart:
seqmutex_unlock(&c->btree_trans_lock);
}
-static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *);
+
+static ssize_t bch2_simple_print(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos,
+ fs_to_text_fn fn)
{
struct dump_iter *i = file->private_data;
struct bch_fs *c = i->c;
@@ -856,7 +870,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
i->ret = 0;
if (!i->iter) {
- btree_deadlock_to_text(&i->buf, c);
+ fn(&i->buf, c);
i->iter++;
}
@@ -864,11 +878,17 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
ret = -ENOMEM;
if (!ret)
- ret = flush_buf(i);
+ ret = bch2_debugfs_flush_buf(i);
return ret ?: i->ret;
}
+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text);
+}
+
static const struct file_operations btree_deadlock_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
@@ -876,6 +896,19 @@ static const struct file_operations btree_deadlock_ops = {
.read = bch2_btree_deadlock_read,
};
+static ssize_t bch2_write_points_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
+{
+ return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text);
+}
+
+static const struct file_operations write_points_ops = {
+ .owner = THIS_MODULE,
+ .open = bch2_dump_open,
+ .release = bch2_dump_release,
+ .read = bch2_write_points_read,
+};
+
void bch2_fs_debug_exit(struct bch_fs *c)
{
if (!IS_ERR_OR_NULL(c->fs_debug_dir))
@@ -904,7 +937,11 @@ void bch2_fs_debug_init(struct bch_fs *c)
if (IS_ERR_OR_NULL(bch_debug))
return;
- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+ if (c->sb.multi_device)
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
+ else
+ strscpy(name, c->name, sizeof(name));
+
c->fs_debug_dir = debugfs_create_dir(name, bch_debug);
if (IS_ERR_OR_NULL(c->fs_debug_dir))
return;
@@ -927,6 +964,11 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
c->btree_debug, &btree_deadlock_ops);
+ debugfs_create_file("write_points", 0400, c->fs_debug_dir,
+ c->btree_debug, &write_points_ops);
+
+ bch2_fs_async_obj_debugfs_init(c);
+
c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir);
if (IS_ERR_OR_NULL(c->btree_debug_dir))
return;
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
index 2c37143b5fd1..d88b1194b8ac 100644
--- a/fs/bcachefs/debug.h
+++ b/fs/bcachefs/debug.h
@@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *,
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
- if (bch2_verify_btree_ondisk)
+ if (static_branch_unlikely(&bch2_verify_btree_ondisk))
__bch2_btree_verify(c, b);
}
#ifdef CONFIG_DEBUG_FS
+struct dump_iter {
+ struct bch_fs *c;
+ struct async_obj_list *list;
+ enum btree_id id;
+ struct bpos from;
+ struct bpos prev_node;
+ u64 iter;
+
+ struct printbuf buf;
+
+ char __user *ubuf; /* destination user buffer */
+ size_t size; /* size of requested read */
+ ssize_t ret; /* bytes read so far */
+};
+
+ssize_t bch2_debugfs_flush_buf(struct dump_iter *);
+int bch2_dump_release(struct inode *, struct file *);
+
void bch2_fs_debug_exit(struct bch_fs *);
void bch2_fs_debug_init(struct bch_fs *);
#else
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 600eee936f13..28875c5c86ad 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -13,6 +13,29 @@
#include <linux/dcache.h>
+#ifdef CONFIG_UNICODE
+int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ *out_cf = (struct qstr) QSTR_INIT(NULL, 0);
+
+ if (!bch2_fs_casefold_enabled(trans->c))
+ return -EOPNOTSUPP;
+
+ unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
+ int ret = PTR_ERR_OR_ZERO(buf);
+ if (ret)
+ return ret;
+
+ ret = utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1);
+ if (ret <= 0)
+ return ret;
+
+ *out_cf = (struct qstr) QSTR_INIT(buf, ret);
+ return 0;
+}
+#endif
+
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
@@ -28,13 +51,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
#endif
return bkey_bytes -
- offsetof(struct bch_dirent, d_name) -
+ (d.v->d_casefold
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
+ : offsetof(struct bch_dirent, d_name)) -
trailing_nuls;
}
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
{
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ if (d.v->d_casefold) {
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
+ } else {
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ }
+}
+
+static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
+{
+ if (d.v->d_casefold) {
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+ unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len);
+ } else {
+ return (struct qstr) QSTR_INIT(NULL, 0);
+ }
+}
+
+static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
+{
+ return d.v->d_casefold
+ ? bch2_dirent_get_casefold_name(d)
+ : bch2_dirent_get_name(d);
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@@ -57,7 +105,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(d);
+ struct qstr name = bch2_dirent_get_lookup_name(d);
return bch2_dirent_hash(info, &name);
}
@@ -65,7 +113,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
const struct qstr *r_name = _r;
return !qstr_eq(l_name, *r_name);
@@ -75,8 +123,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- const struct qstr l_name = bch2_dirent_get_name(l);
- const struct qstr r_name = bch2_dirent_get_name(r);
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
+ const struct qstr r_name = bch2_dirent_get_lookup_name(r);
return !qstr_eq(l_name, r_name);
}
@@ -104,17 +152,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ unsigned name_block_len = bch2_dirent_name_bytes(d);
struct qstr d_name = bch2_dirent_get_name(d);
+ struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
int ret = 0;
bkey_fsck_err_on(!d_name.len,
c, dirent_empty_name,
"empty name");
- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+ bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
c, dirent_val_too_big,
- "value too big (%zu > %u)",
- bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+ "dirent names exceed bkey size (%d + %d > %d)",
+ d_name.len, d_cf_name.len, name_block_len);
/*
* Check new keys don't exceed the max length
@@ -142,6 +192,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k,
le64_to_cpu(d.v->d_inum) == d.k->p.inode,
c, dirent_to_itself,
"dirent points to own directory");
+
+ if (d.v->d_casefold) {
+ bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
+ d_cf_name.len > BCH_NAME_MAX,
+ c, dirent_cf_name_too_big,
+ "dirent w/ cf name too big (%u > %u)",
+ d_cf_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
+ c, dirent_stray_data_after_cf_name,
+ "dirent has stray data after cf name's NUL");
+ }
fsck_err:
return ret;
}
@@ -151,36 +213,98 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
- prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+ prt_printf(out, "%.*s", d_name.len, d_name.name);
+
+ if (d.v->d_casefold) {
+ struct qstr d_name = bch2_dirent_get_lookup_name(d);
+ prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
+ }
+
+ prt_str(out, " ->");
if (d.v->d_type != DT_SUBVOL)
- prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+ prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
else
- prt_printf(out, "%u -> %u",
+ prt_printf(out, " %u -> %u",
le32_to_cpu(d.v->d_parent_subvol),
le32_to_cpu(d.v->d_child_subvol));
prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
}
-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
- subvol_inum dir, u8 type,
- const struct qstr *name, u64 dst)
+int bch2_dirent_init_name(struct bch_fs *c,
+ struct bkey_i_dirent *dirent,
+ const struct bch_hash_info *hash_info,
+ const struct qstr *name,
+ const struct qstr *cf_name)
{
- struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
+ EBUG_ON(hash_info->cf_encoding == NULL && cf_name);
+ int cf_len = 0;
if (name->len > BCH_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
+ return -ENAMETOOLONG;
+
+ dirent->v.d_casefold = hash_info->cf_encoding != NULL;
+
+ if (!dirent->v.d_casefold) {
+ memcpy(&dirent->v.d_name[0], name->name, name->len);
+ memset(&dirent->v.d_name[name->len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_name) -
+ name->len);
+ } else {
+ if (!bch2_fs_casefold_enabled(c))
+ return -EOPNOTSUPP;
+
+#ifdef CONFIG_UNICODE
+ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
+
+ char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len];
+
+ if (cf_name) {
+ cf_len = cf_name->len;
+
+ memcpy(cf_out, cf_name->name, cf_name->len);
+ } else {
+ cf_len = utf8_casefold(hash_info->cf_encoding, name,
+ cf_out,
+ bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out);
+ if (cf_len <= 0)
+ return cf_len;
+ }
+
+ memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0,
+ bkey_val_bytes(&dirent->k) -
+ offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+ name->len + cf_len);
+
+ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len);
+
+ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len);
+#endif
+ }
- BUG_ON(u64s > U8_MAX);
+ unsigned u64s = dirent_val_u64s(name->len, cf_len);
+ BUG_ON(u64s > bkey_val_u64s(&dirent->k));
+ set_bkey_val_u64s(&dirent->k, u64s);
+ return 0;
+}
- dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans,
+ const struct bch_hash_info *hash_info,
+ subvol_inum dir,
+ u8 type,
+ const struct qstr *name,
+ const struct qstr *cf_name,
+ u64 dst)
+{
+ struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
if (IS_ERR(dirent))
return dirent;
bkey_dirent_init(&dirent->k_i);
- dirent->k.u64s = u64s;
+ dirent->k.u64s = BKEY_U64s_MAX;
if (type != DT_SUBVOL) {
dirent->v.d_inum = cpu_to_le64(dst);
@@ -190,15 +314,13 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
}
dirent->v.d_type = type;
+ dirent->v.d_unused = 0;
- memcpy(dirent->v.d_name, name->name, name->len);
- memset(dirent->v.d_name + name->len, 0,
- bkey_val_bytes(&dirent->k) -
- offsetof(struct bch_dirent, d_name) -
- name->len);
-
- EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+ int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name);
+ if (ret)
+ return ERR_PTR(ret);
+ EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len);
return dirent;
}
@@ -213,7 +335,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
+ dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -222,8 +344,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
dirent->k.p.snapshot = snapshot;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
- dir_inum, snapshot, &dirent->k_i,
- flags|BTREE_UPDATE_internal_snapshot_node);
+ dir_inum, snapshot, &dirent->k_i, flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -238,7 +359,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+ dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@@ -281,8 +402,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
- struct btree_iter src_iter = { NULL };
- struct btree_iter dst_iter = { NULL };
+ struct qstr src_name_lookup, dst_name_lookup;
+ struct btree_iter src_iter = {};
+ struct btree_iter dst_iter = {};
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
@@ -295,8 +417,11 @@ int bch2_dirent_rename(struct btree_trans *trans,
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
+ ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
+ if (ret)
+ goto out;
old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
+ src_hash, src_dir, &src_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
@@ -308,6 +433,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
goto out;
/* Lookup dst: */
+ ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
+ if (ret)
+ goto out;
if (mode == BCH_RENAME) {
/*
* Note that we're _not_ checking if the target already exists -
@@ -315,12 +443,12 @@ int bch2_dirent_rename(struct btree_trans *trans,
* correctness:
*/
ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name);
+ dst_hash, dst_dir, &dst_name_lookup);
if (ret)
goto out;
} else {
old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
+ dst_hash, dst_dir, &dst_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
@@ -336,7 +464,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+ new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name,
+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
@@ -346,7 +475,8 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+ new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name,
+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
@@ -434,16 +564,16 @@ out_set_src:
}
if (delete_src) {
- bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&src_iter) ?:
+ bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &src_iter) ?:
bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
}
if (delete_dst) {
- bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dst_iter) ?:
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dst_iter) ?:
bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node);
if (ret)
goto out;
@@ -465,9 +595,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
+ struct qstr lookup_name;
+ int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
+ if (ret)
+ return ret;
+
struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
- int ret = bkey_err(k);
+ hash_info, dir, &lookup_name, flags);
+ ret = bkey_err(k);
if (ret)
goto err;
@@ -485,7 +620,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
int ret = lockrestart_do(trans,
bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
@@ -507,7 +642,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
continue;
- ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
+ ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty);
break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -540,10 +675,12 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv
vfs_d_type(d.v->d_type));
if (ret)
ctx->pos = d.k->p.offset + 1;
- return ret;
+ return !ret;
}
-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum,
+ struct bch_hash_info *hash_info,
+ struct dir_context *ctx)
{
struct bkey_buf sk;
bch2_bkey_buf_init(&sk);
@@ -561,14 +698,69 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k);
subvol_inum target;
- int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target);
+
+ bool need_second_pass = false;
+ int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc,
+ hash_info, &iter, k, &need_second_pass) ?:
+ bch2_dirent_read_target(trans, inum, dirent, &target);
if (ret2 > 0)
continue;
- ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target));
+ ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target));
})));
bch2_bkey_buf_exit(&sk, c);
return ret < 0 ? ret : 0;
}
+
+/* fsck */
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode_nr)
+ break;
+ if (!bkey_is_inode(k.k))
+ continue;
+ ret = bch2_inode_unpack(k, inode);
+ goto found;
+ }
+ ret = bch_err_throw(trans->c, ENOENT_inode);
+found:
+ bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bch_inode_unpacked dir_inode;
+ struct bch_hash_info dir_hash_info;
+ int ret;
+
+ ret = lookup_first_inode(trans, pos.inode, &dir_inode);
+ if (ret)
+ goto err;
+
+ dir_hash_info = bch2_hash_info_init(c, &dir_inode);
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
+
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
+ bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+ &dir_hash_info, &iter,
+ BTREE_UPDATE_internal_snapshot_node);
+ bch2_trans_iter_exit(trans, &iter);
+err:
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index a633f83c1ac7..0417608c18d5 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -23,17 +23,38 @@ struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
+#ifdef CONFIG_UNICODE
+int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
+ const struct qstr *, struct qstr *);
+#else
+static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
+{
+ return -EOPNOTSUPP;
+}
+#endif
-static inline unsigned dirent_val_u64s(unsigned len)
+static inline int bch2_maybe_casefold(struct btree_trans *trans,
+ const struct bch_hash_info *info,
+ const struct qstr *str, struct qstr *out_cf)
{
- return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
- sizeof(u64));
+ if (likely(!info->cf_encoding)) {
+ *out_cf = *str;
+ return 0;
+ } else {
+ return bch2_casefold(trans, info, str, out_cf);
+ }
}
-static inline unsigned int dirent_occupied_size(const struct qstr *name)
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent);
+
+static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)
{
- return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64);
+ unsigned bytes = cf_len
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len
+ : offsetof(struct bch_dirent, d_name) + len;
+
+ return DIV_ROUND_UP(bytes, sizeof(u64));
}
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
@@ -46,6 +67,15 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
+int bch2_dirent_init_name(struct bch_fs *,
+ struct bkey_i_dirent *,
+ const struct bch_hash_info *,
+ const struct qstr *,
+ const struct qstr *);
+struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *,
+ const struct bch_hash_info *, subvol_inum, u8,
+ const struct qstr *, const struct qstr *, u64);
+
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
@@ -82,6 +112,8 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *);
+
+int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos);
#endif /* _BCACHEFS_DIRENT_H */
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
index 5e116b88e814..a46dbddd21aa 100644
--- a/fs/bcachefs/dirent_format.h
+++ b/fs/bcachefs/dirent_format.h
@@ -29,9 +29,25 @@ struct bch_dirent {
* Copy of mode bits 12-15 from the target inode - so userspace can get
* the filetype without having to do a stat()
*/
- __u8 d_type;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 d_type:5,
+ d_unused:2,
+ d_casefold:1;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __u8 d_casefold:1,
+ d_unused:2,
+ d_type:5;
+#endif
- __u8 d_name[];
+ union {
+ struct {
+ __u8 d_pad;
+ __le16 d_name_len;
+ __le16 d_cf_name_len;
+ __u8 d_names[];
+ } d_cf_name_block __packed;
+ __DECLARE_FLEX_ARRAY(__u8, d_name);
+ } __packed;
} __packed __aligned(8);
#define DT_SUBVOL 16
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index b32e91ba8be8..f7528cd69c73 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = {
NULL
};
-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
- s64 *d, unsigned nr)
+static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
+ s64 *d, unsigned nr)
{
struct bkey_i_accounting *acc = bkey_accounting_init(k);
- acc->k.p = disk_accounting_pos_to_bpos(pos);
+ acc->k.p = pos;
set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);
memcpy_u64s_small(acc->v.d, d, nr);
}
+static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
+ s64 *d, unsigned nr)
+{
+ return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
+}
+
static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);
int bch2_disk_accounting_mod(struct btree_trans *trans,
struct disk_accounting_pos *k,
s64 *d, unsigned nr, bool gc)
{
+ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+
/* Normalize: */
switch (k->type) {
case BCH_DISK_ACCOUNTING_replicas:
@@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans,
break;
}
- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);
+ struct bpos pos = disk_accounting_pos_to_bpos(k);
+
+ if (likely(!gc)) {
+ struct bkey_i_accounting *a;
+#if 0
+ for (a = btree_trans_subbuf_base(trans, &trans->accounting);
+ a != btree_trans_subbuf_top(trans, &trans->accounting);
+ a = (void *) bkey_next(&a->k_i))
+ if (bpos_eq(a->k.p, pos)) {
+ BUG_ON(nr != bch2_accounting_counters(&a->k));
+ acc_u64s(a->v.d, d, nr);
+
+ if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
+ unsigned offset = (u64 *) a -
+ (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
+
+ trans->accounting.u64s -= a->k.u64s;
+ memmove_u64s_down(a,
+ bkey_next(&a->k_i),
+ trans->accounting.u64s - offset);
+ }
+ return 0;
+ }
+#endif
+ unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
+ a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
+ __accounting_key_init(&a->k_i, pos, d, nr);
+ return 0;
+ } else {
+ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;
- accounting_key_init(&k_i.k, k, d, nr);
+ __accounting_key_init(&k_i.k, pos, d, nr);
- if (unlikely(gc)) {
int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
ret = drop_locks_do(trans,
bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
return ret;
- } else {
- return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k);
}
}
@@ -114,10 +150,9 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
unsigned dev, s64 sectors,
bool gc)
{
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
-
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_replicas_entry_cached(&acc.replicas, dev);
return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
@@ -135,6 +170,12 @@ static inline bool is_zero(char *start, char *end)
#define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member))
+static const unsigned bch2_accounting_type_nr_counters[] = {
+#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr,
+ BCH_DISK_ACCOUNTING_TYPES()
+#undef x
+};
+
int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@@ -193,6 +234,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
c, accounting_key_junk_at_end,
"junk at end of accounting key");
+
+ bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
+ c, accounting_key_nr_counters_wrong,
+ "accounting key with %u counters, should be %u",
+ bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
fsck_err:
return ret;
}
@@ -277,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
return accounting_to_replicas(&r.e, p)
? bch2_mark_replicas(c, &r.e)
: 0;
@@ -289,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
*/
int bch2_accounting_update_sb(struct btree_trans *trans)
{
- for (struct jset_entry *i = trans->journal_entries;
- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
- i = vstruct_next(i))
- if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) {
- int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p);
- if (ret)
- return ret;
- }
+ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
+ i != btree_trans_subbuf_top(trans, &trans->accounting);
+ i = bkey_next(i)) {
+ int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
+ if (ret)
+ return ret;
+ }
return 0;
}
@@ -345,18 +390,18 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun
err:
free_percpu(n.v[1]);
free_percpu(n.v[0]);
- return -BCH_ERR_ENOMEM_disk_accounting;
+ return bch_err_throw(c, ENOMEM_disk_accounting);
}
int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
enum bch_accounting_mode mode)
{
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
if (mode != BCH_ACCOUNTING_read &&
accounting_to_replicas(&r.e, a.k->p) &&
!bch2_replicas_marked_locked(c, &r.e))
- return -BCH_ERR_btree_insert_need_mark_replicas;
+ return bch_err_throw(c, btree_insert_need_mark_replicas);
percpu_up_read(&c->mark_lock);
percpu_down_write(&c->mark_lock);
@@ -366,6 +411,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
return ret;
}
+int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
+ enum bch_accounting_mode mode)
+{
+ union bch_replicas_padded r;
+
+ if (mode != BCH_ACCOUNTING_read &&
+ accounting_to_replicas(&r.e, a.k->p) &&
+ !bch2_replicas_marked_locked(c, &r.e))
+ return bch_err_throw(c, btree_insert_need_mark_replicas);
+
+ return __bch2_accounting_mem_insert(c, a);
+}
+
static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
for (unsigned i = 0; i < e->nr_counters; i++)
@@ -415,10 +473,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
percpu_down_read(&c->mark_lock);
darray_for_each(acc->k, i) {
- struct {
+ union {
+ u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
+ BCH_BKEY_PTRS_MAX)];
struct bch_replicas_usage r;
- u8 pad[BCH_BKEY_PTRS_MAX];
} u;
+ u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;
if (!accounting_to_replicas(&u.r.r, i->pos))
continue;
@@ -499,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c)
sizeof(u64), GFP_KERNEL);
if (!e->v[1]) {
bch2_accounting_free_counters(acc, true);
- ret = -BCH_ERR_ENOMEM_disk_accounting;
+ ret = bch_err_throw(c, ENOMEM_disk_accounting);
break;
}
}
@@ -547,18 +607,20 @@ int bch2_gc_accounting_done(struct bch_fs *c)
prt_str(&buf, "accounting mismatch for ");
bch2_accounting_key_to_text(&buf, &acc_k);
- prt_str(&buf, ": got");
+ prt_str(&buf, ":\n got");
for (unsigned j = 0; j < nr; j++)
prt_printf(&buf, " %llu", dst_v[j]);
- prt_str(&buf, " should be");
+ prt_str(&buf, "\nshould be");
for (unsigned j = 0; j < nr; j++)
prt_printf(&buf, " %llu", src_v[j]);
for (unsigned j = 0; j < nr; j++)
src_v[j] -= dst_v[j];
- if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
+ bch2_trans_unlock_long(trans);
+
+ if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) {
percpu_up_write(&c->mark_lock);
ret = commit_do(trans, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
@@ -573,7 +635,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
accounting_key_init(&k_i.k, &acc_k, src_v, nr);
bch2_accounting_mem_mod_locked(trans,
bkey_i_to_s_c_accounting(&k_i.k),
- BCH_ACCOUNTING_normal);
+ BCH_ACCOUNTING_normal, true);
preempt_disable();
struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
@@ -602,23 +664,23 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
percpu_down_read(&c->mark_lock);
int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
- BCH_ACCOUNTING_read);
+ BCH_ACCOUNTING_read, false);
percpu_up_read(&c->mark_lock);
return ret;
}
static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
- struct disk_accounting_pos acc,
+ struct disk_accounting_pos *acc,
u64 *v, unsigned nr)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0, invalid_dev = -1;
- switch (acc.type) {
+ switch (acc->type) {
case BCH_DISK_ACCOUNTING_replicas: {
- struct bch_replicas_padded r;
- __accounting_to_replicas(&r.e, &acc);
+ union bch_replicas_padded r;
+ __accounting_to_replicas(&r.e, acc);
for (unsigned i = 0; i < r.e.nr_devs; i++)
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
@@ -635,9 +697,9 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
+ "accounting not marked in superblock replicas\n%s",
(printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &acc),
+ bch2_accounting_key_to_text(&buf, acc),
buf.buf))) {
/*
* We're not RW yet and still single threaded, dropping
@@ -653,8 +715,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
}
case BCH_DISK_ACCOUNTING_dev_data_type:
- if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
- invalid_dev = acc.dev_data_type.dev;
+ if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
+ invalid_dev = acc->dev_data_type.dev;
goto invalid_device;
}
break;
@@ -665,19 +727,19 @@ fsck_err:
return ret;
invalid_device:
if (fsck_err(trans, accounting_to_invalid_device,
- "accounting entry points to invalid device %i\n %s",
+ "accounting entry points to invalid device %i\n%s",
invalid_dev,
(printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &acc),
+ bch2_accounting_key_to_text(&buf, acc),
buf.buf))) {
for (unsigned i = 0; i < nr; i++)
v[i] = -v[i];
ret = commit_do(trans, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+ bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
-BCH_ERR_remove_disk_accounting_entry;
} else {
- ret = -BCH_ERR_remove_disk_accounting_entry;
+ ret = bch_err_throw(c, remove_disk_accounting_entry);
}
goto fsck_err;
}
@@ -725,9 +787,11 @@ int bch2_accounting_read(struct bch_fs *c)
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
break;
- if (!bch2_accounting_is_mem(acc_k)) {
- struct disk_accounting_pos next = { .type = acc_k.type + 1 };
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next;
+ memset(&next, 0, sizeof(next));
+ next.type = acc_k.type + 1;
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
continue;
}
@@ -745,7 +809,7 @@ int bch2_accounting_read(struct bch_fs *c)
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);
- if (!bch2_accounting_is_mem(acc_k))
+ if (!bch2_accounting_is_mem(&acc_k))
continue;
struct bkey_s_c k = bkey_i_to_s_c(i->k);
@@ -801,7 +865,7 @@ int bch2_accounting_read(struct bch_fs *c)
*/
ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
- : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters);
+ : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
free_percpu(i->v[0]);
@@ -835,8 +899,8 @@ int bch2_accounting_read(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
if (ca) {
struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
@@ -848,9 +912,9 @@ int bch2_accounting_read(struct bch_fs *c)
k.dev_data_type.data_type == BCH_DATA_journal)
usage->hidden += v[0] * ca->mi.bucket_size;
}
- rcu_read_unlock();
break;
}
+ }
}
preempt_enable();
fsck_err:
@@ -882,15 +946,13 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
{
struct bch_fs *c = ca->fs;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = BCH_DATA_free,
- };
u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
int ret = bch2_trans_do(c, ({
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
+ bch2_disk_accounting_mod2(trans, gc,
+ v, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = BCH_DATA_free) ?:
(!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
}));
bch_err_fn(c, ret);
@@ -916,9 +978,11 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
break;
- if (!bch2_accounting_is_mem(acc_k)) {
- struct disk_accounting_pos next = { .type = acc_k.type + 1 };
- bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next));
+ if (!bch2_accounting_is_mem(&acc_k)) {
+ struct disk_accounting_pos next;
+ memset(&next, 0, sizeof(next));
+ next.type = acc_k.type + 1;
+ bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
continue;
}
@@ -944,19 +1008,18 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type: {
- rcu_read_lock();
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
- if (!ca) {
- rcu_read_unlock();
- continue;
+ case BCH_DISK_ACCOUNTING_dev_data_type:
+ {
+ guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
+ if (!ca)
+ continue;
+
+ v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
+ v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
+ v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
}
- v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
- v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
- v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
- rcu_read_unlock();
-
if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
struct printbuf buf = PRINTBUF;
@@ -970,7 +1033,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c)
mismatch = true;
}
}
- }
0;
})));
diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h
index 5360cbb3ec29..d61abebf3e0b 100644
--- a/fs/bcachefs/disk_accounting.h
+++ b/fs/bcachefs/disk_accounting.h
@@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a)
static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
struct bkey_s_c_accounting src)
{
- EBUG_ON(dst->k.u64s != src.k->u64s);
-
- for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
+ for (unsigned i = 0;
+ i < min(bch2_accounting_counters(&dst->k),
+ bch2_accounting_counters(src.k));
+ i++)
dst->v.d[i] += src.v->d[i];
+
if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0)
dst->k.bversion = src.k->bversion;
}
@@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos
int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *,
s64 *, unsigned, bool);
+
+#define disk_accounting_key_init(_k, _type, ...) \
+do { \
+ memset(&(_k), 0, sizeof(_k)); \
+ (_k).type = BCH_DISK_ACCOUNTING_##_type; \
+ (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \
+} while (0)
+
+#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \
+({ \
+ struct disk_accounting_pos pos; \
+ disk_accounting_key_init(pos, __VA_ARGS__); \
+ bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \
+})
+
+#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) \
+ bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__)
+
int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
@@ -116,12 +136,13 @@ enum bch_accounting_mode {
};
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
+int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
+static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
{
- return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR &&
- acc.type != BCH_DISK_ACCOUNTING_inum;
+ return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR &&
+ acc->type != BCH_DISK_ACCOUNTING_inum;
}
/*
@@ -130,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc)
*/
static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
struct bkey_s_c_accounting a,
- enum bch_accounting_mode mode)
+ enum bch_accounting_mode mode,
+ bool write_locked)
{
struct bch_fs *c = trans->c;
struct bch_accounting_mem *acc = &c->accounting;
@@ -141,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
if (gc && !acc->gc_running)
return 0;
- if (!bch2_accounting_is_mem(acc_k))
+ if (!bch2_accounting_is_mem(&acc_k))
return 0;
if (mode == BCH_ACCOUNTING_normal) {
@@ -152,24 +174,28 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
case BCH_DISK_ACCOUNTING_replicas:
fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]);
break;
- case BCH_DISK_ACCOUNTING_dev_data_type:
- rcu_read_lock();
+ case BCH_DISK_ACCOUNTING_dev_data_type: {
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
if (ca) {
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]);
this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]);
}
- rcu_read_unlock();
break;
}
+ }
}
unsigned idx;
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
- int ret = bch2_accounting_mem_insert(c, a, mode);
+ int ret = 0;
+ if (unlikely(write_locked))
+ ret = bch2_accounting_mem_insert_locked(c, a, mode);
+ else
+ ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
}
@@ -186,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc)
{
percpu_down_read(&trans->c->mark_lock);
- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal);
+ int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false);
percpu_up_read(&trans->c->mark_lock);
return ret;
}
@@ -210,11 +236,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *
static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
u64 *v, unsigned nr)
{
+ percpu_down_read(&c->mark_lock);
struct bch_accounting_mem *acc = &c->accounting;
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &p);
bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
+ percpu_up_read(&c->mark_lock);
}
static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
@@ -231,13 +259,13 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans,
struct bkey_i_accounting *a,
unsigned commit_flags)
{
- a->k.bversion = journal_pos_to_bversion(&trans->journal_res,
- (u64 *) a - (u64 *) trans->journal_entries);
+ u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting);
+ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base);
EBUG_ON(bversion_zero(a->k.bversion));
return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))
- ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal)
+ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false)
: 0;
}
@@ -249,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans
struct bkey_s_accounting a = accounting_i_to_s(a_i);
bch2_accounting_neg(a);
- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal);
+ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false);
bch2_accounting_neg(a);
}
}
diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h
index 7b6e6c97e6aa..8269af1dbe2a 100644
--- a/fs/bcachefs/disk_accounting_format.h
+++ b/fs/bcachefs/disk_accounting_format.h
@@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
}
}
+/*
+ * field 1: name
+ * field 2: id
+ * field 3: number of counters (max 3)
+ */
+
#define BCH_DISK_ACCOUNTING_TYPES() \
- x(nr_inodes, 0) \
- x(persistent_reserved, 1) \
- x(replicas, 2) \
- x(dev_data_type, 3) \
- x(compression, 4) \
- x(snapshot, 5) \
- x(btree, 6) \
- x(rebalance_work, 7) \
- x(inum, 8)
+ x(nr_inodes, 0, 1) \
+ x(persistent_reserved, 1, 1) \
+ x(replicas, 2, 1) \
+ x(dev_data_type, 3, 3) \
+ x(compression, 4, 3) \
+ x(snapshot, 5, 1) \
+ x(btree, 6, 1) \
+ x(rebalance_work, 7, 1) \
+ x(inum, 8, 3)
enum disk_accounting_type {
-#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr,
+#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
BCH_DISK_ACCOUNTING_TYPES()
#undef x
BCH_DISK_ACCOUNTING_TYPE_NR,
};
-struct bch_nr_inodes {
+/*
+ * No subtypes - number of inodes in the entire filesystem
+ *
+ * XXX: perhaps we could add a per-subvolume counter?
+ */
+struct bch_acct_nr_inodes {
};
-struct bch_persistent_reserved {
+/*
+ * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the
+ * reservation:
+ */
+struct bch_acct_persistent_reserved {
__u8 nr_replicas;
};
-struct bch_dev_data_type {
+/*
+ * device, data type counter fields:
+ * [
+ * nr_buckets
+ * live sectors (in buckets of that data type)
+ * sectors of internal fragmentation
+ * ]
+ *
+ * XXX: live sectors should've been done differently, you can have multiple data
+ * types in the same bucket (user, stripe, cached) and this collapses them to
+ * the bucket data type, and makes the internal fragmentation counter redundant
+ */
+struct bch_acct_dev_data_type {
__u8 dev;
__u8 data_type;
};
+/*
+ * Compression type fields:
+ * [
+ * number of extents
+ * uncompressed size
+ * compressed size
+ * ]
+ *
+ * Compression ratio, average extent size (fragmentation).
+ */
struct bch_acct_compression {
__u8 type;
};
+/*
+ * On disk usage by snapshot id; counts same values as replicas counter, but
+ * aggregated differently
+ */
struct bch_acct_snapshot {
__u32 id;
} __packed;
@@ -137,10 +178,27 @@ struct bch_acct_btree {
__u32 id;
} __packed;
+/*
+ * inum counter fields:
+ * [
+ * number of extents
+ * sum of extent sizes - bkey size
+ * this field is similar to inode.bi_sectors, except here extents in
+ * different snapshots but the same inode number are all collapsed to the
+ * same counter
+ * sum of on disk size - same values tracked by replicas counters
+ * ]
+ *
+ * This tracks on disk fragmentation.
+ */
struct bch_acct_inum {
__u64 inum;
} __packed;
+/*
+ * Simple counter of the amount of data (on disk sectors) rebalance needs to
+ * move, extents counted here are also in the rebalance_work btree.
+ */
struct bch_acct_rebalance_work {
};
@@ -149,10 +207,10 @@ struct disk_accounting_pos {
struct {
__u8 type;
union {
- struct bch_nr_inodes nr_inodes;
- struct bch_persistent_reserved persistent_reserved;
+ struct bch_acct_nr_inodes nr_inodes;
+ struct bch_acct_persistent_reserved persistent_reserved;
struct bch_replicas_entry_v1 replicas;
- struct bch_dev_data_type dev_data_type;
+ struct bch_acct_dev_data_type dev_data_type;
struct bch_acct_compression compression;
struct bch_acct_snapshot snapshot;
struct bch_acct_btree btree;
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 5df8de0b8c02..cde842ac1886 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -86,35 +86,6 @@ err:
return ret;
}
-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
-{
- out->atomic++;
- rcu_read_lock();
-
- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- if (!g)
- goto out;
-
- for (unsigned i = 0; i < g->nr; i++) {
- if (i)
- prt_printf(out, " ");
-
- if (g->entries[i].deleted) {
- prt_printf(out, "[deleted]");
- continue;
- }
-
- prt_printf(out, "[parent %d devs", g->entries[i].parent);
- for_each_member_device_rcu(c, ca, &g->entries[i].devs)
- prt_printf(out, " %s", ca->name);
- prt_printf(out, "]");
- }
-
-out:
- rcu_read_unlock();
- out->atomic--;
-}
-
static void bch2_sb_disk_groups_to_text(struct printbuf *out,
struct bch_sb *sb,
struct bch_sb_field *f)
@@ -159,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
if (!cpu_g)
- return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
+ return bch_err_throw(c, ENOMEM_disk_groups_to_cpu);
cpu_g->nr = nr_groups;
@@ -199,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
- struct bch_devs_mask *devs;
- rcu_read_lock();
+ guard(rcu)();
switch (t.type) {
case TARGET_NULL:
- devs = NULL;
- break;
+ return NULL;
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
- devs = ca ? &ca->self : NULL;
- break;
+ return ca ? &ca->self : NULL;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
- devs = g && t.group < g->nr && !g->entries[t.group].deleted
+ return g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
- break;
}
default:
BUG();
}
-
- rcu_read_unlock();
-
- return devs;
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
@@ -241,20 +204,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
- struct bch_disk_groups_cpu *g;
- const struct bch_devs_mask *m;
- bool ret;
-
- rcu_read_lock();
- g = rcu_dereference(c->disk_groups);
- m = g && t.group < g->nr && !g->entries[t.group].deleted
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+ const struct bch_devs_mask *m =
+ g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
- ret = m ? test_bit(dev, m->d) : false;
- rcu_read_unlock();
-
- return ret;
+ return m ? test_bit(dev, m->d) : false;
}
default:
BUG();
@@ -377,54 +333,79 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
return v;
}
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g,
+ unsigned v)
{
- struct bch_disk_groups_cpu *groups;
- struct bch_disk_group_cpu *g;
- unsigned nr = 0;
u16 path[32];
-
- out->atomic++;
- rcu_read_lock();
- groups = rcu_dereference(c->disk_groups);
- if (!groups)
- goto invalid;
+ unsigned nr = 0;
while (1) {
if (nr == ARRAY_SIZE(path))
goto invalid;
- if (v >= groups->nr)
+ if (v >= (g ? g->nr : 0))
goto invalid;
- g = groups->entries + v;
+ struct bch_disk_group_cpu *e = g->entries + v;
- if (g->deleted)
+ if (e->deleted)
goto invalid;
path[nr++] = v;
- if (!g->parent)
+ if (!e->parent)
break;
- v = g->parent - 1;
+ v = e->parent - 1;
}
while (nr) {
- v = path[--nr];
- g = groups->entries + v;
+ struct bch_disk_group_cpu *e = g->entries + path[--nr];
- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+ prt_printf(out, "%.*s", (int) sizeof(e->label), e->label);
if (nr)
prt_printf(out, ".");
}
-out:
- rcu_read_unlock();
- out->atomic--;
return;
invalid:
prt_printf(out, "invalid label %u", v);
- goto out;
+}
+
+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ bch2_printbuf_make_room(out, 4096);
+
+ out->atomic++;
+ guard(rcu)();
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+ for (unsigned i = 0; i < (g ? g->nr : 0); i++) {
+ prt_printf(out, "%2u: ", i);
+
+ if (g->entries[i].deleted) {
+ prt_printf(out, "[deleted]");
+ goto next;
+ }
+
+ __bch2_disk_path_to_text(out, g, i);
+
+ prt_printf(out, " devs");
+
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
+ prt_printf(out, " %s", ca->name);
+next:
+ prt_newline(out);
+ }
+
+ out->atomic--;
+}
+
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+ out->atomic++;
+ guard(rcu)();
+ __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v),
+ --out->atomic;
}
void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
@@ -470,23 +451,22 @@ inval:
int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
- struct bch_member *mi;
- int ret, v = -1;
+ lockdep_assert_held(&c->sb_lock);
- if (!strlen(name) || !strcmp(name, "none"))
- return 0;
- v = bch2_disk_path_find_or_create(&c->disk_sb, name);
- if (v < 0)
- return v;
+ if (!strlen(name) || !strcmp(name, "none")) {
+ struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_GROUP(mi, 0);
+ } else {
+ int v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+ if (v < 0)
+ return v;
- ret = bch2_sb_disk_groups_to_cpu(c);
- if (ret)
- return ret;
+ struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_GROUP(mi, v + 1);
+ }
- mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
- SET_BCH_MEMBER_GROUP(mi, v + 1);
- return 0;
+ return bch2_sb_disk_groups_to_cpu(c);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
@@ -545,32 +525,27 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
switch (t.type) {
case TARGET_NULL:
prt_printf(out, "none");
- break;
+ return;
case TARGET_DEV: {
- struct bch_dev *ca;
-
out->atomic++;
- rcu_read_lock();
- ca = t.dev < c->sb.nr_devices
+ guard(rcu)();
+ struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
- if (ca && percpu_ref_tryget(&ca->io_ref)) {
+ if (ca && ca->disk_sb.bdev)
prt_printf(out, "/dev/%s", ca->name);
- percpu_ref_put(&ca->io_ref);
- } else if (ca) {
+ else if (ca)
prt_printf(out, "offline device %u", t.dev);
- } else {
+ else
prt_printf(out, "invalid device %u", t.dev);
- }
- rcu_read_unlock();
out->atomic--;
- break;
+ return;
}
case TARGET_GROUP:
bch2_disk_path_to_text(out, c, t.group);
- break;
+ return;
default:
BUG();
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d2a5e76e6479..543dbba9b14f 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -16,10 +16,12 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_write.h"
#include "keylist.h"
+#include "lru.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
@@ -104,6 +106,8 @@ struct ec_bio {
struct bch_dev *ca;
struct ec_stripe_buf *buf;
size_t idx;
+ int rw;
+ u64 submit_time;
struct bio bio;
};
@@ -209,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->stripe, s.k->p.offset,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -220,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
} else {
@@ -230,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bucket.inode, bucket.offset, a->gen,
a->stripe,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -240,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
bch2_data_type_str(a->data_type),
bch2_data_type_str(data_type),
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -252,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
a->dirty_sectors,
a->cached_sectors,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
}
@@ -291,25 +295,37 @@ static int mark_stripe_bucket(struct btree_trans *trans,
struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev);
if (unlikely(!ca)) {
if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
if (flags & BTREE_TRIGGER_transactional) {
+ struct extent_ptr_decoded p = {
+ .ptr = *ptr,
+ .crc = bch2_extent_crc_unpack(s.k, NULL),
+ };
+ struct bkey_i_backpointer bp;
+ bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p,
+ (const union bch_extent_entry *) ptr, &bp);
+
struct bkey_i_alloc_v4 *a =
bch2_trans_start_alloc_update(trans, bucket, 0);
- ret = PTR_ERR_OR_ZERO(a) ?:
- __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags);
+ ret = PTR_ERR_OR_ZERO(a) ?:
+ __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?:
+ bch2_bucket_backpointer_mod(trans, s.s_c, &bp,
+ !(flags & BTREE_TRIGGER_overwrite));
+ if (ret)
+ goto err;
}
if (flags & BTREE_TRIGGER_gc) {
struct bucket *g = gc_bucket(ca, bucket.offset);
- if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s",
+ if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s",
ptr->dev,
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- ret = -BCH_ERR_mark_stripe;
+ ret = bch_err_throw(c, mark_stripe);
goto err;
}
@@ -366,19 +382,6 @@ static int mark_stripe_buckets(struct btree_trans *trans,
return 0;
}
-static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s)
-{
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->disk_label = s->disk_label;
- m->blocks_nonempty = 0;
-
- for (unsigned i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-}
-
int bch2_trigger_stripe(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s _new,
@@ -399,6 +402,15 @@ int bch2_trigger_stripe(struct btree_trans *trans,
(new_s->nr_blocks != old_s->nr_blocks ||
new_s->nr_redundant != old_s->nr_redundant));
+ if (flags & BTREE_TRIGGER_transactional) {
+ int ret = bch2_lru_change(trans,
+ BCH_LRU_STRIPE_FRAGMENTATION,
+ idx,
+ stripe_lru_pos(old_s),
+ stripe_lru_pos(new_s));
+ if (ret)
+ return ret;
+ }
if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) {
/*
@@ -416,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
if (!gc) {
bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
+ return bch_err_throw(c, ENOMEM_mark_stripe);
}
/*
@@ -443,24 +455,25 @@ int bch2_trigger_stripe(struct btree_trans *trans,
if (new_s) {
s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, new);
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
return ret;
if (gc)
- memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas));
+ unsafe_memcpy(&gc->r.e, &acc.replicas,
+ replicas_entry_bytes(&acc.replicas), "VLA");
}
if (old_s) {
s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, old);
int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
if (ret)
@@ -472,38 +485,6 @@ int bch2_trigger_stripe(struct btree_trans *trans,
return ret;
}
- if (flags & BTREE_TRIGGER_atomic) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- stripe_to_mem(m, new_s);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- }
-
return 0;
}
@@ -527,20 +508,14 @@ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
{
- switch (k.k->type) {
- case KEY_TYPE_extent: {
- struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
- const union bch_extent_entry *entry;
-
- extent_for_each_entry(e, entry)
- if (extent_entry_type(entry) ==
- BCH_EXTENT_ENTRY_stripe_ptr &&
- entry->stripe_ptr.idx == idx)
- return true;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
- break;
- }
- }
+ bkey_extent_entry_for_each(ptrs, entry)
+ if (extent_entry_type(entry) ==
+ BCH_EXTENT_ENTRY_stripe_ptr &&
+ entry->stripe_ptr.idx == idx)
+ return true;
return false;
}
@@ -561,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
}
/* XXX: this is a non-mempoolified memory allocation: */
-static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+static int ec_stripe_buf_init(struct bch_fs *c,
+ struct ec_stripe_buf *buf,
unsigned offset, unsigned size)
{
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v;
@@ -589,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
return 0;
err:
ec_stripe_buf_exit(buf);
- return -BCH_ERR_ENOMEM_stripe_buf;
+ return bch_err_throw(c, ENOMEM_stripe_buf);
}
/* Checksumming: */
@@ -725,15 +701,20 @@ static void ec_block_endio(struct bio *bio)
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
+ int rw = ec_bio->rw;
+ unsigned ref = rw == READ
+ ? BCH_DEV_READ_REF_ec_block
+ : BCH_DEV_WRITE_REF_ec_block;
+
+ bch2_account_io_completion(ca, bio_data_dir(bio),
+ ec_bio->submit_time, !bio->bi_status);
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "erasure coding %s error: %s",
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "erasure coding %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
clear_bit(ec_bio->idx, ec_bio->buf->valid);
+ }
int stale = dev_ptr_stale(ca, ptr);
if (stale) {
@@ -745,7 +726,7 @@ static void ec_block_endio(struct bio *bio)
}
bio_put(&ec_bio->bio);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[rw], ref);
closure_put(cl);
}
@@ -759,8 +740,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
? BCH_DATA_user
: BCH_DATA_parity;
int rw = op_is_write(opf);
+ unsigned ref = rw == READ
+ ? BCH_DEV_READ_REF_ec_block
+ : BCH_DEV_WRITE_REF_ec_block;
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref);
if (!ca) {
clear_bit(idx, buf->valid);
return;
@@ -796,6 +780,8 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
ec_bio->ca = ca;
ec_bio->buf = buf;
ec_bio->idx = idx;
+ ec_bio->rw = rw;
+ ec_bio->submit_time = local_clock();
ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9);
ec_bio->bio.bi_end_io = ec_block_endio;
@@ -804,14 +790,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b);
closure_get(cl);
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_get(&ca->io_ref[rw], ref);
submit_bio(&ec_bio->bio);
offset += b;
}
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[rw], ref);
}
static int get_stripe_key_trans(struct btree_trans *trans, u64 idx,
@@ -855,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
buf = kzalloc(sizeof(*buf), GFP_NOFS);
if (!buf)
- return -BCH_ERR_ENOMEM_ec_read_extent;
+ return bch_err_throw(c, ENOMEM_ec_read_extent);
ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
if (ret) {
@@ -876,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
goto err;
}
- ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+ ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio));
if (ret) {
msg = "-ENOMEM";
goto err;
@@ -909,7 +895,7 @@ err:
bch_err_ratelimited(c,
"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
printbuf_exit(&msgbuf);
- ret = -BCH_ERR_stripe_reconstruct;
+ ret = bch_err_throw(c, stripe_reconstruct);
goto out;
}
@@ -917,29 +903,9 @@ err:
static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
{
- ec_stripes_heap n, *h = &c->ec_stripes_heap;
-
- if (idx >= h->size) {
- if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- if (n.size > h->size) {
- memcpy(n.data, h->data, h->nr * sizeof(h->data[0]));
- n.nr = h->nr;
- swap(*h, n);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- free_heap(&n);
- }
-
- if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
-
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
+ return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc);
return 0;
}
@@ -1009,188 +975,58 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
s->idx = 0;
}
-/* Heap of all existing stripes, ordered by blocks_nonempty */
-
-static u64 stripe_idx_to_delete(struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
-
- lockdep_assert_held(&c->ec_stripes_heap_lock);
-
- if (h->nr &&
- h->data[0].blocks_nonempty == 0 &&
- !bch2_stripe_is_open(c, h->data[0].idx))
- return h->data[0].idx;
-
- return 0;
-}
-
-static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
- size_t i)
-{
- struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
-
- genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
-}
-
-static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
-
- return ((_l->blocks_nonempty > _r->blocks_nonempty) <
- (_l->blocks_nonempty < _r->blocks_nonempty));
-}
-
-static inline void ec_stripes_heap_swap(void *l, void *r, void *h)
-{
- struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l;
- struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r;
- ec_stripes_heap *_h = (ec_stripes_heap *)h;
- size_t i = _l - _h->data;
- size_t j = _r - _h->data;
-
- swap(*_l, *_r);
-
- ec_stripes_heap_set_backpointer(_h, i);
- ec_stripes_heap_set_backpointer(_h, j);
-}
-
-static const struct min_heap_callbacks callbacks = {
- .less = ec_stripes_heap_cmp,
- .swp = ec_stripes_heap_swap,
-};
-
-static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- BUG_ON(m->heap_idx >= h->nr);
- BUG_ON(h->data[m->heap_idx].idx != idx);
-}
-
-void bch2_stripes_heap_del(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_insert(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- mutex_lock(&c->ec_stripes_heap_lock);
- BUG_ON(min_heap_full(&c->ec_stripes_heap));
-
- genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr;
- min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) {
- .idx = idx,
- .blocks_nonempty = m->blocks_nonempty,
- }),
- &callbacks,
- &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
- mutex_unlock(&c->ec_stripes_heap_lock);
-}
-
-void bch2_stripes_heap_update(struct bch_fs *c,
- struct stripe *m, size_t idx)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- bool do_deletes;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- heap_verify_backpointer(c, idx);
-
- h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty;
-
- i = m->heap_idx;
- min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap);
- min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap);
-
- heap_verify_backpointer(c, idx);
-
- do_deletes = stripe_idx_to_delete(c) != 0;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (do_deletes)
- bch2_do_stripe_deletes(c);
-}
-
/* stripe deletion */
static int ec_stripe_delete(struct btree_trans *trans, u64 idx)
{
- struct bch_fs *c = trans->c;
struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_stripe s;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx),
- BTREE_ITER_intent);
- ret = bkey_err(k);
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != KEY_TYPE_stripe) {
- bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx);
- ret = -EINVAL;
- goto err;
- }
-
- s = bkey_s_c_to_stripe(k);
- for (unsigned i = 0; i < s.v->nr_blocks; i++)
- if (stripe_blockcount_get(s.v, i)) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf);
- printbuf_exit(&buf);
- ret = -EINVAL;
- goto err;
- }
-
- ret = bch2_btree_delete_at(trans, &iter, 0);
+ /*
+ * We expect write buffer races here
+ * Important: check stripe_is_open with stripe key locked:
+ */
+ if (k.k->type == KEY_TYPE_stripe &&
+ !bch2_stripe_is_open(trans->c, idx) &&
+ stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1)
+ ret = bch2_btree_delete_at(trans, &iter, 0);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+/*
+ * XXX
+ * can we kill this and delete stripes from the trigger?
+ */
static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- while (1) {
- mutex_lock(&c->ec_stripes_heap_lock);
- u64 idx = stripe_idx_to_delete(c);
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- if (!idx)
- break;
-
- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_delete(trans, idx));
- bch_err_fn(c, ret);
- if (ret)
- break;
- }
-
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+ bch2_trans_run(c,
+ bch2_btree_write_buffer_tryflush(trans) ?:
+ for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX),
+ 0, lru_k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ec_stripe_delete(trans, lru_k.k->p.offset);
+ })));
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
}
void bch2_do_stripe_deletes(struct bch_fs *c)
{
- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
+ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) &&
!queue_work(c->write_ref_wq, &c->ec_stripe_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
}
/* stripe creation: */
@@ -1294,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
printbuf_exit(&buf);
- return -EIO;
+ return bch_err_throw(c, erasure_coding_found_btree_node);
}
k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed);
@@ -1360,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev);
if (!ca)
- return -EIO;
+ return bch_err_throw(c, ENOENT_dev_not_found);
struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr);
@@ -1380,8 +1216,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
if (bp_k.k->type != KEY_TYPE_backpointer)
continue;
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
+ if (bp.v->btree_id == BTREE_ID_stripes)
+ continue;
+
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
- bkey_s_c_to_backpointer(bp_k), &last_flushed);
+ bp, &last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, c);
@@ -1393,21 +1233,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
- unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
- int ret = 0;
+ unsigned nr_data = v->nr_blocks - v->nr_redundant;
- ret = bch2_btree_write_buffer_flush_sync(trans);
+ int ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
- for (i = 0; i < nr_data; i++) {
+ for (unsigned i = 0; i < nr_data; i++) {
ret = ec_stripe_update_bucket(trans, s, i);
if (ret)
break;
}
err:
bch2_trans_put(trans);
-
return ret;
}
@@ -1416,9 +1254,10 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
unsigned block,
struct open_bucket *ob)
{
- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE,
+ BCH_DEV_WRITE_REF_ec_bucket_zero);
if (!ca) {
- s->err = -BCH_ERR_erofs_no_writes;
+ s->err = bch_err_throw(c, erofs_no_writes);
return;
}
@@ -1432,7 +1271,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c,
ob->sectors_free,
GFP_KERNEL, 0);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero);
if (ret)
s->err = ret;
@@ -1473,6 +1312,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (s->err) {
if (!bch2_err_matches(s->err, EROFS))
bch_err(c, "error creating stripe: error writing data buckets");
+ ret = s->err;
goto err;
}
@@ -1481,6 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_do_recov(c, &s->existing_stripe)) {
bch_err(c, "error creating stripe: error reading existing stripe");
+ ret = bch_err_throw(c, ec_block_read);
goto err;
}
@@ -1506,6 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ec_nr_failed(&s->new_stripe)) {
bch_err(c, "error creating stripe: error writing redundancy buckets");
+ ret = bch_err_throw(c, ec_block_write);
goto err;
}
@@ -1527,6 +1369,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
if (ret)
goto err;
err:
+ trace_stripe_create(c, s->idx, ret);
+
bch2_disk_reservation_put(c, &s->res);
for (i = 0; i < v->nr_blocks; i++)
@@ -1577,15 +1421,15 @@ static void ec_stripe_create_work(struct work_struct *work)
while ((s = get_pending_stripe(c)))
ec_stripe_create(s);
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
}
void bch2_ec_do_stripe_creates(struct bch_fs *c)
{
- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create);
if (!queue_work(system_long_wq, &c->ec_stripe_create_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create);
}
static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
@@ -1612,11 +1456,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int
ec_stripe_new_set_pending(c, h);
}
-void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err)
{
struct ec_stripe_new *s = ob->ec;
- s->err = -EIO;
+ s->err = err;
}
void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
@@ -1735,26 +1579,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str
static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h)
{
struct bch_devs_mask devs = h->devs;
+ unsigned nr_devs, nr_devs_with_durability;
- rcu_read_lock();
- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
- ? group_to_target(h->disk_label - 1)
- : 0);
- unsigned nr_devs = dev_mask_nr(&h->devs);
-
- for_each_member_device_rcu(c, ca, &h->devs)
- if (!ca->mi.durability)
- __clear_bit(ca->dev_idx, h->devs.d);
- unsigned nr_devs_with_durability = dev_mask_nr(&h->devs);
+ scoped_guard(rcu) {
+ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label
+ ? group_to_target(h->disk_label - 1)
+ : 0);
+ nr_devs = dev_mask_nr(&h->devs);
- h->blocksize = pick_blocksize(c, &h->devs);
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (!ca->mi.durability)
+ __clear_bit(ca->dev_idx, h->devs.d);
+ nr_devs_with_durability = dev_mask_nr(&h->devs);
- h->nr_active_devs = 0;
- for_each_member_device_rcu(c, ca, &h->devs)
- if (ca->mi.bucket_size == h->blocksize)
- h->nr_active_devs++;
+ h->blocksize = pick_blocksize(c, &h->devs);
- rcu_read_unlock();
+ h->nr_active_devs = 0;
+ for_each_member_device_rcu(c, ca, &h->devs)
+ if (ca->mi.bucket_size == h->blocksize)
+ h->nr_active_devs++;
+ }
/*
* If we only have redundancy + 1 devices, we're better off with just
@@ -1875,23 +1719,32 @@ err:
}
static int new_stripe_alloc_buckets(struct btree_trans *trans,
+ struct alloc_request *req,
struct ec_stripe_head *h, struct ec_stripe_new *s,
- enum bch_watermark watermark, struct closure *cl)
+ struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct bch_devs_mask devs = h->devs;
struct open_bucket *ob;
- struct open_buckets buckets;
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
- bool have_cache = true;
int ret = 0;
+ req->scratch_data_type = req->data_type;
+ req->scratch_ptrs = req->ptrs;
+ req->scratch_nr_replicas = req->nr_replicas;
+ req->scratch_nr_effective = req->nr_effective;
+ req->scratch_have_cache = req->have_cache;
+ req->scratch_devs_may_alloc = req->devs_may_alloc;
+
+ req->devs_may_alloc = h->devs;
+ req->have_cache = true;
+
BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity);
BUG_ON(v->nr_redundant != s->nr_parity);
/* * We bypass the sector allocator which normally does this: */
- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
+ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d,
+ c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) {
/*
@@ -1901,7 +1754,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
* block when updating the stripe
*/
if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
- __clear_bit(v->ptrs[i].dev, devs.d);
+ __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d);
if (i < s->nr_data)
nr_have_data++;
@@ -1912,95 +1765,94 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans,
BUG_ON(nr_have_data > s->nr_data);
BUG_ON(nr_have_parity > s->nr_parity);
- buckets.nr = 0;
+ req->ptrs.nr = 0;
if (nr_have_parity < s->nr_parity) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->parity_stripe,
- &devs,
- s->nr_parity,
- &nr_have_parity,
- &have_cache, 0,
- BCH_DATA_parity,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
+ req->nr_replicas = s->nr_parity;
+ req->nr_effective = nr_have_parity;
+ req->data_type = BCH_DATA_parity;
+
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl);
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data + s->nr_parity,
s->nr_data);
BUG_ON(j >= s->nr_data + s->nr_parity);
- s->blocks[j] = buckets.v[i];
+ s->blocks[j] = req->ptrs.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, s->blocks_gotten);
}
if (ret)
- return ret;
+ goto err;
}
- buckets.nr = 0;
+ req->ptrs.nr = 0;
if (nr_have_data < s->nr_data) {
- ret = bch2_bucket_alloc_set_trans(trans, &buckets,
- &h->block_stripe,
- &devs,
- s->nr_data,
- &nr_have_data,
- &have_cache, 0,
- BCH_DATA_user,
- watermark,
- cl);
-
- open_bucket_for_each(c, &buckets, ob, i) {
+ req->nr_replicas = s->nr_data;
+ req->nr_effective = nr_have_data;
+ req->data_type = BCH_DATA_user;
+
+ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl);
+
+ open_bucket_for_each(c, &req->ptrs, ob, i) {
j = find_next_zero_bit(s->blocks_gotten,
s->nr_data, 0);
BUG_ON(j >= s->nr_data);
- s->blocks[j] = buckets.v[i];
+ s->blocks[j] = req->ptrs.v[i];
v->ptrs[j] = bch2_ob_ptr(c, ob);
__set_bit(j, s->blocks_gotten);
}
if (ret)
- return ret;
+ goto err;
}
-
- return 0;
+err:
+ req->data_type = req->scratch_data_type;
+ req->ptrs = req->scratch_ptrs;
+ req->nr_replicas = req->scratch_nr_replicas;
+ req->nr_effective = req->scratch_nr_effective;
+ req->have_cache = req->scratch_have_cache;
+ req->devs_may_alloc = req->scratch_devs_may_alloc;
+ return ret;
}
-static s64 get_existing_stripe(struct bch_fs *c,
- struct ec_stripe_head *head)
+static int __get_existing_stripe(struct btree_trans *trans,
+ struct ec_stripe_head *head,
+ struct ec_stripe_buf *stripe,
+ u64 idx)
{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t heap_idx;
- u64 stripe_idx;
- s64 ret = -1;
-
- if (may_create_new_stripe(c))
- return -1;
+ struct bch_fs *c = trans->c;
- mutex_lock(&c->ec_stripes_heap_lock);
- for (heap_idx = 0; heap_idx < h->nr; heap_idx++) {
- /* No blocks worth reusing, stripe will just be deleted: */
- if (!h->data[heap_idx].blocks_nonempty)
- continue;
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter,
+ BTREE_ID_stripes, POS(0, idx), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
- stripe_idx = h->data[heap_idx].idx;
+ /* We expect write buffer races here */
+ if (k.k->type != KEY_TYPE_stripe)
+ goto out;
- m = genradix_ptr(&c->stripes, stripe_idx);
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+ if (stripe_lru_pos(s.v) <= 1)
+ goto out;
- if (m->disk_label == head->disk_label &&
- m->algorithm == head->algo &&
- m->nr_redundant == head->redundancy &&
- m->sectors == head->blocksize &&
- m->blocks_nonempty < m->nr_blocks - m->nr_redundant &&
- bch2_try_open_stripe(c, head->s, stripe_idx)) {
- ret = stripe_idx;
- break;
- }
+ if (s.v->disk_label == head->disk_label &&
+ s.v->algorithm == head->algo &&
+ s.v->nr_redundant == head->redundancy &&
+ le16_to_cpu(s.v->sectors) == head->blocksize &&
+ bch2_try_open_stripe(c, head->s, idx)) {
+ bkey_reassemble(&stripe->key, k);
+ ret = 1;
}
- mutex_unlock(&c->ec_stripes_heap_lock);
+out:
+ bch2_set_btree_iter_dontneed(trans, &iter);
+err:
+ bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -2014,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new
s->nr_data = existing_v->nr_blocks -
existing_v->nr_redundant;
- int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
+ int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors));
if (ret) {
bch2_stripe_close(c, s);
return ret;
@@ -2052,24 +1904,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
struct ec_stripe_new *s)
{
struct bch_fs *c = trans->c;
- s64 idx;
- int ret;
/*
* If we can't allocate a new stripe, and there's no stripes with empty
* blocks for us to reuse, that means we have to wait on copygc:
*/
- idx = get_existing_stripe(c, h);
- if (idx < 0)
- return -BCH_ERR_stripe_alloc_blocked;
+ if (may_create_new_stripe(c))
+ return -1;
- ret = get_stripe_key_trans(trans, idx, &s->existing_stripe);
- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
- "reading stripe key: %s", bch2_err_str(ret));
- if (ret) {
- bch2_stripe_close(c, s);
- return ret;
+ struct btree_iter lru_iter;
+ struct bkey_s_c lru_k;
+ int ret = 0;
+
+ for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru,
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0),
+ lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX),
+ 0, lru_k, ret) {
+ ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset);
+ if (ret)
+ break;
}
+ bch2_trans_iter_exit(trans, &lru_iter);
+ if (!ret)
+ ret = bch_err_throw(c, stripe_alloc_blocked);
+ if (ret == 1)
+ ret = 0;
+ if (ret)
+ return ret;
return init_new_stripe_from_existing(c, s);
}
@@ -2102,11 +1963,11 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st
if (bkey_gt(k.k->p, POS(0, U32_MAX))) {
if (start_pos.offset) {
start_pos = min_pos;
- bch2_btree_iter_set_pos(&iter, start_pos);
+ bch2_btree_iter_set_pos(trans, &iter, start_pos);
continue;
}
- ret = -BCH_ERR_ENOSPC_stripe_create;
+ ret = bch_err_throw(c, ENOSPC_stripe_create);
break;
}
@@ -2136,17 +1997,15 @@ err:
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
- unsigned target,
+ struct alloc_request *req,
unsigned algo,
- unsigned redundancy,
- enum bch_watermark watermark,
struct closure *cl)
{
struct bch_fs *c = trans->c;
- struct ec_stripe_head *h;
- bool waiting = false;
+ unsigned redundancy = req->nr_replicas - 1;
unsigned disk_label = 0;
- struct target t = target_decode(target);
+ struct target t = target_decode(req->target);
+ bool waiting = false;
int ret;
if (t.type == TARGET_GROUP) {
@@ -2157,14 +2016,16 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
disk_label = t.group + 1; /* 0 == no label */
}
- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark);
+ struct ec_stripe_head *h =
+ __bch2_ec_stripe_head_get(trans, disk_label, algo,
+ redundancy, req->watermark);
if (IS_ERR_OR_NULL(h))
return h;
if (!h->s) {
h->s = ec_new_stripe_alloc(c, h);
if (!h->s) {
- ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc;
+ ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc);
bch_err(c, "failed to allocate new stripe");
goto err;
}
@@ -2181,8 +2042,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
goto alloc_existing;
/* First, try to allocate a full stripe: */
- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?:
+ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe;
+ swap(req->watermark, saved_watermark);
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
+ swap(req->watermark, saved_watermark);
+
if (!ret)
goto allocate_buf;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
@@ -2200,8 +2065,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked)
goto err;
- if (watermark == BCH_WATERMARK_copygc) {
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?:
+ if (req->watermark == BCH_WATERMARK_copygc) {
+ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?:
__bch2_ec_stripe_head_reserve(trans, h, s);
if (ret)
goto err;
@@ -2220,12 +2085,12 @@ alloc_existing:
* Retry allocating buckets, with the watermark for this
* particular write:
*/
- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl);
+ ret = new_stripe_alloc_buckets(trans, req, h, s, cl);
if (ret)
goto err;
allocate_buf:
- ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize);
+ ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize);
if (ret)
goto err;
@@ -2242,67 +2107,107 @@ err:
/* device removal */
-static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a)
+int bch2_invalidate_stripe_to_dev(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k,
+ unsigned dev_idx,
+ unsigned flags)
{
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
-
- if (!a->stripe)
+ if (k.k->type != KEY_TYPE_stripe)
return 0;
- if (a->stripe_sectors) {
- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data");
- return -BCH_ERR_invalidate_stripe_to_dev;
- }
-
- struct btree_iter iter;
+ struct bch_fs *c = trans->c;
struct bkey_i_stripe *s =
- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
- BTREE_ITER_slots, stripe);
+ bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe);
int ret = PTR_ERR_OR_ZERO(s);
if (ret)
return ret;
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_replicas,
- };
+ struct disk_accounting_pos acc;
s64 sectors = 0;
for (unsigned i = 0; i < s->v.nr_blocks; i++)
sectors -= stripe_blockcount_get(&s->v, i);
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
if (ret)
- goto err;
+ return ret;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i));
- bkey_for_each_ptr(ptrs, ptr)
- if (ptr->dev == k_a.k->p.inode)
- ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ /* XXX: how much redundancy do we still have? check degraded flags */
+
+ unsigned nr_good = 0;
+
+ scoped_guard(rcu)
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == dev_idx)
+ ptr->dev = BCH_SB_MEMBER_INVALID;
+
+ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed;
+ }
+
+ if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+ return bch_err_throw(c, remove_would_lose_data);
+
+ unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant;
+
+ if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST))
+ return bch_err_throw(c, remove_would_lose_data);
sectors = -sectors;
+ memset(&acc, 0, sizeof(acc));
+ acc.type = BCH_DISK_ACCOUNTING_replicas;
bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i));
acc.replicas.data_type = BCH_DATA_user;
- ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+ return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false);
+}
+
+static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a,
+ unsigned flags)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert);
+
+ if (!a->stripe)
+ return 0;
+
+ if (a->stripe_sectors) {
+ struct bch_fs *c = trans->c;
+ bch_err(c, "trying to invalidate device in stripe when bucket has stripe data");
+ return bch_err_throw(c, invalidate_stripe_to_dev);
+ }
+
+ struct btree_iter iter;
+ struct bkey_s_c_stripe s =
+ bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe),
+ BTREE_ITER_slots, stripe);
+ int ret = bkey_err(s);
if (ret)
- goto err;
-err:
+ return ret;
+
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx)
+int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
- return bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_max_commit(trans, iter,
BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX),
BTREE_ITER_intent, k,
NULL, NULL, 0, ({
- bch2_invalidate_stripe_to_dev(trans, k);
+ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags);
})));
+ bch_err_fn(c, ret);
+ return ret;
}
/* startup/shutdown */
@@ -2351,10 +2256,10 @@ void bch2_fs_ec_stop(struct bch_fs *c)
static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
- bool ret;
+ sched_annotate_sleep();
mutex_lock(&c->ec_stripe_new_lock);
- ret = list_empty(&c->ec_stripe_new_list);
+ bool ret = list_empty(&c->ec_stripe_new_list);
mutex_unlock(&c->ec_stripe_new_lock);
return ret;
@@ -2367,46 +2272,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_prefetch, k, ({
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
-
- stripe_to_mem(m, bkey_s_c_to_stripe(k).v);
-
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- 0;
- })));
- bch_err_fn(c, ret);
- return ret;
-}
-
-void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
-{
- ec_stripes_heap *h = &c->ec_stripes_heap;
- struct stripe *m;
- size_t i;
-
- mutex_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min_t(size_t, h->nr, 50); i++) {
- m = genradix_ptr(&c->stripes, h->data[i].idx);
-
- prt_printf(out, "%zu %u/%u+%u", h->data[i].idx,
- h->data[i].blocks_nonempty,
- m->nr_blocks - m->nr_redundant,
- m->nr_redundant);
- if (bch2_stripe_is_open(c, h->data[i].idx))
- prt_str(out, " open");
- prt_newline(out);
- }
- mutex_unlock(&c->ec_stripes_heap_lock);
+ return 0;
}
static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -2477,15 +2343,12 @@ void bch2_fs_ec_exit(struct bch_fs *c)
BUG_ON(!list_empty(&c->ec_stripe_new_list));
- free_heap(&c->ec_stripes_heap);
- genradix_free(&c->stripes);
bioset_exit(&c->ec_bioset);
}
void bch2_fs_ec_init_early(struct bch_fs *c)
{
spin_lock_init(&c->ec_stripes_new_lock);
- mutex_init(&c->ec_stripes_heap_lock);
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);
@@ -2503,3 +2366,40 @@ int bch2_fs_ec_init(struct bch_fs *c)
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}
+
+static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct bkey_buf *last_flushed)
+{
+ if (k.k->type != KEY_TYPE_stripe)
+ return 0;
+
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+ u64 lru_idx = stripe_lru_pos(s.v);
+ if (lru_idx) {
+ int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION,
+ k.k->p.offset, lru_idx, k, last_flushed);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+int bch2_check_stripe_to_lru_refs(struct bch_fs *c)
+{
+ struct bkey_buf last_flushed;
+
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_stripes,
+ POS_MIN, BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)));
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch_err_fn(c, ret);
+ return ret;
+}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 583ca6a226da..548048adf0d5 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s,
memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
}
+#define STRIPE_LRU_POS_EMPTY 1
+
+static inline u64 stripe_lru_pos(const struct bch_stripe *s)
+{
+ if (!s)
+ return 0;
+
+ unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0;
+
+ for (unsigned i = 0; i < nr_data; i++)
+ blocks_empty += !stripe_blockcount_get(s, i);
+
+ /* Will be picked up by the stripe_delete worker */
+ if (blocks_empty == nr_data)
+ return STRIPE_LRU_POS_EMPTY;
+
+ if (!blocks_empty)
+ return 0;
+
+ /* invert: more blocks empty = reuse first */
+ return LRU_TIME_MAX - blocks_empty;
+}
+
static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
const struct bch_extent_ptr *data_ptr,
unsigned sectors)
@@ -132,6 +155,21 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
m->sectors);
}
+static inline void gc_stripe_unlock(struct gc_stripe *s)
+{
+ BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+ clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock);
+ smp_mb__after_atomic();
+ wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR);
+}
+
+static inline void gc_stripe_lock(struct gc_stripe *s)
+{
+ wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR,
+ TASK_UNINTERRUPTIBLE);
+}
+
struct bch_read_bio;
struct ec_stripe_buf {
@@ -212,18 +250,15 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);
int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
- unsigned, unsigned, unsigned,
- enum bch_watermark, struct closure *);
-void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
-void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
+struct alloc_request;
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+ struct alloc_request *, unsigned, struct closure *);
void bch2_do_stripe_deletes(struct bch_fs *);
void bch2_ec_do_stripe_creates(struct bch_fs *);
@@ -253,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
}
-int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
+int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
+ struct bkey_s_c, unsigned, unsigned);
+int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_fs_ec_stop(struct bch_fs *);
@@ -261,11 +298,12 @@ void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);
-void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
void bch2_fs_ec_exit(struct bch_fs *);
void bch2_fs_ec_init_early(struct bch_fs *);
int bch2_fs_ec_init(struct bch_fs *);
+int bch2_check_stripe_to_lru_refs(struct bch_fs *);
+
#endif /* _BCACHEFS_EC_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index 8d1e70e830ac..809446c78951 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -4,9 +4,10 @@
#include "bcachefs_format.h"
-struct bch_replicas_padded {
+union bch_replicas_padded {
+ u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
+ devs, BCH_BKEY_PTRS_MAX)];
struct bch_replicas_entry_v1 e;
- u8 pad[BCH_BKEY_PTRS_MAX];
};
struct stripe {
@@ -20,23 +21,15 @@ struct stripe {
};
struct gc_stripe {
+ u8 lock;
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 sectors;
-
u8 nr_blocks;
u8 nr_redundant;
-
- unsigned alive:1; /* does a corresponding key exist in stripes btree? */
u16 block_sectors[BCH_BKEY_PTRS_MAX];
struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
- struct bch_replicas_padded r;
+ union bch_replicas_padded r;
};
-struct ec_stripe_heap_entry {
- size_t idx;
- unsigned blocks_nonempty;
-};
-
-typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap;
-
#endif /* _BCACHEFS_EC_TYPES_H */
diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
new file mode 100644
index 000000000000..56ab430f209f
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "enumerated_ref.h"
+#include "util.h"
+
+#include <linux/completion.h>
+
+#ifdef ENUMERATED_REF_DEBUG
+void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ atomic_long_inc(&ref->refs[idx]);
+}
+
+bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ return atomic_long_inc_not_zero(&ref->refs[idx]);
+}
+
+bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ return !ref->dying &&
+ atomic_long_inc_not_zero(&ref->refs[idx]);
+}
+
+void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
+{
+ BUG_ON(idx >= ref->nr);
+ long v = atomic_long_dec_return(&ref->refs[idx]);
+
+ BUG_ON(v < 0);
+ if (v)
+ return;
+
+ for (unsigned i = 0; i < ref->nr; i++)
+ if (atomic_long_read(&ref->refs[i]))
+ return;
+
+ if (ref->stop_fn)
+ ref->stop_fn(ref);
+ complete(&ref->stop_complete);
+}
+#endif
+
+#ifndef ENUMERATED_REF_DEBUG
+static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
+{
+ struct enumerated_ref *ref =
+ container_of(percpu_ref, struct enumerated_ref, ref);
+
+ if (ref->stop_fn)
+ ref->stop_fn(ref);
+ complete(&ref->stop_complete);
+}
+#endif
+
+void enumerated_ref_stop_async(struct enumerated_ref *ref)
+{
+ reinit_completion(&ref->stop_complete);
+
+#ifndef ENUMERATED_REF_DEBUG
+ percpu_ref_kill(&ref->ref);
+#else
+ ref->dying = true;
+ for (unsigned i = 0; i < ref->nr; i++)
+ enumerated_ref_put(ref, i);
+#endif
+}
+
+void enumerated_ref_stop(struct enumerated_ref *ref,
+ const char * const names[])
+{
+ enumerated_ref_stop_async(ref);
+ while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
+ prt_str(&buf, "Outstanding refs:\n");
+ enumerated_ref_to_text(&buf, ref, names);
+ printk(KERN_ERR "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
+void enumerated_ref_start(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+ percpu_ref_reinit(&ref->ref);
+#else
+ ref->dying = false;
+ for (unsigned i = 0; i < ref->nr; i++) {
+ BUG_ON(atomic_long_read(&ref->refs[i]));
+ atomic_long_inc(&ref->refs[i]);
+ }
+#endif
+}
+
+void enumerated_ref_exit(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+ percpu_ref_exit(&ref->ref);
+#else
+ kfree(ref->refs);
+ ref->refs = NULL;
+ ref->nr = 0;
+#endif
+}
+
+int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
+ void (*stop_fn)(struct enumerated_ref *))
+{
+ init_completion(&ref->stop_complete);
+ ref->stop_fn = stop_fn;
+
+#ifndef ENUMERATED_REF_DEBUG
+ return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+#else
+ ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
+ if (!ref->refs)
+ return -ENOMEM;
+
+ ref->nr = nr;
+ return 0;
+#endif
+}
+
+void enumerated_ref_to_text(struct printbuf *out,
+ struct enumerated_ref *ref,
+ const char * const names[])
+{
+#ifdef ENUMERATED_REF_DEBUG
+ bch2_printbuf_tabstop_push(out, 32);
+
+ for (unsigned i = 0; i < ref->nr; i++)
+ prt_printf(out, "%s\t%li\n", names[i],
+ atomic_long_read(&ref->refs[i]));
+#else
+ prt_str(out, "(not in debug mode)\n");
+#endif
+}
diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
new file mode 100644
index 000000000000..ec01cf59ef80
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ENUMERATED_REF_H
+#define _BCACHEFS_ENUMERATED_REF_H
+
+#include "enumerated_ref_types.h"
+
+/*
+ * A refcount where the users are enumerated: in debug mode, we create sepate
+ * refcounts for each user, to make leaks and refcount errors easy to track
+ * down:
+ */
+
+#ifdef ENUMERATED_REF_DEBUG
+void enumerated_ref_get(struct enumerated_ref *, unsigned);
+bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
+bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
+void enumerated_ref_put(struct enumerated_ref *, unsigned);
+#else
+
+static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
+{
+ percpu_ref_get(&ref->ref);
+}
+
+static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ return percpu_ref_tryget(&ref->ref);
+}
+
+static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
+{
+ return percpu_ref_tryget_live(&ref->ref);
+}
+
+static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
+{
+ percpu_ref_put(&ref->ref);
+}
+#endif
+
+static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
+{
+#ifndef ENUMERATED_REF_DEBUG
+ return percpu_ref_is_zero(&ref->ref);
+#else
+ for (unsigned i = 0; i < ref->nr; i++)
+ if (atomic_long_read(&ref->refs[i]))
+ return false;
+ return true;
+#endif
+}
+
+void enumerated_ref_stop_async(struct enumerated_ref *);
+void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
+void enumerated_ref_start(struct enumerated_ref *);
+
+void enumerated_ref_exit(struct enumerated_ref *);
+int enumerated_ref_init(struct enumerated_ref *, unsigned,
+ void (*stop_fn)(struct enumerated_ref *));
+
+struct printbuf;
+void enumerated_ref_to_text(struct printbuf *,
+ struct enumerated_ref *,
+ const char * const[]);
+
+#endif /* _BCACHEFS_ENUMERATED_REF_H */
diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
new file mode 100644
index 000000000000..0e6076f466d3
--- /dev/null
+++ b/fs/bcachefs/enumerated_ref_types.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
+#define _BCACHEFS_ENUMERATED_REF_TYPES_H
+
+#include <linux/percpu-refcount.h>
+
+struct enumerated_ref {
+#ifdef ENUMERATED_REF_DEBUG
+ unsigned nr;
+ bool dying;
+ atomic_long_t *refs;
+#else
+ struct percpu_ref ref;
+#endif
+ void (*stop_fn)(struct enumerated_ref *);
+ struct completion stop_complete;
+};
+
+#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index 43557bebd0f8..c39cf304c681 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = {
NULL
};
-static unsigned bch2_errcode_parents[] = {
+static const unsigned bch2_errcode_parents[] = {
#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
BCH_ERRCODES()
#undef x
};
+__attribute__((const))
const char *bch2_err_str(int err)
{
const char *errstr;
@@ -36,6 +37,7 @@ const char *bch2_err_str(int err)
return errstr ?: "(Invalid error)";
}
+__attribute__((const))
bool __bch2_err_matches(int err, int class)
{
err = abs(err);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 4590cd0c7c90..acc3b7b67704 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,8 @@
#define BCH_ERRCODES() \
x(ERANGE, ERANGE_option_too_small) \
x(ERANGE, ERANGE_option_too_big) \
+ x(EINVAL, injected) \
+ x(BCH_ERR_injected, injected_fs_start) \
x(EINVAL, mount_option) \
x(BCH_ERR_mount_option, option_name) \
x(BCH_ERR_mount_option, option_value) \
@@ -51,6 +53,7 @@
x(ENOMEM, ENOMEM_dio_write_bioset_init) \
x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
x(ENOMEM, ENOMEM_promote_table_init) \
+ x(ENOMEM, ENOMEM_async_obj_init) \
x(ENOMEM, ENOMEM_compression_bounce_read_init) \
x(ENOMEM, ENOMEM_compression_bounce_write_init) \
x(ENOMEM, ENOMEM_compression_workspace_init) \
@@ -116,9 +119,11 @@
x(ENOENT, ENOENT_snapshot_tree) \
x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
x(ENOENT, ENOENT_dev_not_found) \
+ x(ENOENT, ENOENT_dev_bucket_not_found) \
x(ENOENT, ENOENT_dev_idx_not_found) \
x(ENOENT, ENOENT_inode_no_backpointer) \
x(ENOENT, ENOENT_no_snapshot_tree_subvol) \
+ x(ENOENT, btree_node_dying) \
x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \
x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \
x(EEXIST, EEXIST_str_hash_set) \
@@ -132,7 +137,6 @@
x(BCH_ERR_transaction_restart, transaction_restart_relock) \
x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \
- x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \
x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \
x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \
x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \
@@ -143,11 +147,8 @@
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \
- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
- x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
@@ -170,16 +171,25 @@
x(0, backpointer_to_overwritten_btree_node) \
x(0, journal_reclaim_would_deadlock) \
x(EINVAL, fsck) \
+ x(BCH_ERR_fsck, fsck_ask) \
x(BCH_ERR_fsck, fsck_fix) \
x(BCH_ERR_fsck, fsck_delete_bkey) \
x(BCH_ERR_fsck, fsck_ignore) \
x(BCH_ERR_fsck, fsck_errors_not_fixed) \
x(BCH_ERR_fsck, fsck_repair_unimplemented) \
x(BCH_ERR_fsck, fsck_repair_impossible) \
- x(EINVAL, restart_recovery) \
- x(EINVAL, not_in_recovery) \
- x(EINVAL, cannot_rewind_recovery) \
+ x(EINVAL, recovery_will_run) \
+ x(BCH_ERR_recovery_will_run, restart_recovery) \
+ x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \
+ x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \
x(0, data_update_done) \
+ x(0, bkey_was_deleted) \
+ x(BCH_ERR_data_update_done, data_update_done_would_block) \
+ x(BCH_ERR_data_update_done, data_update_done_unwritten) \
+ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \
+ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \
+ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \
+ x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \
x(EINVAL, device_state_not_allowed) \
x(EINVAL, member_info_missing) \
x(EINVAL, mismatched_block_size) \
@@ -191,6 +201,7 @@
x(EINVAL, device_has_been_removed) \
x(EINVAL, device_splitbrain) \
x(EINVAL, device_already_online) \
+ x(EINVAL, filesystem_uuid_already_open) \
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
@@ -199,7 +210,12 @@
x(EINVAL, remove_would_lose_data) \
x(EINVAL, no_resize_with_buckets_nouse) \
x(EINVAL, inode_unpack_error) \
+ x(EINVAL, inode_not_unlinked) \
+ x(EINVAL, inode_has_child_snapshot) \
x(EINVAL, varint_decode_error) \
+ x(EINVAL, erasure_coding_found_btree_node) \
+ x(EINVAL, option_negative) \
+ x(EOPNOTSUPP, may_not_use_incompat_feature) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -207,13 +223,22 @@
x(EROFS, erofs_unfixed_errors) \
x(EROFS, erofs_norecovery) \
x(EROFS, erofs_nochanges) \
+ x(EROFS, erofs_no_alloc_info) \
+ x(EROFS, erofs_filesystem_full) \
x(EROFS, insufficient_devices) \
x(0, operation_blocked) \
x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \
- x(BCH_ERR_operation_blocked, journal_res_get_blocked) \
- x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \
- x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \
- x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \
+ x(BCH_ERR_operation_blocked, journal_res_blocked) \
+ x(BCH_ERR_journal_res_blocked, journal_blocked) \
+ x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \
+ x(BCH_ERR_journal_res_blocked, journal_max_open) \
+ x(BCH_ERR_journal_res_blocked, journal_full) \
+ x(BCH_ERR_journal_res_blocked, journal_pin_full) \
+ x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \
+ x(BCH_ERR_journal_res_blocked, journal_stuck) \
+ x(BCH_ERR_journal_res_blocked, journal_retry_open) \
+ x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \
+ x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \
x(BCH_ERR_invalid, invalid_sb) \
x(BCH_ERR_invalid_sb, invalid_sb_magic) \
x(BCH_ERR_invalid_sb, invalid_sb_version) \
@@ -223,6 +248,7 @@
x(BCH_ERR_invalid_sb, invalid_sb_csum) \
x(BCH_ERR_invalid_sb, invalid_sb_block_size) \
x(BCH_ERR_invalid_sb, invalid_sb_uuid) \
+ x(BCH_ERR_invalid_sb, invalid_sb_offset) \
x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \
x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \
x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \
@@ -248,27 +274,63 @@
x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
- x(EIO, journal_shutdown) \
+ x(EROFS, journal_shutdown) \
x(EIO, journal_flush_err) \
+ x(EIO, journal_write_err) \
x(EIO, btree_node_read_err) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \
x(EIO, sb_not_downgraded) \
x(EIO, btree_node_write_all_failed) \
x(EIO, btree_node_read_error) \
- x(EIO, btree_node_read_validate_error) \
x(EIO, btree_need_topology_repair) \
x(EIO, bucket_ref_update) \
+ x(EIO, trigger_alloc) \
x(EIO, trigger_pointer) \
x(EIO, trigger_stripe_pointer) \
x(EIO, metadata_bucket_inconsistency) \
x(EIO, mark_stripe) \
x(EIO, stripe_reconstruct) \
x(EIO, key_type_error) \
- x(EIO, no_device_to_read_from) \
+ x(EIO, extent_poisoned) \
x(EIO, missing_indirect_extent) \
x(EIO, invalidate_stripe_to_dev) \
x(EIO, no_encryption_key) \
x(EIO, insufficient_journal_devices) \
+ x(EIO, device_offline) \
+ x(EIO, EIO_fault_injected) \
+ x(EIO, ec_block_read) \
+ x(EIO, ec_block_write) \
+ x(EIO, recompute_checksum) \
+ x(EIO, decompress) \
+ x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \
+ x(BCH_ERR_decompress, decompress_lz4) \
+ x(BCH_ERR_decompress, decompress_gzip) \
+ x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \
+ x(BCH_ERR_decompress, decompress_zstd) \
+ x(EIO, data_write) \
+ x(BCH_ERR_data_write, data_write_io) \
+ x(BCH_ERR_data_write, data_write_csum) \
+ x(BCH_ERR_data_write, data_write_invalid_ptr) \
+ x(BCH_ERR_data_write, data_write_misaligned) \
+ x(BCH_ERR_decompress, data_read) \
+ x(BCH_ERR_data_read, no_device_to_read_from) \
+ x(BCH_ERR_data_read, no_devices_valid) \
+ x(BCH_ERR_data_read, data_read_io_err) \
+ x(BCH_ERR_data_read, data_read_csum_err) \
+ x(BCH_ERR_data_read, data_read_retry) \
+ x(BCH_ERR_data_read_retry, data_read_retry_avoid) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \
+ x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \
+ x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\
+ x(BCH_ERR_data_read, data_read_decompress_err) \
+ x(BCH_ERR_data_read, data_read_decrypt_err) \
+ x(BCH_ERR_data_read, data_read_ptr_stale_race) \
+ x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \
+ x(BCH_ERR_data_read, data_read_no_encryption_key) \
+ x(BCH_ERR_data_read, data_read_buffer_too_small) \
+ x(BCH_ERR_data_read, data_read_key_overwritten) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -294,9 +356,11 @@ enum bch_errcode {
BCH_ERR_MAX
};
-const char *bch2_err_str(int);
-bool __bch2_err_matches(int, int);
+__attribute__((const)) const char *bch2_err_str(int);
+__attribute__((const)) bool __bch2_err_matches(int, int);
+
+__attribute__((const))
static inline bool _bch2_err_matches(int err, int class)
{
return err < 0 && __bch2_err_matches(err, class);
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 038da6a61f6b..267e73d9d7e6 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -3,15 +3,24 @@
#include "btree_cache.h"
#include "btree_iter.h"
#include "error.h"
-#include "fs-common.h"
#include "journal.h"
+#include "namei.h"
#include "recovery_passes.h"
#include "super.h"
#include "thread_with_file.h"
#define FSCK_ERR_RATELIMIT_NR 10
-bool bch2_inconsistent_error(struct bch_fs *c)
+void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out)
+{
+ printbuf_indent_add_nextline(out, 2);
+
+#ifdef BCACHEFS_LOG_PREFIX
+ prt_printf(out, "bcachefs (%s): ", fs_or_dev_name);
+#endif
+}
+
+bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out)
{
set_bit(BCH_FS_error, &c->flags);
@@ -20,11 +29,10 @@ bool bch2_inconsistent_error(struct bch_fs *c)
return false;
case BCH_ON_ERROR_fix_safe:
case BCH_ON_ERROR_ro:
- if (bch2_fs_emergency_read_only(c))
- bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
- journal_cur_seq(&c->journal));
+ bch2_fs_emergency_read_only2(c, out);
return true;
case BCH_ON_ERROR_panic:
+ bch2_print_str(c, KERN_ERR, out->buf);
panic(bch2_fmt(c, "panic after error"));
return true;
default:
@@ -32,18 +40,91 @@ bool bch2_inconsistent_error(struct bch_fs *c)
}
}
-int bch2_topology_error(struct bch_fs *c)
+bool bch2_inconsistent_error(struct bch_fs *c)
+{
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ printbuf_indent_add_nextline(&buf, 2);
+
+ bool ret = __bch2_inconsistent_error(c, &buf);
+ if (ret)
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+__printf(3, 0)
+static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans,
+ const char *fmt, va_list args)
+{
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+
+ bch2_log_msg_start(c, &buf);
+
+ prt_vprintf(&buf, fmt, args);
+ prt_newline(&buf);
+
+ if (trans)
+ bch2_trans_updates_to_text(&buf, trans);
+ bool ret = __bch2_inconsistent_error(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
+bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...)
{
+ va_list args;
+ va_start(args, fmt);
+ bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args);
+ va_end(args);
+ return ret;
+}
+
+int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
+{
+ prt_printf(out, "btree topology error: ");
+
set_bit(BCH_FS_topology_error, &c->flags);
- if (!test_bit(BCH_FS_recovery_running, &c->flags)) {
- bch2_inconsistent_error(c);
- return -BCH_ERR_btree_need_topology_repair;
+ if (!test_bit(BCH_FS_in_recovery, &c->flags)) {
+ __bch2_inconsistent_error(c, out);
+ return bch_err_throw(c, btree_need_topology_repair);
} else {
- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
- -BCH_ERR_btree_node_read_validate_error;
+ return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?:
+ bch_err_throw(c, btree_need_topology_repair);
}
}
+int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...)
+{
+ struct printbuf buf = PRINTBUF;
+
+ bch2_log_msg_start(c, &buf);
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ int ret = __bch2_topology_error(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+
+ printbuf_exit(&buf);
+ return ret;
+}
+
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
@@ -54,25 +135,44 @@ void bch2_io_error_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
- bool dev;
+
+ /* XXX: if it's reads or checksums that are failing, set it to failed */
down_write(&c->state_lock);
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
- if (dev
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED)
- : bch2_fs_emergency_read_only(c))
- bch_err(ca,
- "too many IO errors, setting %s RO",
+ unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+ if (write_errors_start &&
+ time_after(jiffies,
+ write_errors_start + c->opts.write_error_timeout * HZ)) {
+ if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+ goto out;
+
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED);
+ struct printbuf buf = PRINTBUF;
+ __bch2_log_msg_start(ca->name, &buf);
+
+ prt_printf(&buf, "writes erroring for %u seconds, setting %s ro",
+ c->opts.write_error_timeout,
dev ? "device" : "filesystem");
+ if (!dev)
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+out:
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
atomic64_inc(&ca->errors[type]);
- //queue_work(system_long_wq, &ca->io_error_work);
+
+ if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+ ca->write_errors_start = jiffies;
+
+ queue_work(system_long_wq, &ca->io_error_work);
}
enum ask_yn {
@@ -168,15 +268,13 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans)
#endif
-static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
+static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
+ enum bch_sb_error_id id)
{
struct fsck_err_state *s;
- if (!test_bit(BCH_FS_fsck_running, &c->flags))
- return NULL;
-
list_for_each_entry(s, &c->fsck_error_msgs, list)
- if (s->fmt == fmt) {
+ if (s->id == id) {
/*
* move it to the head of the list: repeated fsck errors
* are common
@@ -194,7 +292,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
}
INIT_LIST_HEAD(&s->list);
- s->fmt = fmt;
+ s->id = id;
list_add(&s->list, &c->fsck_error_msgs);
return s;
}
@@ -231,7 +329,7 @@ static int do_fsck_ask_yn(struct bch_fs *c,
if (bch2_fs_stdio_redirect(c))
bch2_print(c, "%s", question->buf);
else
- bch2_print_string_as_lines(KERN_ERR, question->buf);
+ bch2_print_str(c, KERN_ERR, question->buf);
int ask = bch2_fsck_ask_yn(c, trans);
@@ -244,17 +342,109 @@ static int do_fsck_ask_yn(struct bch_fs *c,
return ask;
}
+static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c,
+ enum bch_sb_error_id id, const char *msg,
+ bool *repeat, bool *print, bool *suppress)
+{
+ bch2_sb_error_count(c, id);
+
+ struct fsck_err_state *s = fsck_err_get(c, id);
+ if (s) {
+ /*
+ * We may be called multiple times for the same error on
+ * transaction restart - this memoizes instead of asking the user
+ * multiple times for the same error:
+ */
+ if (s->last_msg && !strcmp(msg, s->last_msg)) {
+ *repeat = true;
+ *print = false;
+ return s;
+ }
+
+ kfree(s->last_msg);
+ s->last_msg = kstrdup(msg, GFP_KERNEL);
+
+ if (c->opts.ratelimit_errors &&
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
+ *suppress = true;
+ else
+ *print = false;
+ }
+
+ s->nr++;
+ }
+ return s;
+}
+
+bool __bch2_count_fsck_err(struct bch_fs *c,
+ enum bch_sb_error_id id, struct printbuf *msg)
+{
+ bch2_sb_error_count(c, id);
+
+ mutex_lock(&c->fsck_error_msgs_lock);
+ bool print = true, repeat = false, suppress = false;
+
+ count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress);
+ mutex_unlock(&c->fsck_error_msgs_lock);
+
+ if (suppress)
+ prt_printf(msg, "Ratelimiting new instances of previous error\n");
+
+ return print && !repeat;
+}
+
+int bch2_fsck_err_opt(struct bch_fs *c,
+ enum bch_fsck_flags flags,
+ enum bch_sb_error_id err)
+{
+ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
+ flags |= fsck_flags_extra[err];
+
+ if (test_bit(BCH_FS_in_fsck, &c->flags)) {
+ if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
+ return bch_err_throw(c, fsck_repair_unimplemented);
+
+ switch (c->opts.fix_errors) {
+ case FSCK_FIX_exit:
+ return bch_err_throw(c, fsck_errors_not_fixed);
+ case FSCK_FIX_yes:
+ if (flags & FSCK_CAN_FIX)
+ return bch_err_throw(c, fsck_fix);
+ fallthrough;
+ case FSCK_FIX_no:
+ if (flags & FSCK_CAN_IGNORE)
+ return bch_err_throw(c, fsck_ignore);
+ return bch_err_throw(c, fsck_errors_not_fixed);
+ case FSCK_FIX_ask:
+ if (flags & FSCK_AUTOFIX)
+ return bch_err_throw(c, fsck_fix);
+ return bch_err_throw(c, fsck_ask);
+ default:
+ BUG();
+ }
+ } else {
+ if ((flags & FSCK_AUTOFIX) &&
+ (c->opts.errors == BCH_ON_ERROR_continue ||
+ c->opts.errors == BCH_ON_ERROR_fix_safe))
+ return bch_err_throw(c, fsck_fix);
+
+ if (c->opts.errors == BCH_ON_ERROR_continue &&
+ (flags & FSCK_CAN_IGNORE))
+ return bch_err_throw(c, fsck_ignore);
+ return bch_err_throw(c, fsck_errors_not_fixed);
+ }
+}
+
int __bch2_fsck_err(struct bch_fs *c,
struct btree_trans *trans,
enum bch_fsck_flags flags,
enum bch_sb_error_id err,
const char *fmt, ...)
{
- struct fsck_err_state *s = NULL;
va_list args;
- bool print = true, suppressing = false, inconsistent = false, exiting = false;
struct printbuf buf = PRINTBUF, *out = &buf;
- int ret = -BCH_ERR_fsck_ignore;
+ int ret = 0;
const char *action_orig = "fix?", *action = action_orig;
might_sleep();
@@ -284,10 +474,15 @@ int __bch2_fsck_err(struct bch_fs *c,
if (test_bit(err, c->sb.errors_silent))
return flags & FSCK_CAN_FIX
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
+ ? bch_err_throw(c, fsck_fix)
+ : bch_err_throw(c, fsck_ignore);
+
+ printbuf_indent_add_nextline(out, 2);
- bch2_sb_error_count(c, err);
+#ifdef BCACHEFS_LOG_PREFIX
+ if (strncmp(fmt, "bcachefs", 8))
+ prt_printf(out, bch2_log_msg(c, ""));
+#endif
va_start(args, fmt);
prt_vprintf(out, fmt, args);
@@ -307,72 +502,48 @@ int __bch2_fsck_err(struct bch_fs *c,
}
mutex_lock(&c->fsck_error_msgs_lock);
- s = fsck_err_get(c, fmt);
- if (s) {
- /*
- * We may be called multiple times for the same error on
- * transaction restart - this memoizes instead of asking the user
- * multiple times for the same error:
- */
- if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
- ret = s->ret;
- goto err_unlock;
- }
-
- kfree(s->last_msg);
- s->last_msg = kstrdup(buf.buf, GFP_KERNEL);
- if (!s->last_msg) {
- ret = -ENOMEM;
- goto err_unlock;
- }
-
- if (c->opts.ratelimit_errors &&
- !(flags & FSCK_NO_RATELIMIT) &&
- s->nr >= FSCK_ERR_RATELIMIT_NR) {
- if (s->nr == FSCK_ERR_RATELIMIT_NR)
- suppressing = true;
- else
- print = false;
- }
-
- s->nr++;
+ bool repeat = false, print = true, suppress = false;
+ bool inconsistent = false, exiting = false;
+ struct fsck_err_state *s =
+ count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress);
+ if (repeat) {
+ ret = s->ret;
+ goto err_unlock;
}
-#ifdef BCACHEFS_LOG_PREFIX
- if (!strncmp(fmt, "bcachefs:", 9))
- prt_printf(out, bch2_log_msg(c, ""));
-#endif
-
if ((flags & FSCK_AUTOFIX) &&
(c->opts.errors == BCH_ON_ERROR_continue ||
c->opts.errors == BCH_ON_ERROR_fix_safe)) {
prt_str(out, ", ");
if (flags & FSCK_CAN_FIX) {
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
+ ret = bch_err_throw(c, fsck_ignore);
}
goto print;
- } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
+ } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
- prt_str(out, ", shutting down");
+ prt_str_indented(out, ", shutting down\n"
+ "error not marked as autofix and not in fsck\n"
+ "run fsck, and forward to devs so error can be marked for self-healing");
inconsistent = true;
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ print = true;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
} else if (flags & FSCK_CAN_FIX) {
prt_str(out, ", ");
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", continuing");
- ret = -BCH_ERR_fsck_ignore;
+ ret = bch_err_throw(c, fsck_ignore);
}
} else if (c->opts.fix_errors == FSCK_FIX_exit) {
prt_str(out, ", exiting");
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
} else if (flags & FSCK_CAN_FIX) {
int fix = s && s->fix
? s->fix
@@ -391,71 +562,91 @@ int __bch2_fsck_err(struct bch_fs *c,
: FSCK_FIX_yes;
ret = ret & 1
- ? -BCH_ERR_fsck_fix
- : -BCH_ERR_fsck_ignore;
+ ? bch_err_throw(c, fsck_fix)
+ : bch_err_throw(c, fsck_ignore);
} else if (fix == FSCK_FIX_yes ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
prt_str(out, ", ");
prt_actioning(out, action);
- ret = -BCH_ERR_fsck_fix;
+ ret = bch_err_throw(c, fsck_fix);
} else {
prt_str(out, ", not ");
prt_actioning(out, action);
+ ret = bch_err_throw(c, fsck_ignore);
+ }
+ } else {
+ if (flags & FSCK_CAN_IGNORE) {
+ prt_str(out, ", continuing");
+ ret = bch_err_throw(c, fsck_ignore);
+ } else {
+ prt_str(out, " (repair unimplemented)");
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
}
- } else if (!(flags & FSCK_CAN_IGNORE)) {
- prt_str(out, " (repair unimplemented)");
}
- if (ret == -BCH_ERR_fsck_ignore &&
+ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) &&
(c->opts.fix_errors == FSCK_FIX_exit ||
!(flags & FSCK_CAN_IGNORE)))
- ret = -BCH_ERR_fsck_errors_not_fixed;
+ ret = bch_err_throw(c, fsck_errors_not_fixed);
- if (test_bit(BCH_FS_fsck_running, &c->flags) &&
- (ret != -BCH_ERR_fsck_fix &&
- ret != -BCH_ERR_fsck_ignore)) {
+ if (test_bit(BCH_FS_in_fsck, &c->flags) &&
+ (!bch2_err_matches(ret, BCH_ERR_fsck_fix) &&
+ !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) {
exiting = true;
print = true;
}
print:
+ prt_newline(out);
+
+ if (inconsistent)
+ __bch2_inconsistent_error(c, out);
+ else if (exiting)
+ prt_printf(out, "Unable to continue, halting\n");
+ else if (suppress)
+ prt_printf(out, "Ratelimiting new instances of previous error\n");
+
if (print) {
+ /* possibly strip an empty line, from printbuf_indent_add */
+ while (out->pos && out->buf[out->pos - 1] == ' ')
+ --out->pos;
+ printbuf_nul_terminate(out);
+
if (bch2_fs_stdio_redirect(c))
- bch2_print(c, "%s\n", out->buf);
+ bch2_print(c, "%s", out->buf);
else
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ bch2_print_str(c, KERN_ERR, out->buf);
}
- if (exiting)
- bch_err(c, "Unable to continue, halting");
- else if (suppressing)
- bch_err(c, "Ratelimiting new instances of previous error");
-
if (s)
s->ret = ret;
- if (inconsistent)
- bch2_inconsistent_error(c);
-
+ if (trans &&
+ !(flags & FSCK_ERR_NO_LOG) &&
+ ret == -BCH_ERR_fsck_fix)
+ ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret;
+err_unlock:
+ mutex_unlock(&c->fsck_error_msgs_lock);
+err:
/*
* We don't yet track whether the filesystem currently has errors, for
* log_fsck_err()s: that would require us to track for every error type
* which recovery pass corrects it, to get the fsck exit status correct:
*/
- if (flags & FSCK_CAN_FIX) {
- if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_errors_fixed, &c->flags);
- } else {
- set_bit(BCH_FS_errors_not_fixed, &c->flags);
- set_bit(BCH_FS_error, &c->flags);
- }
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ /* nothing */
+ } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) {
+ set_bit(BCH_FS_errors_fixed, &c->flags);
+ } else {
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
}
-err_unlock:
- mutex_unlock(&c->fsck_error_msgs_lock);
-err:
+
if (action != action_orig)
kfree(action);
printbuf_exit(&buf);
+
+ BUG_ON(!ret);
return ret;
}
@@ -473,12 +664,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
const char *fmt, ...)
{
if (from.flags & BCH_VALIDATE_silent)
- return -BCH_ERR_fsck_delete_bkey;
+ return bch_err_throw(c, fsck_delete_bkey);
unsigned fsck_flags = 0;
if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) {
if (test_bit(err, c->sb.errors_silent))
- return -BCH_ERR_fsck_delete_bkey;
+ return bch_err_throw(c, fsck_delete_bkey);
fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
}
@@ -498,29 +689,27 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
prt_printf(&buf, " level=%u: ", from.level);
bch2_bkey_val_to_text(&buf, c, k);
- prt_str(&buf, "\n ");
+ prt_newline(&buf);
va_list args;
va_start(args, fmt);
prt_vprintf(&buf, fmt, args);
va_end(args);
- prt_str(&buf, ": delete?");
-
- int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
+ int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf);
printbuf_exit(&buf);
return ret;
}
-void bch2_flush_fsck_errs(struct bch_fs *c)
+static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_msgs_lock);
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
- if (s->ratelimited && s->last_msg)
- bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
+ if (print && s->ratelimited && s->last_msg)
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
list_del(&s->list);
kfree(s->last_msg);
@@ -530,35 +719,53 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
-int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum)
+void bch2_flush_fsck_errs(struct bch_fs *c)
+{
+ __bch2_flush_fsck_errs(c, true);
+}
+
+void bch2_free_fsck_errs(struct bch_fs *c)
+{
+ __bch2_flush_fsck_errs(c, false);
+}
+
+int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ subvol_inum inum, u64 offset)
{
u32 restart_count = trans->restart_count;
int ret = 0;
- /* XXX: we don't yet attempt to print paths when we don't know the subvol */
- if (inum.subvol)
- ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out));
+ if (inum.subvol) {
+ ret = bch2_inum_to_path(trans, inum, out);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ return ret;
+ }
if (!inum.subvol || ret)
prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum);
+ prt_printf(out, " offset %llu: ", offset);
return trans_was_restarted(trans, restart_count);
}
-int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
- subvol_inum inum, u64 offset)
+void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ subvol_inum inum, u64 offset)
{
- int ret = bch2_inum_err_msg_trans(trans, out, inum);
- prt_printf(out, " offset %llu: ", offset);
- return ret;
+ bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
}
-void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum)
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
+ struct bpos pos)
{
- bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum));
+ int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out);
+ if (ret)
+ return ret;
+
+ prt_printf(out, " offset %llu: ", pos.offset << 8);
+ return 0;
}
-void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out,
- subvol_inum inum, u64 offset)
+void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+ struct bpos pos)
{
- bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset));
+ bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
}
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index 7acf2a27ca28..0c3c3a24fc6f 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -18,6 +18,13 @@ struct work_struct;
/* Error messages: */
+void __bch2_log_msg_start(const char *, struct printbuf *);
+
+static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out)
+{
+ __bch2_log_msg_start(c->name, out);
+}
+
/*
* Inconsistency errors: The on disk data is inconsistent. If these occur during
* initial recovery, they don't indicate a bug in the running code - we walk all
@@ -29,21 +36,10 @@ struct work_struct;
* BCH_ON_ERROR_CONTINUE mode
*/
+bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *);
bool bch2_inconsistent_error(struct bch_fs *);
-
-int bch2_topology_error(struct bch_fs *);
-
-#define bch2_fs_topology_error(c, ...) \
-({ \
- bch_err(c, "btree topology error: " __VA_ARGS__); \
- bch2_topology_error(c); \
-})
-
-#define bch2_fs_inconsistent(c, ...) \
-({ \
- bch_err(c, __VA_ARGS__); \
- bch2_inconsistent_error(c); \
-})
+__printf(2, 3)
+bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...);
#define bch2_fs_inconsistent_on(cond, ...) \
({ \
@@ -53,26 +49,21 @@ int bch2_topology_error(struct bch_fs *);
_ret; \
})
-/*
- * When a transaction update discovers or is causing a fs inconsistency, it's
- * helpful to also dump the pending updates:
- */
-#define bch2_trans_inconsistent(trans, ...) \
-({ \
- bch_err(trans->c, __VA_ARGS__); \
- bch2_dump_trans_updates(trans); \
- bch2_inconsistent_error(trans->c); \
-})
+__printf(2, 3)
+bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...);
-#define bch2_trans_inconsistent_on(cond, trans, ...) \
+#define bch2_trans_inconsistent_on(cond, ...) \
({ \
bool _ret = unlikely(!!(cond)); \
- \
if (_ret) \
- bch2_trans_inconsistent(trans, __VA_ARGS__); \
+ bch2_trans_inconsistent(__VA_ARGS__); \
_ret; \
})
+int __bch2_topology_error(struct bch_fs *, struct printbuf *);
+__printf(2, 3)
+int bch2_fs_topology_error(struct bch_fs *, const char *, ...);
+
/*
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
* be able to repair:
@@ -80,7 +71,7 @@ int bch2_topology_error(struct bch_fs *);
struct fsck_err_state {
struct list_head list;
- const char *fmt;
+ enum bch_sb_error_id id;
u64 nr;
bool ratelimited;
int ret;
@@ -90,6 +81,14 @@ struct fsck_err_state {
#define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
+bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *);
+#define bch2_count_fsck_err(_c, _err, ...) \
+ __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
+
+int bch2_fsck_err_opt(struct bch_fs *,
+ enum bch_fsck_flags,
+ enum bch_sb_error_id);
+
__printf(5, 6) __cold
int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
enum bch_fsck_flags,
@@ -101,17 +100,18 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
void bch2_flush_fsck_errs(struct bch_fs *);
+void bch2_free_fsck_errs(struct bch_fs *);
#define fsck_err_wrap(_do) \
({ \
int _ret = _do; \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) { \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
+ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \
ret = _ret; \
goto fsck_err; \
} \
\
- _ret == -BCH_ERR_fsck_fix; \
+ bch2_err_matches(_ret, BCH_ERR_fsck_fix); \
})
#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__))
@@ -170,10 +170,10 @@ do { \
int _ret = __bch2_bkey_fsck_err(c, k, from, \
BCH_FSCK_ERR_##_err_type, \
_err_msg, ##__VA_ARGS__); \
- if (_ret != -BCH_ERR_fsck_fix && \
- _ret != -BCH_ERR_fsck_ignore) \
+ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \
+ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \
ret = _ret; \
- ret = -BCH_ERR_fsck_delete_bkey; \
+ ret = bch_err_throw(c, fsck_delete_bkey); \
goto fsck_err; \
} while (0)
@@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
-#define bch2_dev_io_err_on(cond, ca, _type, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) { \
- bch_err_dev_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca, _type); \
- } \
- _ret; \
-})
-
-#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) \
-({ \
- bool _ret = (cond); \
- \
- if (_ret) { \
- bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \
- bch2_io_error(ca, _type); \
- } \
- _ret; \
-})
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+static inline void bch2_account_io_success_fail(struct bch_dev *ca,
+ enum bch_member_error_type type,
+ bool success)
+{
+ if (likely(success)) {
+ if (type == BCH_MEMBER_ERROR_write &&
+ ca->write_errors_start)
+ ca->write_errors_start = 0;
+ } else {
+ bch2_io_error(ca, type);
+ }
+}
+
+static inline void bch2_account_io_completion(struct bch_dev *ca,
+ enum bch_member_error_type type,
+ u64 submit_time, bool success)
+{
+ if (unlikely(!ca))
+ return;
+
+ if (type != BCH_MEMBER_ERROR_checksum)
+ bch2_latency_acct(ca, submit_time, type);
+
+ bch2_account_io_success_fail(ca, type, success);
+}
-int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum);
int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64);
-void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum);
void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
+int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
+void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
+
#endif /* _BCACHEFS_ERROR_H */
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 6aac579a692a..e76e58a568bf 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
return lru + ret * 2;
}
+#define EXTENT_ITERS_MAX 64
+
static int count_iters_for_insert(struct btree_trans *trans,
struct bkey_s_c k,
unsigned offset,
struct bpos *end,
- unsigned *nr_iters,
- unsigned max_iters)
+ unsigned *nr_iters)
{
int ret = 0, ret2 = 0;
- if (*nr_iters >= max_iters) {
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
*end = bpos_min(*end, k.k->p);
ret = 1;
}
@@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
case KEY_TYPE_reflink_v:
*nr_iters += bch2_bkey_nr_alloc_ptrs(k);
- if (*nr_iters >= max_iters) {
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
*end = bpos_min(*end, k.k->p);
ret = 1;
}
@@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
*nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
- if (*nr_iters >= max_iters) {
+ if (*nr_iters >= EXTENT_ITERS_MAX) {
struct bpos pos = bkey_start_pos(k.k);
pos.offset += min_t(u64, k.k->size,
r_k.k->p.offset - idx);
@@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
-
int bch2_extent_atomic_end(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_i *insert,
struct bpos *end)
{
- struct btree_iter copy;
- struct bkey_s_c k;
unsigned nr_iters = 0;
- int ret;
- ret = bch2_btree_iter_traverse(iter);
- if (ret)
- return ret;
-
- *end = insert->k.p;
-
- /* extent_update_to_keys(): */
- nr_iters += 1;
-
- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
- &nr_iters, EXTENT_ITERS_MAX / 2);
- if (ret < 0)
- return ret;
+ struct btree_iter copy;
+ bch2_trans_copy_iter(trans, &copy, iter);
- bch2_trans_copy_iter(&copy, iter);
+ int ret = bch2_btree_iter_traverse(trans, &copy);
+ if (ret)
+ goto err;
- for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) {
+ struct bkey_s_c k;
+ for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) {
unsigned offset = 0;
- if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k)))
- offset = bkey_start_offset(&insert->k) -
- bkey_start_offset(k.k);
-
- /* extent_handle_overwrites(): */
- switch (bch2_extent_overlap(&insert->k, k.k)) {
- case BCH_EXTENT_OVERLAP_ALL:
- case BCH_EXTENT_OVERLAP_FRONT:
- nr_iters += 1;
- break;
- case BCH_EXTENT_OVERLAP_BACK:
- case BCH_EXTENT_OVERLAP_MIDDLE:
- nr_iters += 2;
- break;
- }
+ if (bkey_gt(iter->pos, bkey_start_pos(k.k)))
+ offset = iter->pos.offset - bkey_start_offset(k.k);
- ret = count_iters_for_insert(trans, k, offset, end,
- &nr_iters, EXTENT_ITERS_MAX);
+ ret = count_iters_for_insert(trans, k, offset, end, &nr_iters);
if (ret)
break;
}
-
+err:
bch2_trans_iter_exit(trans, &copy);
return ret < 0 ? ret : 0;
}
@@ -161,13 +134,22 @@ int bch2_extent_trim_atomic(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *k)
{
- struct bpos end;
- int ret;
-
- ret = bch2_extent_atomic_end(trans, iter, k, &end);
+ struct bpos end = k->k.p;
+ int ret = bch2_extent_atomic_end(trans, iter, &end);
if (ret)
return ret;
- bch2_cut_back(end, k);
+ /* tracepoint */
+
+ if (bpos_lt(end, k->k.p)) {
+ if (trace_extent_trim_atomic_enabled()) {
+ CLASS(printbuf, buf)();
+ bch2_bpos_to_text(&buf, end);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k));
+ trace_extent_trim_atomic(trans->c, buf.buf);
+ }
+ bch2_cut_back(end, k);
+ }
return 0;
}
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
index 6f5cf449361a..34467db53f45 100644
--- a/fs/bcachefs/extent_update.h
+++ b/fs/bcachefs/extent_update.h
@@ -5,7 +5,7 @@
#include "bcachefs.h"
int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
- struct bkey_i *, struct bpos *);
+ struct bpos *);
int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
struct bkey_i *);
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 05d5f71a7ca9..83cbd77dcb9c 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -28,6 +28,13 @@
#include "trace.h"
#include "util.h"
+static const char * const bch2_extent_flags_strs[] = {
+#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n,
+ BCH_EXTENT_FLAGS()
+#undef x
+ NULL,
+};
+
static unsigned bch2_crc_field_size_max[] = {
[BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
[BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -38,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
struct bch_extent_crc_unpacked,
enum bch_extent_entry_type);
+void bch2_io_failures_to_text(struct printbuf *out,
+ struct bch_fs *c,
+ struct bch_io_failures *failed)
+{
+ static const char * const error_types[] = {
+ "btree validate", "io", "checksum", "ec reconstruct", NULL
+ };
+
+ for (struct bch_dev_io_failures *f = failed->devs;
+ f < failed->devs + failed->nr;
+ f++) {
+ unsigned errflags =
+ ((!!f->failed_btree_validate) << 0) |
+ ((!!f->failed_io) << 1) |
+ ((!!f->failed_csum_nr) << 2) |
+ ((!!f->failed_ec) << 3);
+
+ bch2_printbuf_make_room(out, 1024);
+ out->atomic++;
+ scoped_guard(rcu) {
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
+ if (ca)
+ prt_str(out, ca->name);
+ else
+ prt_printf(out, "(invalid device %u)", f->dev);
+ }
+ --out->atomic;
+
+ prt_char(out, ' ');
+
+ if (!errflags) {
+ prt_str(out, "no error - confused");
+ } else if (is_power_of_2(errflags)) {
+ prt_bitflags(out, error_types, errflags);
+ prt_str(out, " error");
+ } else {
+ prt_str(out, "errors: ");
+ prt_bitflags(out, error_types, errflags);
+ }
+ prt_newline(out);
+ }
+}
+
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
unsigned dev)
{
@@ -51,7 +101,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
}
void bch2_mark_io_failure(struct bch_io_failures *failed,
- struct extent_ptr_decoded *p)
+ struct extent_ptr_decoded *p,
+ bool csum_error)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
@@ -59,53 +110,73 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
- f->dev = p->ptr.dev;
- f->idx = p->idx;
- f->nr_failed = 1;
- f->nr_retries = 0;
- } else if (p->idx != f->idx) {
- f->idx = p->idx;
- f->nr_failed = 1;
- f->nr_retries = 0;
- } else {
- f->nr_failed++;
+ memset(f, 0, sizeof(*f));
+ f->dev = p->ptr.dev;
}
+
+ if (p->do_ec_reconstruct)
+ f->failed_ec = true;
+ else if (!csum_error)
+ f->failed_io = true;
+ else
+ f->failed_csum_nr++;
}
-static inline u64 dev_latency(struct bch_fs *c, unsigned dev)
+void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
+ unsigned dev)
+{
+ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
+
+ if (!f) {
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
+
+ f = &failed->devs[failed->nr++];
+ memset(f, 0, sizeof(*f));
+ f->dev = dev;
+ }
+
+ f->failed_btree_validate = true;
+}
+
+static inline u64 dev_latency(struct bch_dev *ca)
{
- struct bch_dev *ca = bch2_dev_rcu(c, dev);
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
}
+static inline int dev_failed(struct bch_dev *ca)
+{
+ return !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+}
+
/*
* returns true if p1 is better than p2:
*/
static inline bool ptr_better(struct bch_fs *c,
const struct extent_ptr_decoded p1,
- const struct extent_ptr_decoded p2)
+ u64 p1_latency,
+ struct bch_dev *ca1,
+ const struct extent_ptr_decoded p2,
+ u64 p2_latency)
{
- if (likely(!p1.idx && !p2.idx)) {
- u64 l1 = dev_latency(c, p1.ptr.dev);
- u64 l2 = dev_latency(c, p2.ptr.dev);
+ struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
- /*
- * Square the latencies, to bias more in favor of the faster
- * device - we never want to stop issuing reads to the slower
- * device altogether, so that we can update our latency numbers:
- */
- l1 *= l1;
- l2 *= l2;
+ int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+ if (unlikely(failed_delta))
+ return failed_delta < 0;
- /* Pick at random, biased in favor of the faster device: */
+ if (static_branch_unlikely(&bch2_force_reconstruct_read))
+ return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
- return bch2_rand_range(l1 + l2) > l1;
- }
+ if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
+ return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
+
+ int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
+ if (unlikely(crc_retry_delta))
+ return crc_retry_delta < 0;
- if (bch2_force_reconstruct_read)
- return p1.idx > p2.idx;
+ /* Pick at random, biased in favor of the faster device: */
- return p1.idx < p2.idx;
+ return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
}
/*
@@ -115,64 +186,117 @@ static inline bool ptr_better(struct bch_fs *c,
*/
int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
struct bch_io_failures *failed,
- struct extent_ptr_decoded *pick)
+ struct extent_ptr_decoded *pick,
+ int dev)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_dev_io_failures *f;
- int ret = 0;
+ bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
+ bool have_dirty_ptrs = false, have_pick = false;
if (k.k->type == KEY_TYPE_error)
- return -BCH_ERR_key_type_error;
+ return bch_err_throw(c, key_type_error);
rcu_read_lock();
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ u64 pick_latency;
+
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ have_dirty_ptrs |= !p.ptr.cached;
+
/*
* Unwritten extent: no need to actually read, treat it as a
* hole and return 0s:
*/
if (p.ptr.unwritten) {
- ret = 0;
- break;
+ rcu_read_unlock();
+ return 0;
}
- /*
- * If there are any dirty pointers it's an error if we can't
- * read:
- */
- if (!ret && !p.ptr.cached)
- ret = -BCH_ERR_no_device_to_read_from;
+ /* Are we being asked to read from a specific device? */
+ if (dev >= 0 && p.ptr.dev != dev)
+ continue;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev);
- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
+ if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) {
+ rcu_read_unlock();
+ int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev);
+ if (ret)
+ return ret;
+ rcu_read_lock();
+ }
if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
continue;
- f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
- if (f)
- p.idx = f->nr_failed < f->nr_retries
- ? f->idx
- : f->idx + 1;
+ struct bch_dev_io_failures *f =
+ unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+ if (unlikely(f)) {
+ p.crc_retry_nr = f->failed_csum_nr;
+ p.has_ec &= ~f->failed_ec;
- if (!p.idx && (!ca || !bch2_dev_is_readable(ca)))
- p.idx++;
+ if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
+ have_io_errors |= f->failed_io;
+ have_io_errors |= f->failed_btree_validate;
+ have_io_errors |= f->failed_ec;
+ }
+ have_csum_errors |= !!f->failed_csum_nr;
+
+ if (p.has_ec && (f->failed_io || f->failed_csum_nr))
+ p.do_ec_reconstruct = true;
+ else if (f->failed_io ||
+ f->failed_btree_validate ||
+ f->failed_csum_nr > c->opts.checksum_err_retry_nr)
+ continue;
+ }
- if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
- p.idx++;
+ have_missing_devs |= ca && !bch2_dev_is_online(ca);
- if (p.idx > (unsigned) p.has_ec)
- continue;
+ if (!ca || !bch2_dev_is_online(ca)) {
+ if (!p.has_ec)
+ continue;
+ p.do_ec_reconstruct = true;
+ }
- if (ret > 0 && !ptr_better(c, p, *pick))
- continue;
+ if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec)
+ p.do_ec_reconstruct = true;
- *pick = p;
- ret = 1;
+ u64 p_latency = dev_latency(ca);
+ /*
+ * Square the latencies, to bias more in favor of the faster
+ * device - we never want to stop issuing reads to the slower
+ * device altogether, so that we can update our latency numbers:
+ */
+ p_latency *= p_latency;
+
+ if (!have_pick ||
+ ptr_better(c,
+ p, p_latency, ca,
+ *pick, pick_latency)) {
+ *pick = p;
+ pick_latency = p_latency;
+ have_pick = true;
+ }
}
rcu_read_unlock();
- return ret;
+ if (have_pick)
+ return 1;
+ if (!have_dirty_ptrs)
+ return 0;
+ if (have_missing_devs)
+ return bch_err_throw(c, no_device_to_read_from);
+ if (have_csum_errors)
+ return bch_err_throw(c, data_read_csum_err);
+ if (have_io_errors)
+ return bch_err_throw(c, data_read_io_err);
+
+ /*
+ * If we get here, we have pointers (bkey_ptrs_validate() ensures that),
+ * but they don't point to valid devices:
+ */
+ return bch_err_throw(c, no_devices_valid);
}
/* KEY_TYPE_btree_ptr: */
@@ -283,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
lp.crc = bch2_extent_crc_unpack(l.k, NULL);
rp.crc = bch2_extent_crc_unpack(r.k, NULL);
+ guard(rcu)();
+
while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) &&
__bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) {
if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size !=
@@ -294,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
return false;
/* Extents may not straddle buckets: */
- rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev);
bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr);
- rcu_read_unlock();
if (!same_bucket)
return false;
@@ -536,29 +660,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
struct bch_extent_crc_unpacked src,
enum bch_extent_entry_type type)
{
-#define set_common_fields(_dst, _src) \
- _dst.type = 1 << type; \
- _dst.csum_type = _src.csum_type, \
- _dst.compression_type = _src.compression_type, \
- _dst._compressed_size = _src.compressed_size - 1, \
- _dst._uncompressed_size = _src.uncompressed_size - 1, \
- _dst.offset = _src.offset
+#define common_fields(_src) \
+ .type = BIT(type), \
+ .csum_type = _src.csum_type, \
+ .compression_type = _src.compression_type, \
+ ._compressed_size = _src.compressed_size - 1, \
+ ._uncompressed_size = _src.uncompressed_size - 1, \
+ .offset = _src.offset
switch (type) {
case BCH_EXTENT_ENTRY_crc32:
- set_common_fields(dst->crc32, src);
- dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo);
+ dst->crc32 = (struct bch_extent_crc32) {
+ common_fields(src),
+ .csum = (u32 __force) *((__le32 *) &src.csum.lo),
+ };
break;
case BCH_EXTENT_ENTRY_crc64:
- set_common_fields(dst->crc64, src);
- dst->crc64.nonce = src.nonce;
- dst->crc64.csum_lo = (u64 __force) src.csum.lo;
- dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi);
+ dst->crc64 = (struct bch_extent_crc64) {
+ common_fields(src),
+ .nonce = src.nonce,
+ .csum_lo = (u64 __force) src.csum.lo,
+ .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi),
+ };
break;
case BCH_EXTENT_ENTRY_crc128:
- set_common_fields(dst->crc128, src);
- dst->crc128.nonce = src.nonce;
- dst->crc128.csum = src.csum;
+ dst->crc128 = (struct bch_extent_crc128) {
+ common_fields(src),
+ .nonce = src.nonce,
+ .csum = src.csum,
+ };
break;
default:
BUG();
@@ -708,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
return durability;
}
@@ -723,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
struct extent_ptr_decoded p;
unsigned durability = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])
durability += bch2_extent_ptr_durability(c, &p);
- rcu_read_unlock();
-
return durability;
}
@@ -885,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_dev *ca;
- bool ret = false;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
(ca = bch2_dev_rcu(c, ptr->dev)) &&
(!ptr->cached ||
- !dev_ptr_stale_rcu(ca, ptr))) {
- ret = true;
- break;
- }
- rcu_read_unlock();
+ !dev_ptr_stale_rcu(ca, ptr)))
+ return true;
- return ret;
+ return false;
}
bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
@@ -991,13 +1113,14 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
struct bch_extent_ptr *ptr)
{
- if (!opts->promote_target ||
- !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
+ unsigned target = opts->promote_target ?: opts->foreground_target;
+
+ if (target && !bch2_dev_in_target(c, ptr->dev, target))
return false;
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
- return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+ return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr);
}
void bch2_extent_ptr_set_cached(struct bch_fs *c,
@@ -1005,33 +1128,48 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
struct bkey_s k,
struct bch_extent_ptr *ptr)
{
- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+ struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
+ bool have_cached_ptr;
+ unsigned drop_dev = ptr->dev;
- rcu_read_lock();
- if (!want_cached_ptr(c, opts, ptr)) {
- bch2_bkey_drop_ptr_noerror(k, ptr);
- goto out;
- }
+ guard(rcu)();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(k);
+ have_cached_ptr = false;
- /*
- * Stripes can't contain cached data, for - reasons.
- *
- * Possibly something we can fix in the future?
- */
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (&entry->ptr == ptr) {
- if (p.has_ec)
- bch2_bkey_drop_ptr_noerror(k, ptr);
- else
- ptr->cached = true;
- goto out;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+ /*
+ * Check if it's erasure coded - stripes can't contain cached
+ * data. Possibly something we can fix in the future?
+ */
+ if (&entry->ptr == ptr && p.has_ec)
+ goto drop;
+
+ if (p.ptr.cached) {
+ if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
+ bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
+ ptr = NULL;
+ goto restart_drop_ptrs;
+ }
+
+ have_cached_ptr = true;
}
+ }
- BUG();
-out:
- rcu_read_unlock();
+ if (!ptr)
+ bkey_for_each_ptr(ptrs, ptr2)
+ if (ptr2->dev == drop_dev)
+ ptr = ptr2;
+
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
+ goto drop;
+
+ ptr->cached = true;
+ return;
+drop:
+ bch2_bkey_drop_ptr_noerror(k, ptr);
}
/*
@@ -1046,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
{
struct bch_dev *ca;
- rcu_read_lock();
+ guard(rcu)();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
(!(ca = bch2_dev_rcu(c, ptr->dev)) ||
dev_ptr_stale_rcu(ca, ptr) > 0));
- rcu_read_unlock();
return bkey_deleted(k.k);
}
@@ -1069,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c,
struct bkey_ptrs ptrs;
bool have_cached_ptr;
- rcu_read_lock();
+ guard(rcu)();
restart_drop_ptrs:
ptrs = bch2_bkey_ptrs(k);
have_cached_ptr = false;
@@ -1082,7 +1219,6 @@ restart_drop_ptrs:
}
have_cached_ptr = true;
}
- rcu_read_unlock();
return bkey_deleted(k.k);
}
@@ -1090,7 +1226,7 @@ restart_drop_ptrs:
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
{
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
if (!ca) {
prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
@@ -1114,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc
else if (stale)
prt_printf(out, " invalid");
}
- rcu_read_unlock();
--out->atomic;
}
@@ -1220,6 +1355,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
bch2_extent_rebalance_to_text(out, c, &entry->rebalance);
break;
+ case BCH_EXTENT_ENTRY_flags:
+ prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
+ break;
+
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@@ -1376,11 +1515,16 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
- return -BCH_ERR_invalid_bkey;
+ return bch_err_throw(c, invalid_bkey);
}
#endif
break;
}
+ case BCH_EXTENT_ENTRY_flags:
+ bkey_fsck_err_on(entry != ptrs.start,
+ c, extent_flags_not_at_start,
+ "extent flags entry not at start");
+ break;
}
}
@@ -1447,6 +1591,28 @@ void bch2_ptr_swab(struct bkey_s k)
}
}
+int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags)
+{
+ int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags);
+ if (ret)
+ return ret;
+
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+
+ if (ptrs.start != ptrs.end &&
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) {
+ ptrs.start->flags.flags = flags;
+ } else {
+ struct bch_extent_flags f = {
+ .type = BIT(BCH_EXTENT_ENTRY_flags),
+ .flags = flags,
+ };
+ __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f);
+ }
+
+ return 0;
+}
+
/* Generic extent code: */
int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1492,8 +1658,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
entry->crc128.offset += sub;
break;
case BCH_EXTENT_ENTRY_stripe_ptr:
- break;
case BCH_EXTENT_ENTRY_rebalance:
+ case BCH_EXTENT_ENTRY_flags:
break;
}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 620b284aa34f..b8590e51b76e 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
({ \
__label__ out; \
\
- (_ptr).idx = 0; \
- (_ptr).has_ec = false; \
+ (_ptr).has_ec = false; \
+ (_ptr).do_ec_reconstruct = false; \
+ (_ptr).crc_retry_nr = 0; \
\
__bkey_extent_entry_for_each_from(_entry, _end, _entry) \
switch (__extent_entry_type(_entry)) { \
@@ -379,13 +380,6 @@ out: \
/* Iterate over pointers in KEY_TYPE_extent: */
-#define extent_for_each_entry_from(_e, _entry, _start) \
- __bkey_extent_entry_for_each_from(_start, \
- extent_entry_last(_e), _entry)
-
-#define extent_for_each_entry(_e, _entry) \
- extent_for_each_entry_from(_e, _entry, (_e).v->start)
-
#define extent_ptr_next(_e, _ptr) \
__bkey_ptr_next(_ptr, extent_entry_last(_e))
@@ -398,13 +392,16 @@ out: \
/* utility code common to all keys with pointers: */
+void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
+ struct bch_io_failures *);
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
- struct extent_ptr_decoded *);
+ struct extent_ptr_decoded *, bool);
+void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
- struct extent_ptr_decoded *);
+ struct extent_ptr_decoded *, int);
/* KEY_TYPE_btree_ptr: */
@@ -704,7 +701,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
ptr1.unwritten == ptr2.unwritten &&
ptr1.offset == ptr2.offset &&
ptr1.dev == ptr2.dev &&
- ptr1.dev == ptr2.dev);
+ ptr1.gen == ptr2.gen);
}
void bch2_ptr_swab(struct bkey_s);
@@ -753,4 +750,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
k->size = new_size;
}
+static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs)
+{
+ if (ptrs.start != ptrs.end &&
+ extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags)
+ return ptrs.start->flags.flags;
+ return 0;
+}
+
+static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k)
+{
+ return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k));
+}
+
+int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64);
+
#endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
index c198dfc376d6..74c0252cbd98 100644
--- a/fs/bcachefs/extents_format.h
+++ b/fs/bcachefs/extents_format.h
@@ -79,8 +79,9 @@
x(crc64, 2) \
x(crc128, 3) \
x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
+ x(rebalance, 5) \
+ x(flags, 6)
+#define BCH_EXTENT_ENTRY_MAX 7
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr {
#endif
};
+#define BCH_EXTENT_FLAGS() \
+ x(poisoned, 0)
+
+enum bch_extent_flags_e {
+#define x(n, v) BCH_EXTENT_FLAG_##n = v,
+ BCH_EXTENT_FLAGS()
+#undef x
+};
+
+struct bch_extent_flags {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:7,
+ flags:57;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 flags:57,
+ type:7;
+#endif
+};
+
/* bch_extent_rebalance: */
#include "rebalance_format.h"
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
index 43d6c341ecca..b23ce4a373c0 100644
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
};
struct extent_ptr_decoded {
- unsigned idx;
bool has_ec;
+ bool do_ec_reconstruct;
+ u8 crc_retry_nr;
struct bch_extent_crc_unpacked crc;
struct bch_extent_ptr ptr;
struct bch_extent_stripe_ptr ec;
@@ -31,10 +32,11 @@ struct bch_io_failures {
u8 nr;
struct bch_dev_io_failures {
u8 dev;
- u8 idx;
- u8 nr_failed;
- u8 nr_retries;
- } devs[BCH_REPLICAS_MAX];
+ unsigned failed_csum_nr:6,
+ failed_io:1,
+ failed_btree_validate:1,
+ failed_ec:1;
+ } devs[BCH_REPLICAS_MAX + 1];
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
index 2eaffe37b5e7..0e742555cb0a 100644
--- a/fs/bcachefs/eytzinger.c
+++ b/fs/bcachefs/eytzinger.c
@@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr
return cmp(a, b, priv);
}
-static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size,
cmp_r_func_t cmp_func, const void *priv,
size_t l, size_t r)
{
- return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
+ return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size,
+ base1 + inorder_to_eytzinger1(r, n) * size,
cmp_func, priv);
}
-static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size,
swap_r_func_t swap_func, const void *priv,
size_t l, size_t r)
{
- do_swap(base + inorder_to_eytzinger0(l, n) * size,
- base + inorder_to_eytzinger0(r, n) * size,
+ do_swap(base1 + inorder_to_eytzinger1(l, n) * size,
+ base1 + inorder_to_eytzinger1(r, n) * size,
size, swap_func, priv);
}
-void eytzinger0_sort_r(void *base, size_t n, size_t size,
- cmp_r_func_t cmp_func,
- swap_r_func_t swap_func,
- const void *priv)
+static void eytzinger1_sort_r(void *base1, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
{
- int i, j, k;
+ unsigned i, j, k;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
- if (is_aligned(base, size, 8))
+ if (is_aligned(base1, size, 8))
swap_func = SWAP_WORDS_64;
- else if (is_aligned(base, size, 4))
+ else if (is_aligned(base1, size, 4))
swap_func = SWAP_WORDS_32;
else
swap_func = SWAP_BYTES;
}
/* heapify */
- for (i = n / 2 - 1; i >= 0; --i) {
+ for (i = n / 2; i >= 1; --i) {
/* Find the sift-down path all the way to the leaves. */
- for (j = i; k = j * 2 + 1, k + 1 < n;)
- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+ for (j = i; k = j * 2, k < n;)
+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
- if (j * 2 + 2 == n)
- j = j * 2 + 1;
+ if (j * 2 == n)
+ j *= 2;
/* Backtrack to the correct location. */
- while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0)
- j = (j - 1) / 2;
+ while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0)
+ j /= 2;
/* Shift the element into its correct place. */
for (k = j; j != i;) {
- j = (j - 1) / 2;
- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ j /= 2;
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
}
}
/* sort */
- for (i = n - 1; i > 0; --i) {
- eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+ for (i = n; i > 1; --i) {
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i);
/* Find the sift-down path all the way to the leaves. */
- for (j = 0; k = j * 2 + 1, k + 1 < i;)
- j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
+ for (j = 1; k = j * 2, k + 1 < i;)
+ j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1;
/* Special case for the last leaf with no sibling. */
- if (j * 2 + 2 == i)
- j = j * 2 + 1;
+ if (j * 2 + 1 == i)
+ j *= 2;
/* Backtrack to the correct location. */
- while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0)
- j = (j - 1) / 2;
+ while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0)
+ j /= 2;
/* Shift the element into its correct place. */
- for (k = j; j;) {
- j = (j - 1) / 2;
- eytzinger0_do_swap(base, n, size, swap_func, priv, j, k);
+ for (k = j; j > 1;) {
+ j /= 2;
+ eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k);
}
}
}
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+ cmp_r_func_t cmp_func,
+ swap_r_func_t swap_func,
+ const void *priv)
+{
+ void *base1 = base - size;
+
+ return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv);
+}
+
void eytzinger0_sort(void *base, size_t n, size_t size,
cmp_func_t cmp_func,
swap_func_t swap_func)
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 0541192d7bc0..643c1f716061 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -6,6 +6,7 @@
#include <linux/log2.h>
#ifdef EYTZINGER_DEBUG
+#include <linux/bug.h>
#define EYTZINGER_BUG_ON(cond) BUG_ON(cond)
#else
#define EYTZINGER_BUG_ON(cond)
@@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size)
return rounddown_pow_of_two(size + 1) - 1;
}
-/*
- * eytzinger1_next() and eytzinger1_prev() have the nice properties that
- *
- * eytzinger1_next(0) == eytzinger1_first())
- * eytzinger1_prev(0) == eytzinger1_last())
- *
- * eytzinger1_prev(eytzinger1_first()) == 0
- * eytzinger1_next(eytzinger1_last()) == 0
- */
-
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
- EYTZINGER_BUG_ON(i > size);
+ EYTZINGER_BUG_ON(i == 0 || i > size);
if (eytzinger1_right_child(i) <= size) {
i = eytzinger1_right_child(i);
- i <<= __fls(size + 1) - __fls(i);
+ i <<= __fls(size) - __fls(i);
i >>= i > size;
} else {
i >>= ffz(i) + 1;
@@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
- EYTZINGER_BUG_ON(i > size);
+ EYTZINGER_BUG_ON(i == 0 || i > size);
if (eytzinger1_left_child(i) <= size) {
i = eytzinger1_left_child(i) + 1;
- i <<= __fls(size + 1) - __fls(i);
+ i <<= __fls(size) - __fls(i);
i -= 1;
i >>= i > size;
} else {
@@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
+#define eytzinger0_for_each_prev(_i, _size) \
+ for (unsigned (_i) = eytzinger0_last((_size)); \
+ (_i) != -1; \
+ (_i) = eytzinger0_prev((_i), (_size)))
+
/* return greatest node <= @search, or -1 if not found */
static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
- unsigned i, n = 0;
-
- if (!nr)
- return -1;
-
- do {
- i = n;
- n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
- } while (n < nr);
-
- if (n & 1) {
- /*
- * @i was greater than @search, return previous node:
- *
- * if @i was leftmost/smallest element,
- * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
- */
- return eytzinger0_prev(i, nr);
- } else {
- return i;
- }
+ void *base1 = base - size;
+ unsigned n = 1;
+
+ while (n <= nr)
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
+ n >>= __ffs(n) + 1;
+ return n - 1;
}
+/* return smallest node > @search, or -1 if not found */
static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+ void *base1 = base - size;
+ unsigned n = 1;
- /*
- * if eytitzinger0_find_le() returned -1 - no element was <= search - we
- * want to return the first element; next/prev identities mean this work
- * as expected
- *
- * similarly if find_le() returns last element, we should return -1;
- * identities mean this all works out:
- */
- return eytzinger0_next(idx, nr);
+ while (n <= nr)
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0);
+ n >>= __ffs(n + 1) + 1;
+ return n - 1;
}
+/* return smallest node >= @search, or -1 if not found */
static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
- ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
-
- if (idx < nr && !cmp(base + idx * size, search))
- return idx;
+ void *base1 = base - size;
+ unsigned n = 1;
- return eytzinger0_next(idx, nr);
+ while (n <= nr)
+ n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0);
+ n >>= __ffs(n + 1) + 1;
+ return n - 1;
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
- void *_base = (base); \
+ size_t _size = (size); \
+ void *_base1 = (void *)(base) - _size; \
const void *_search = (search); \
size_t _nr = (nr); \
- size_t _size = (size); \
- size_t _i = 0; \
+ size_t _i = 1; \
int _res; \
\
- while (_i < _nr && \
- (_res = _cmp(_search, _base + _i * _size))) \
- _i = eytzinger0_child(_i, _res > 0); \
- _i; \
+ while (_i <= _nr && \
+ (_res = _cmp(_search, _base1 + _i * _size))) \
+ _i = eytzinger1_child(_i, _res > 0); \
+ _i - 1; \
})
void eytzinger0_sort_r(void *, size_t, size_t,
diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c
new file mode 100644
index 000000000000..2faec143eb31
--- /dev/null
+++ b/fs/bcachefs/fast_list.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Fast, unordered lists
+ *
+ * Supports add, remove, and iterate
+ *
+ * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot
+ * allocation and freeing.
+ *
+ * This means that adding, removing, and iterating over items is lockless,
+ * except when refilling/emptying the percpu slot buffers.
+ */
+
+#include "fast_list.h"
+
+struct fast_list_pcpu {
+ u32 nr;
+ u32 entries[31];
+};
+
+static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp)
+{
+ int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp);
+ if (unlikely(idx < 0))
+ return 0;
+
+ if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) {
+ ida_free(&l->slots_allocated, idx);
+ return 0;
+ }
+
+ return idx;
+}
+
+/**
+ * fast_list_get_idx - get a slot in a fast_list
+ * @l: list to get slot in
+ *
+ * This allocates a slot in the radix tree without storing to it, so that we can
+ * take the potential memory allocation failure early and do the list add later
+ * when we can't take an allocation failure.
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_get_idx(struct fast_list *l)
+{
+ unsigned long flags;
+ int idx;
+retry:
+ local_irq_save(flags);
+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+ if (unlikely(!lp->nr)) {
+ u32 entries[16], nr = 0;
+
+ local_irq_restore(flags);
+ while (nr < ARRAY_SIZE(entries) &&
+ (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
+ entries[nr++] = idx;
+ local_irq_save(flags);
+
+ lp = this_cpu_ptr(l->buffer);
+
+ while (nr && lp->nr < ARRAY_SIZE(lp->entries))
+ lp->entries[lp->nr++] = entries[--nr];
+
+ if (unlikely(nr)) {
+ local_irq_restore(flags);
+ while (nr)
+ ida_free(&l->slots_allocated, entries[--nr]);
+ goto retry;
+ }
+
+ if (unlikely(!lp->nr)) {
+ local_irq_restore(flags);
+ return -ENOMEM;
+ }
+ }
+
+ idx = lp->entries[--lp->nr];
+ local_irq_restore(flags);
+
+ return idx;
+}
+
+/**
+ * fast_list_add - add an item to a fast_list
+ * @l: list
+ * @item: item to add
+ *
+ * Allocates a slot in the radix tree and stores to it and then returns the
+ * slot index, which must be passed to fast_list_remove().
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_add(struct fast_list *l, void *item)
+{
+ int idx = fast_list_get_idx(l);
+ if (idx < 0)
+ return idx;
+
+ *genradix_ptr_inlined(&l->items, idx) = item;
+ return idx;
+}
+
+/**
+ * fast_list_remove - remove an item from a fast_list
+ * @l: list
+ * @idx: item's slot index
+ *
+ * Zeroes out the slot in the radix tree and frees the slot for future
+ * fast_list_add() operations.
+ */
+void fast_list_remove(struct fast_list *l, unsigned idx)
+{
+ u32 entries[16], nr = 0;
+ unsigned long flags;
+
+ if (!idx)
+ return;
+
+ *genradix_ptr_inlined(&l->items, idx) = NULL;
+
+ local_irq_save(flags);
+ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+ if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
+ while (nr < ARRAY_SIZE(entries))
+ entries[nr++] = lp->entries[--lp->nr];
+
+ lp->entries[lp->nr++] = idx;
+ local_irq_restore(flags);
+
+ if (unlikely(nr))
+ while (nr)
+ ida_free(&l->slots_allocated, entries[--nr]);
+}
+
+void fast_list_exit(struct fast_list *l)
+{
+ /* XXX: warn if list isn't empty */
+ free_percpu(l->buffer);
+ ida_destroy(&l->slots_allocated);
+ genradix_free(&l->items);
+}
+
+int fast_list_init(struct fast_list *l)
+{
+ genradix_init(&l->items);
+ ida_init(&l->slots_allocated);
+ l->buffer = alloc_percpu(*l->buffer);
+ if (!l->buffer)
+ return -ENOMEM;
+ return 0;
+}
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
new file mode 100644
index 000000000000..73c9bf591fd6
--- /dev/null
+++ b/fs/bcachefs/fast_list.h
@@ -0,0 +1,41 @@
+#ifndef _LINUX_FAST_LIST_H
+#define _LINUX_FAST_LIST_H
+
+#include <linux/generic-radix-tree.h>
+#include <linux/idr.h>
+#include <linux/percpu.h>
+
+struct fast_list_pcpu;
+
+struct fast_list {
+ GENRADIX(void *) items;
+ struct ida slots_allocated;;
+ struct fast_list_pcpu __percpu
+ *buffer;
+};
+
+static inline void *fast_list_iter_peek(struct genradix_iter *iter,
+ struct fast_list *list)
+{
+ void **p;
+ while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
+ genradix_iter_advance(iter, &list->items);
+
+ return p ? *p : NULL;
+}
+
+#define fast_list_for_each_from(_list, _iter, _i, _start) \
+ for (_iter = genradix_iter_init(&(_list)->items, _start); \
+ (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \
+ genradix_iter_advance(&(_iter), &(_list)->items))
+
+#define fast_list_for_each(_list, _iter, _i) \
+ fast_list_for_each_from(_list, _iter, _i, 0)
+
+int fast_list_get_idx(struct fast_list *l);
+int fast_list_add(struct fast_list *l, void *item);
+void fast_list_remove(struct fast_list *l, unsigned idx);
+void fast_list_exit(struct fast_list *l);
+int fast_list_init(struct fast_list *l);
+
+#endif /* _LINUX_FAST_LIST_H */
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index ab1d5db2fa56..1c54b9b5bd69 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans,
if (!get_more)
break;
+ unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio);
+
+ if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping))
+ break;
+
+ unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS);
+
+ /* ensure proper alignment */
+ order = min(order, __ffs(folio_offset|BIT(31)));
+
folio = xa_load(&iter->mapping->i_pages, folio_offset);
if (folio && !xa_is_value(folio))
break;
- folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order);
if (!folio)
break;
@@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
- int flags = BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE;
+ int flags = BCH_READ_retry_if_stale|
+ BCH_READ_may_promote;
int ret = 0;
- rbio->c = c;
- rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
bch2_bkey_buf_init(&sk);
@@ -175,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, rbio->bio.bi_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -211,14 +219,29 @@ static void bchfs_read(struct btree_trans *trans,
swap(rbio->bio.bi_iter.bi_size, bytes);
if (rbio->bio.bi_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_last_fragment;
bch2_bio_page_state_set(&rbio->bio, k);
bch2_read_extent(trans, rbio, iter.pos,
data_btree, k, offset_into_extent, flags);
+ /*
+ * Careful there's a landmine here if bch2_read_extent() ever
+ * starts returning transaction restarts here.
+ *
+ * We've changed rbio->bi_iter.bi_size to be "bytes we can read
+ * from this extent" with the swap call, and we restore it
+ * below. That restore needs to come before checking for
+ * errors.
+ *
+ * But unlike __bch2_read(), we use the rbio bvec iter, not one
+ * on the stack, so we can't do the restore right after the
+ * bch2_read_extent() call: we don't own that iterator anymore
+ * if BCH_READ_last_fragment is set, since we may have submitted
+ * that rbio instead of cloning it.
+ */
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if (flags & BCH_READ_last_fragment)
break;
swap(rbio->bio.bi_iter.bi_size, bytes);
@@ -232,7 +255,8 @@ err:
if (ret) {
struct printbuf buf = PRINTBUF;
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9);
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9));
prt_printf(&buf, "read error %i from btree lookup", ret);
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
@@ -280,12 +304,13 @@ void bch2_readahead(struct readahead_control *ractl)
struct bch_read_bio *rbio =
rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
GFP_KERNEL, &c->bio_read),
- opts);
+ c,
+ opts,
+ bch2_readpages_end_io);
readpage_iter_advance(&readpages_iter);
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- rbio->bio.bi_end_io = bch2_readpages_end_io;
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
bchfs_read(trans, rbio, inode_inum(inode),
@@ -323,10 +348,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
- opts);
+ c,
+ opts,
+ bch2_read_single_folio_end_io);
rbio->bio.bi_private = &done;
- rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
rbio->bio.bi_iter.bi_sector = folio_sector(folio);
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
@@ -369,17 +394,9 @@ struct bch_writepage_state {
struct bch_io_opts opts;
struct bch_folio_sector *tmp;
unsigned tmp_sectors;
+ struct blk_plug plug;
};
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
- struct bch_inode_info *inode)
-{
- struct bch_writepage_state ret = { 0 };
-
- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
- return ret;
-}
-
/*
* Determine when a writepage io is full. We have to limit writepage bios to a
* single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
@@ -420,7 +437,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
}
}
- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+ if (io->op.flags & BCH_WRITE_wrote_data_inline) {
bio_for_each_folio_all(fi, bio) {
struct bch_folio *s;
@@ -641,23 +658,23 @@ do_io:
int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct bch_fs *c = mapping->host->i_sb->s_fs_info;
- struct bch_writepage_state w =
- bch_writepage_state_init(c, to_bch_ei(mapping->host));
- struct blk_plug plug;
- int ret;
+ struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL);
- blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
- if (w.io)
- bch2_writepage_do_io(&w);
- blk_finish_plug(&plug);
- kfree(w.tmp);
+ bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode);
+
+ blk_start_plug(&w->plug);
+ int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w);
+ if (w->io)
+ bch2_writepage_do_io(w);
+ blk_finish_plug(&w->plug);
+ kfree(w->tmp);
+ kfree(w);
return bch2_err_class(ret);
}
/* buffered writes: */
-int bch2_write_begin(struct file *file, struct address_space *mapping,
+int bch2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -740,7 +757,7 @@ err_unlock:
return bch2_err_class(ret);
}
-int bch2_write_end(struct file *file, struct address_space *mapping,
+int bch2_write_end(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h
index 3207ebbb4ab4..14de91c27656 100644
--- a/fs/bcachefs/fs-io-buffered.h
+++ b/fs/bcachefs/fs-io-buffered.h
@@ -10,9 +10,9 @@ int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);
-int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+int bch2_write_begin(const struct kiocb *, struct address_space *, loff_t pos,
unsigned len, struct folio **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
+int bch2_write_end(const struct kiocb *, struct address_space *, loff_t,
unsigned len, unsigned copied, struct folio *, void *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 2089c36b5866..1f5154d9676b 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "enumerated_ref.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-direct.h"
@@ -73,6 +74,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
struct blk_plug plug;
loff_t offset = req->ki_pos;
bool sync = is_sync_kiocb(req);
+ bool split = false;
size_t shorten;
ssize_t ret;
@@ -99,8 +101,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
GFP_KERNEL,
&c->dio_read_bioset);
- bio->bi_end_io = bch2_direct_IO_read_endio;
-
dio = container_of(bio, struct dio_read, rbio.bio);
closure_init(&dio->cl, NULL);
@@ -133,12 +133,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
goto start;
while (iter->count) {
+ split = true;
+
bio = bio_alloc_bioset(NULL,
bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
REQ_OP_READ,
GFP_KERNEL,
&c->bio_read);
- bio->bi_end_io = bch2_direct_IO_read_split_endio;
start:
bio->bi_opf = REQ_OP_READ|REQ_SYNC;
bio->bi_iter.bi_sector = offset >> 9;
@@ -160,7 +161,15 @@ start:
if (iter->count)
closure_get(&dio->cl);
- bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+ struct bch_read_bio *rbio =
+ rbio_init(bio,
+ c,
+ opts,
+ split
+ ? bch2_direct_IO_read_split_endio
+ : bch2_direct_IO_read_endio);
+
+ bch2_read(c, rbio, inode_inum(inode));
}
blk_finish_plug(&plug);
@@ -393,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
@@ -511,8 +520,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
dio->op.devs_need_flush = &inode->ei_devs_need_flush;
if (sync)
- dio->op.flags |= BCH_WRITE_SYNC;
- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+ dio->op.flags |= BCH_WRITE_sync;
+ dio->op.flags |= BCH_WRITE_check_enospc;
ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
bio_sectors(bio), true);
@@ -598,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
prefetch(&inode->ei_inode);
prefetch((void *) &inode->ei_inode + 64);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write))
return -EROFS;
inode_lock(&inode->v);
@@ -667,7 +676,7 @@ err_put_bio:
bio_put(bio);
inode_dio_end(&inode->v);
err_put_write_ref:
- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write);
goto out;
}
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index e072900e6a5b..c2cc405822f2 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c,
if (!reserved) {
bch2_disk_reservation_put(c, &disk_res);
- return -BCH_ERR_ENOSPC_disk_reservation;
+ return bch_err_throw(c, ENOSPC_disk_reservation);
}
break;
}
@@ -605,10 +605,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
- unsigned len;
- loff_t isize;
vm_fault_t ret;
+ loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c));
+ unsigned offset = file_offset - folio_pos(folio);
+ unsigned len = max(PAGE_SIZE, block_bytes(c));
+
+ BUG_ON(offset + len > folio_size(folio));
+
bch2_folio_reservation_init(c, inode, &res);
sb_start_pagefault(inode->v.i_sb);
@@ -623,24 +627,24 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
bch2_pagecache_add_get(inode);
folio_lock(folio);
- isize = i_size_read(&inode->v);
+ u64 isize = i_size_read(&inode->v);
- if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+ if (folio->mapping != mapping || file_offset >= isize) {
folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
goto out;
}
- len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+ len = min_t(unsigned, len, isize - file_offset);
if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
- bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+ bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) {
folio_unlock(folio);
ret = VM_FAULT_SIGBUS;
goto out;
}
- bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+ bch2_set_folio_dirty(c, inode, folio, &res, offset, len);
bch2_folio_reservation_put(c, inode, &res);
folio_wait_stable(folio);
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 94bf34b9b65f..a233f45875e9 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -7,6 +7,7 @@
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "extent_update.h"
@@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio)
struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio);
closure_put(bio->cl);
- percpu_ref_put(&bio->ca->io_ref);
+ enumerated_ref_put(&bio->ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_nocow_flush);
bio_put(&bio->bio);
}
@@ -69,11 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush));
for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) {
- rcu_read_lock();
- ca = rcu_dereference(c->devs[dev]);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
- ca = NULL;
- rcu_read_unlock();
+ scoped_guard(rcu) {
+ ca = rcu_dereference(c->devs[dev]);
+ if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_nocow_flush))
+ ca = NULL;
+ }
if (!ca)
continue;
@@ -144,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c,
void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, s64 sectors)
{
- bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
- "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
- inode->ei_inode.bi_sectors);
+ if (unlikely((s64) inode->v.i_blocks + sectors < 0)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+ inode->ei_inode.bi_sectors);
+
+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf);
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ if (sectors < 0)
+ sectors = -inode->v.i_blocks;
+ else
+ sectors = 0;
+ }
+
inode->v.i_blocks += sectors;
#ifdef CONFIG_BCACHEFS_QUOTA
@@ -205,7 +222,7 @@ static int bch2_flush_inode(struct bch_fs *c,
if (c->opts.journal_flush_disabled)
return 0;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync))
return -EROFS;
u64 seq;
@@ -213,7 +230,7 @@ static int bch2_flush_inode(struct bch_fs *c,
bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?:
bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
- bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync);
return ret;
}
@@ -466,6 +483,7 @@ int bchfs_truncate(struct mnt_idmap *idmap,
ret = bch2_truncate_folio(inode, iattr->ia_size);
if (unlikely(ret < 0))
goto err;
+ ret = 0;
truncate_setsize(&inode->v, iattr->ia_size);
@@ -501,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap,
goto err;
}
- bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
- !bch2_journal_error(&c->journal), c,
- "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
- inode->v.i_ino, (u64) inode->v.i_blocks,
- inode->ei_inode.bi_sectors);
+ if (unlikely(!inode->v.i_size && inode->v.i_blocks &&
+ !bch2_journal_error(&c->journal))) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf,
+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
+ inode->v.i_ino, (u64) inode->v.i_blocks,
+ inode->ei_inode.bi_sectors);
+
+ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf);
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
ret = bch2_setattr_nonsize(idmap, inode, iattr);
err:
@@ -635,9 +662,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (ret)
goto bkey_err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
if ((ret = bkey_err(k)))
goto bkey_err;
@@ -648,13 +675,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
/* already reserved */
if (bkey_extent_is_reservation(k) &&
bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
if (bkey_extent_is_data(k.k) &&
!(mode & FALLOC_FL_ZERO_RANGE)) {
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
continue;
}
@@ -675,7 +702,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (ret)
goto bkey_err;
}
- bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+ bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
if (ret)
goto bkey_err;
@@ -794,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
long ret;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
return -EROFS;
inode_lock(&inode->v);
@@ -818,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
err:
bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
- bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
return bch2_err_class(ret);
}
@@ -998,17 +1025,28 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
POS(inode->v.i_ino, offset >> 9),
POS(inode->v.i_ino, U64_MAX),
inum.subvol, BTREE_ITER_slots, k, ({
- if (k.k->p.inode != inode->v.i_ino) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- offset, MAX_LFS_FILESIZE, 0, false);
- break;
- } else if (!bkey_extent_is_data(k.k)) {
- next_hole = bch2_seek_pagecache_hole(&inode->v,
- max(offset, bkey_start_offset(k.k) << 9),
- k.k->p.offset << 9, 0, false);
-
- if (next_hole < k.k->p.offset << 9)
+ if (k.k->p.inode != inode->v.i_ino ||
+ !bkey_extent_is_data(k.k)) {
+ loff_t start_offset = k.k->p.inode == inode->v.i_ino
+ ? max(offset, bkey_start_offset(k.k) << 9)
+ : offset;
+ loff_t end_offset = k.k->p.inode == inode->v.i_ino
+ ? MAX_LFS_FILESIZE
+ : k.k->p.offset << 9;
+
+ /*
+ * Found a hole in the btree, now make sure it's
+ * a hole in the pagecache. We might have to
+ * keep searching if this hole is entirely dirty
+ * in the page cache:
+ */
+ bch2_trans_unlock(trans);
+ loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v,
+ start_offset, end_offset, 0, false);
+ if (pagecache_hole < end_offset) {
+ next_hole = pagecache_hole;
break;
+ }
} else {
offset = max(offset, bkey_start_offset(k.k) << 9);
}
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 15725b4ce393..4e72e654da96 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -5,8 +5,8 @@
#include "chardev.h"
#include "dirent.h"
#include "fs.h"
-#include "fs-common.h"
#include "fs-ioctl.h"
+#include "namei.h"
#include "quota.h"
#include <linux/compat.h>
@@ -21,180 +21,6 @@
#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */
#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
-struct flags_set {
- unsigned mask;
- unsigned flags;
-
- unsigned projid;
-
- bool set_projinherit;
- bool projinherit;
-};
-
-static int bch2_inode_flags_set(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct bch_fs *c = inode->v.i_sb->s_fs_info;
- /*
- * We're relying on btree locking here for exclusion with other ioctl
- * calls - use the flags in the btree (@bi), not inode->i_flags:
- */
- struct flags_set *s = p;
- unsigned newflags = s->flags;
- unsigned oldflags = bi->bi_flags & s->mask;
-
- if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- if (!S_ISREG(bi->bi_mode) &&
- !S_ISDIR(bi->bi_mode) &&
- (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
- return -EINVAL;
-
- if (s->set_projinherit) {
- bi->bi_fields_set &= ~(1 << Inode_opt_project);
- bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project);
- }
-
- bi->bi_flags &= ~s->mask;
- bi->bi_flags |= newflags;
-
- bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
- return 0;
-}
-
-static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
-{
- unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
-
- return put_user(flags, arg);
-}
-
-static int bch2_ioc_setflags(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- void __user *arg)
-{
- struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
- unsigned uflags;
- int ret;
-
- if (get_user(uflags, (int __user *) arg))
- return -EFAULT;
-
- s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
- if (uflags)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(&inode->v);
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
- ret = -EACCES;
- goto setflags_out;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-
-setflags_out:
- inode_unlock(&inode->v);
- mnt_drop_write_file(file);
- return ret;
-}
-
-static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
- struct fsxattr __user *arg)
-{
- struct fsxattr fa = { 0 };
-
- fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
-
- if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
- fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;
-
- fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
-
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
-
- return 0;
-}
-
-static int fssetxattr_inode_update_fn(struct btree_trans *trans,
- struct bch_inode_info *inode,
- struct bch_inode_unpacked *bi,
- void *p)
-{
- struct flags_set *s = p;
-
- if (s->projid != bi->bi_project) {
- bi->bi_fields_set |= 1U << Inode_opt_project;
- bi->bi_project = s->projid;
- }
-
- return bch2_inode_flags_set(trans, inode, bi, p);
-}
-
-static int bch2_ioc_fssetxattr(struct bch_fs *c,
- struct file *file,
- struct bch_inode_info *inode,
- struct fsxattr __user *arg)
-{
- struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
- struct fsxattr fa;
- int ret;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- s.set_projinherit = true;
- s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0;
- fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
-
- s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
- if (fa.fsx_xflags)
- return -EOPNOTSUPP;
-
- if (fa.fsx_projid >= U32_MAX)
- return -EINVAL;
-
- /*
- * inode fields accessible via the xattr interface are stored with a +1
- * bias, so that 0 means unset:
- */
- s.projid = fa.fsx_projid + 1;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(&inode->v);
- if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
- ret = -EACCES;
- goto err;
- }
-
- mutex_lock(&inode->ei_update_lock);
- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
- bch2_set_projid(c, inode, fa.fsx_projid) ?:
- bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
- ATTR_CTIME);
- mutex_unlock(&inode->ei_update_lock);
-err:
- inode_unlock(&inode->v);
- mnt_drop_write_file(file);
- return ret;
-}
-
static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -218,7 +44,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
int ret = 0;
subvol_inum inum;
- kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
+ kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL);
if (!kname)
return -ENOMEM;
@@ -346,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
if (get_user(flags, arg))
return -EFAULT;
- bch_notice(c, "shutdown by ioctl type %u", flags);
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "shutdown by ioctl type %u", flags);
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
@@ -354,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
if (ret)
break;
bch2_journal_flush(&c->journal);
- bch2_fs_emergency_read_only(c);
+ bch2_fs_emergency_read_only2(c, &buf);
bdev_thaw(c->vfs_sb->s_bdev);
break;
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
case FSOP_GOING_FLAGS_NOLOGFLUSH:
- bch2_fs_emergency_read_only(c);
+ bch2_fs_emergency_read_only2(c, &buf);
break;
default:
ret = -EINVAL;
- break;
+ goto noprint;
}
+ bch2_print_str(c, KERN_ERR, buf.buf);
+noprint:
+ printbuf_exit(&buf);
return ret;
}
@@ -436,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
}
if (dst_dentry->d_inode) {
- error = -BCH_ERR_EEXIST_subvolume_create;
+ error = bch_err_throw(c, EEXIST_subvolume_create);
goto err3;
}
dir = dst_path.dentry->d_inode;
if (IS_DEADDIR(dir)) {
- error = -BCH_ERR_ENOENT_directory_dead;
+ error = bch_err_throw(c, ENOENT_directory_dead);
goto err3;
}
@@ -511,14 +343,12 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
ret = -EXDEV;
goto err;
}
- if (!d_is_positive(victim)) {
- ret = -ENOENT;
- goto err;
- }
- ret = __bch2_unlink(dir, victim, true);
+
+ ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?:
+ __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
- d_delete(victim);
+ d_invalidate(victim);
}
err:
inode_unlock(dir);
@@ -534,23 +364,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
long ret;
switch (cmd) {
- case FS_IOC_GETFLAGS:
- ret = bch2_ioc_getflags(inode, (int __user *) arg);
- break;
-
- case FS_IOC_SETFLAGS:
- ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg);
- break;
-
- case FS_IOC_FSGETXATTR:
- ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg);
- break;
-
- case FS_IOC_FSSETXATTR:
- ret = bch2_ioc_fssetxattr(c, file, inode,
- (void __user *) arg);
- break;
-
case BCHFS_IOC_REINHERIT_ATTRS:
ret = bch2_ioc_reinherit_attrs(c, file, inode,
(void __user *) arg);
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
index d30f9bb056fd..a657e4994b71 100644
--- a/fs/bcachefs/fs-ioctl.h
+++ b/fs/bcachefs/fs-ioctl.h
@@ -2,79 +2,6 @@
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H
-/* Inode flags: */
-
-/* bcachefs inode flags -> vfs inode flags: */
-static const __maybe_unused unsigned bch_flags_to_vfs[] = {
- [__BCH_INODE_sync] = S_SYNC,
- [__BCH_INODE_immutable] = S_IMMUTABLE,
- [__BCH_INODE_append] = S_APPEND,
- [__BCH_INODE_noatime] = S_NOATIME,
-};
-
-/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const __maybe_unused unsigned bch_flags_to_uflags[] = {
- [__BCH_INODE_sync] = FS_SYNC_FL,
- [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
- [__BCH_INODE_append] = FS_APPEND_FL,
- [__BCH_INODE_nodump] = FS_NODUMP_FL,
- [__BCH_INODE_noatime] = FS_NOATIME_FL,
-};
-
-/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const __maybe_unused unsigned bch_flags_to_xflags[] = {
- [__BCH_INODE_sync] = FS_XFLAG_SYNC,
- [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
- [__BCH_INODE_append] = FS_XFLAG_APPEND,
- [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
- [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
- //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
-};
-
-#define set_flags(_map, _in, _out) \
-do { \
- unsigned _i; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & (1 << _i)) \
- (_out) |= _map[_i]; \
- else \
- (_out) &= ~_map[_i]; \
-} while (0)
-
-#define map_flags(_map, _in) \
-({ \
- unsigned _out = 0; \
- \
- set_flags(_map, _in, _out); \
- _out; \
-})
-
-#define map_flags_rev(_map, _in) \
-({ \
- unsigned _i, _out = 0; \
- \
- for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
- if ((_in) & _map[_i]) { \
- (_out) |= 1 << _i; \
- (_in) &= ~_map[_i]; \
- } \
- (_out); \
-})
-
-#define map_defined(_map) \
-({ \
- unsigned _in = ~0; \
- \
- map_flags_rev(_map, _in); \
-})
-
-/* Set VFS inode flags from bcachefs inode: */
-static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
-{
- set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
-}
-
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 90ade8f648d9..687af0eea0c2 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -11,7 +11,6 @@
#include "errcode.h"
#include "extents.h"
#include "fs.h"
-#include "fs-common.h"
#include "fs-io.h"
#include "fs-ioctl.h"
#include "fs-io-buffered.h"
@@ -22,6 +21,7 @@
#include "io_read.h"
#include "journal.h"
#include "keylist.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "snapshot.h"
@@ -33,6 +33,7 @@
#include <linux/backing-dev.h>
#include <linux/exportfs.h>
#include <linux/fiemap.h>
+#include <linux/fileattr.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/pagemap.h>
@@ -51,6 +52,24 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_inode_unpacked *,
struct bch_subvolume *);
+/* Set VFS inode flags from bcachefs inode: */
+static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
+{
+ static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+ [__BCH_INODE_sync] = S_SYNC,
+ [__BCH_INODE_immutable] = S_IMMUTABLE,
+ [__BCH_INODE_append] = S_APPEND,
+ [__BCH_INODE_noatime] = S_NOATIME,
+ };
+
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
+
+ if (bch2_inode_casefold(c, &inode->ei_inode))
+ inode->v.i_flags |= S_CASEFOLD;
+ else
+ inode->v.i_flags &= ~S_CASEFOLD;
+}
+
void bch2_inode_update_after_write(struct btree_trans *trans,
struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
@@ -79,7 +98,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
inode->ei_inode = *bi;
- bch2_inode_flags_to_vfs(inode);
+ bch2_inode_flags_to_vfs(c, inode);
}
int __must_check bch2_write_inode(struct bch_fs *c,
@@ -88,7 +107,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
void *p, unsigned fields)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bch_inode_unpacked inode_u;
int ret;
retry:
@@ -105,8 +124,9 @@ retry:
goto err;
struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
+ bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
- if (memcmp(&old_r, &new_r, sizeof(new_r))) {
+ if (rebalance_changed) {
ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum);
if (ret)
goto err;
@@ -127,6 +147,9 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ if (rebalance_changed)
+ bch2_rebalance_wakeup(c);
+
bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
"%s: inode %llu:%llu not found when updating",
bch2_err_str(ret),
@@ -172,11 +195,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
return ret;
}
-static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
-{
- return a.subvol == b.subvol && a.inum == b.inum;
-}
-
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
const subvol_inum *inum = data;
@@ -333,9 +351,8 @@ repeat:
if (!trans) {
__wait_on_freeing_inode(c, inode, inum);
} else {
- bch2_trans_unlock(trans);
- __wait_on_freeing_inode(c, inode, inum);
- int ret = bch2_trans_relock(trans);
+ int ret = drop_locks_do(trans,
+ (__wait_on_freeing_inode(c, inode, inum), 0));
if (ret)
return ERR_PTR(ret);
}
@@ -631,17 +648,24 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter dirent_iter = {};
subvol_inum inum = {};
struct printbuf buf = PRINTBUF;
+ struct qstr lookup_name;
+ int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
+ if (ret)
+ return ERR_PTR(ret);
+
+ struct btree_iter dirent_iter = {};
struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
- dir_hash_info, dir, name, 0);
- int ret = bkey_err(k);
+ dir_hash_info, dir, &lookup_name, 0);
+ ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);
- ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+ ret = bch2_dirent_read_target(trans, dir, d, &inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
@@ -651,30 +675,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
if (inode)
goto out;
+ /*
+ * Note: if check/repair needs it, we commit before
+ * bch2_inode_hash_init_insert(), as after that point we can't take a
+ * restart - not in the top level loop with a commit_do(), like we
+ * usually do:
+ */
+
struct bch_subvolume subvol;
struct bch_inode_unpacked inode_u;
ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+ bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol));
+ /*
+ * don't remove it: check_inodes might find another inode that points
+ * back to this dirent
+ */
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
- c, "dirent to missing inode:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ c, "dirent to missing inode:\n%s",
+ (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf));
if (ret)
goto err;
-
- /* regular files may have hardlinks: */
- if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) &&
- !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)),
- c,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, &inode_u),
- buf.buf))) {
- ret = -ENOENT;
- goto err;
- }
out:
bch2_trans_iter_exit(trans, &dirent_iter);
printbuf_exit(&buf);
@@ -698,6 +722,21 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
if (IS_ERR(inode))
inode = NULL;
+ if (!inode && IS_CASEFOLDED(vdir)) {
+ /*
+ * Do not cache a negative dentry in casefolded directories
+ * as it would need to be invalidated in the following situation:
+ * - Lookup file "blAH" in a casefolded directory
+ * - Creation of file "BLAH" in a casefolded directory
+ * - Lookup file "blAH" in a casefolded directory
+ * which would fail if we had a negative dentry.
+ *
+ * We should come back to this when VFS has a method to handle
+ * this edgecase.
+ */
+ return NULL;
+ }
+
return d_splice_alias(&inode->v, dentry);
}
@@ -806,6 +845,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
*/
set_nlink(&inode->v, 0);
}
+
+ if (IS_CASEFOLDED(vdir))
+ d_invalidate(dentry);
err:
bch2_trans_put(trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@@ -858,10 +900,10 @@ err:
return bch2_err_class(ret);
}
-static int bch2_mkdir(struct mnt_idmap *idmap,
- struct inode *vdir, struct dentry *dentry, umode_t mode)
+static struct dentry *bch2_mkdir(struct mnt_idmap *idmap,
+ struct inode *vdir, struct dentry *dentry, umode_t mode)
{
- return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0);
+ return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0));
}
static int bch2_rename2(struct mnt_idmap *idmap,
@@ -1056,7 +1098,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_qid qid;
struct btree_trans *trans;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
struct bch_inode_unpacked inode_u;
struct posix_acl *acl = NULL;
kuid_t kuid;
@@ -1216,10 +1258,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
return finish_open_simple(file, 0);
}
+struct bch_fiemap_extent {
+ struct bkey_buf kbuf;
+ unsigned flags;
+};
+
static int bch2_fill_extent(struct bch_fs *c,
struct fiemap_extent_info *info,
- struct bkey_s_c k, unsigned flags)
+ struct bch_fiemap_extent *fe)
{
+ struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
+ unsigned flags = fe->flags;
+
+ BUG_ON(!k.k->size);
+
if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1272,110 +1324,225 @@ static int bch2_fill_extent(struct bch_fs *c,
}
}
-static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
- u64 start, u64 len)
+/*
+ * Scan a range of an inode for data in pagecache.
+ *
+ * Intended to be retryable, so don't modify the output params until success is
+ * imminent.
+ */
+static int
+bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
+ bool nonblock)
{
- struct bch_fs *c = vinode->i_sb->s_fs_info;
- struct bch_inode_info *ei = to_bch_ei(vinode);
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_buf cur, prev;
- bool have_extent = false;
- int ret = 0;
+ loff_t dstart, dend;
- ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
- if (ret)
+ dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
+ if (dstart < 0)
+ return dstart;
+
+ if (dstart == *end) {
+ *start = dstart;
+ return 0;
+ }
+
+ dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
+ if (dend < 0)
+ return dend;
+
+ /* race */
+ BUG_ON(dstart == dend);
+
+ *start = dstart;
+ *end = dend;
+ return 0;
+}
+
+/*
+ * Scan a range of pagecache that corresponds to a file mapping hole in the
+ * extent btree. If data is found, fake up an extent key so it looks like a
+ * delalloc extent to the rest of the fiemap processing code.
+ */
+static int
+bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
+ u64 start, u64 end, struct bch_fiemap_extent *cur)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_i_extent *delextent;
+ struct bch_extent_ptr ptr = {};
+ loff_t dstart = start << 9, dend = end << 9;
+ int ret;
+
+ /*
+ * We hold btree locks here so we cannot block on folio locks without
+ * dropping trans locks first. Run a nonblocking scan for the common
+ * case of no folios over holes and fall back on failure.
+ *
+ * Note that dropping locks like this is technically racy against
+ * writeback inserting to the extent tree, but a non-sync fiemap scan is
+ * fundamentally racy with writeback anyways. Therefore, just report the
+ * range as delalloc regardless of whether we have to cycle trans locks.
+ */
+ ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
+ if (ret == -EAGAIN)
+ ret = drop_locks_do(trans,
+ bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
+ if (ret < 0)
return ret;
- struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
- if (start + len < start)
- return -EINVAL;
+ /*
+ * Create a fake extent key in the buffer. We have to add a dummy extent
+ * pointer for the fill code to add an extent entry. It's explicitly
+ * zeroed to reflect delayed allocation (i.e. phys offset 0).
+ */
+ bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
+ delextent = bkey_extent_init(cur->kbuf.k);
+ delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
+ delextent->k.size = (dend - dstart) >> 9;
+ bch2_bkey_append_ptr(&delextent->k_i, ptr);
- start >>= 9;
+ cur->flags = FIEMAP_EXTENT_DELALLOC;
- bch2_bkey_buf_init(&cur);
- bch2_bkey_buf_init(&prev);
- trans = bch2_trans_get(c);
+ return 0;
+}
+
+static int bch2_next_fiemap_extent(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ u64 start, u64 end,
+ struct bch_fiemap_extent *cur)
+{
+ u32 snapshot;
+ int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
+ if (ret)
+ return ret;
+ struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
- POS(ei->v.i_ino, start), 0);
+ SPOS(inode->ei_inum.inum, start, snapshot), 0);
- while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- enum btree_id data_btree = BTREE_ID_extents;
+ struct bkey_s_c k =
+ bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- bch2_trans_begin(trans);
+ u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end;
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
- if (ret)
- continue;
+ ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur);
+ if (ret)
+ goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
- k = bch2_btree_iter_peek_max(&iter, end);
- ret = bkey_err(k);
+ /*
+ * Does the pagecache or the btree take precedence?
+ *
+ * It _should_ be the pagecache, so that we correctly report delalloc
+ * extents when dirty in the pagecache (we're COW, after all).
+ *
+ * But we'd have to add per-sector writeback tracking to
+ * bch_folio_state, otherwise we report delalloc extents for clean
+ * cached data in the pagecache.
+ *
+ * We should do this, but even then fiemap won't report stable mappings:
+ * on bcachefs data moves around in the background (copygc, rebalance)
+ * and we don't provide a way for userspace to lock that out.
+ */
+ if (k.k &&
+ bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
+ pagecache_start)) {
+ bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
+ bch2_cut_front(iter.pos, cur->kbuf.k);
+ bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
+ cur->flags = 0;
+ } else if (k.k) {
+ bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
+ }
+
+ if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
+ unsigned sectors = cur->kbuf.k->k.size;
+ s64 offset_into_extent = 0;
+ enum btree_id data_btree = BTREE_ID_extents;
+ ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
+ &cur->kbuf);
if (ret)
- continue;
+ goto err;
- if (!k.k)
- break;
+ struct bkey_i *k = cur->kbuf.k;
+ sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
- if (!bkey_extent_is_data(k.k) &&
- k.k->type != KEY_TYPE_reservation) {
- bch2_btree_iter_advance(&iter);
- continue;
- }
+ bch2_cut_front(POS(k->k.p.inode,
+ bkey_start_offset(&k->k) + offset_into_extent),
+ k);
+ bch2_key_resize(&k->k, sectors);
+ k->k.p = iter.pos;
+ k->k.p.offset += k->k.size;
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
+ u64 start, u64 len)
+{
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
+ struct bch_inode_info *ei = to_bch_ei(vinode);
+ struct btree_trans *trans;
+ struct bch_fiemap_extent cur, prev;
+ int ret = 0;
+
+ ret = fiemap_prep(&ei->v, info, start, &len, 0);
+ if (ret)
+ return ret;
+
+ if (start + len < start)
+ return -EINVAL;
+
+ start >>= 9;
+ u64 end = (start + len) >> 9;
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
- unsigned sectors = k.k->size - offset_into_extent;
+ bch2_bkey_buf_init(&cur.kbuf);
+ bch2_bkey_buf_init(&prev.kbuf);
+ bkey_init(&prev.kbuf.k->k);
- bch2_bkey_buf_reassemble(&cur, c, k);
+ trans = bch2_trans_get(c);
- ret = bch2_read_indirect_extent(trans, &data_btree,
- &offset_into_extent, &cur);
+ while (start < end) {
+ ret = lockrestart_do(trans,
+ bch2_next_fiemap_extent(trans, ei, start, end, &cur));
if (ret)
- continue;
+ goto err;
- k = bkey_i_to_s_c(cur.k);
- bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
+ BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
+ BUG_ON(cur.kbuf.k->k.p.offset > end);
- sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
+ if (bkey_start_offset(&cur.kbuf.k->k) == end)
+ break;
- bch2_cut_front(POS(k.k->p.inode,
- bkey_start_offset(k.k) +
- offset_into_extent),
- cur.k);
- bch2_key_resize(&cur.k->k, sectors);
- cur.k->k.p = iter.pos;
- cur.k->k.p.offset += cur.k->k.size;
+ start = cur.kbuf.k->k.p.offset;
- if (have_extent) {
+ if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info,
- bkey_i_to_s_c(prev.k), 0);
+ ret = bch2_fill_extent(c, info, &prev);
if (ret)
- break;
+ goto err;
}
- bkey_copy(prev.k, cur.k);
- have_extent = true;
-
- bch2_btree_iter_set_pos(&iter,
- POS(iter.pos.inode, iter.pos.offset + sectors));
+ bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
+ prev.flags = cur.flags;
}
- bch2_trans_iter_exit(trans, &iter);
- if (!ret && have_extent) {
+ if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
- ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
- FIEMAP_EXTENT_LAST);
+ prev.flags |= FIEMAP_EXTENT_LAST;
+ ret = bch2_fill_extent(c, info, &prev);
}
-
+err:
bch2_trans_put(trans);
- bch2_bkey_buf_exit(&cur, c);
- bch2_bkey_buf_exit(&prev, c);
- return ret < 0 ? ret : 0;
+ bch2_bkey_buf_exit(&cur.kbuf, c);
+ bch2_bkey_buf_exit(&prev.kbuf, c);
+
+ return bch2_err_class(ret < 0 ? ret : 0);
}
static const struct vm_operations_struct bch_vm_ops = {
@@ -1384,11 +1551,11 @@ static const struct vm_operations_struct bch_vm_ops = {
.page_mkwrite = bch2_page_mkwrite,
};
-static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
+static int bch2_mmap_prepare(struct vm_area_desc *desc)
{
- file_accessed(file);
+ file_accessed(desc->file);
- vma->vm_ops = &bch_vm_ops;
+ desc->vm_ops = &bch_vm_ops;
return 0;
}
@@ -1404,11 +1571,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
if (!dir_emit_dots(file, ctx))
return 0;
- int ret = bch2_readdir(c, inode_inum(inode), ctx);
+ int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx);
bch_err_fn(c, ret);
return bch2_err_class(ret);
@@ -1430,12 +1598,148 @@ static int bch2_open(struct inode *vinode, struct file *file)
return generic_file_open(vinode, file);
}
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+ [__BCH_INODE_sync] = FS_SYNC_FL,
+ [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+ [__BCH_INODE_append] = FS_APPEND_FL,
+ [__BCH_INODE_nodump] = FS_NODUMP_FL,
+ [__BCH_INODE_noatime] = FS_NOATIME_FL,
+};
+
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+ [__BCH_INODE_sync] = FS_XFLAG_SYNC,
+ [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+ [__BCH_INODE_append] = FS_XFLAG_APPEND,
+ [__BCH_INODE_nodump] = FS_XFLAG_NODUMP,
+ [__BCH_INODE_noatime] = FS_XFLAG_NOATIME,
+};
+
+static int bch2_fileattr_get(struct dentry *dentry,
+ struct file_kattr *fa)
+{
+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+
+ fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
+
+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+
+ if (bch2_inode_casefold(c, &inode->ei_inode))
+ fa->flags |= FS_CASEFOLD_FL;
+
+ fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+ return 0;
+}
+
+struct flags_set {
+ unsigned mask;
+ unsigned flags;
+ unsigned projid;
+ bool set_project;
+ bool set_casefold;
+ bool casefold;
+};
+
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+ struct bch_inode_info *inode,
+ struct bch_inode_unpacked *bi,
+ void *p)
+{
+ struct bch_fs *c = trans->c;
+ struct flags_set *s = p;
+
+ /*
+ * We're relying on btree locking here for exclusion with other ioctl
+ * calls - use the flags in the btree (@bi), not inode->i_flags:
+ */
+ if (!S_ISREG(bi->bi_mode) &&
+ !S_ISDIR(bi->bi_mode) &&
+ (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
+ return -EINVAL;
+
+ if (s->casefold != bch2_inode_casefold(c, bi)) {
+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold);
+ if (ret)
+ return ret;
+ }
+
+ if (s->set_project) {
+ bi->bi_project = s->projid;
+ bi->bi_fields_set |= BIT(Inode_opt_project);
+ }
+
+ bi->bi_flags &= ~s->mask;
+ bi->bi_flags |= s->flags;
+
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
+ return 0;
+}
+
+static int bch2_fileattr_set(struct mnt_idmap *idmap,
+ struct dentry *dentry,
+ struct file_kattr *fa)
+{
+ struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ struct flags_set s = {};
+ int ret;
+
+ if (fa->fsx_valid) {
+ fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT;
+
+ s.mask = map_defined(bch_flags_to_xflags);
+ s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags);
+ if (fa->fsx_xflags)
+ return -EOPNOTSUPP;
+
+ if (fa->fsx_projid >= U32_MAX)
+ return -EINVAL;
+
+ /*
+ * inode fields accessible via the xattr interface are stored with a +1
+ * bias, so that 0 means unset:
+ */
+ if ((inode->ei_inode.bi_project ||
+ fa->fsx_projid) &&
+ inode->ei_inode.bi_project != fa->fsx_projid + 1) {
+ s.projid = fa->fsx_projid + 1;
+ s.set_project = true;
+ }
+ }
+
+ if (fa->flags_valid) {
+ s.mask = map_defined(bch_flags_to_uflags);
+
+ s.set_casefold = true;
+ s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0;
+ fa->flags &= ~FS_CASEFOLD_FL;
+
+ s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags);
+ if (fa->flags)
+ return -EOPNOTSUPP;
+ }
+
+ mutex_lock(&inode->ei_update_lock);
+ ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?:
+ (s.set_project
+ ? bch2_set_projid(c, inode, fa->fsx_projid)
+ : 0) ?:
+ bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
+ ATTR_CTIME);
+ mutex_unlock(&inode->ei_update_lock);
+
+ return bch2_err_class(ret);
+}
+
static const struct file_operations bch_file_operations = {
.open = bch2_open,
.llseek = bch2_llseek,
.read_iter = bch2_read_iter,
.write_iter = bch2_write_iter,
- .mmap = bch2_mmap,
+ .mmap_prepare = bch2_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.fsync = bch2_fsync,
.splice_read = filemap_splice_read,
@@ -1457,6 +1761,8 @@ static const struct inode_operations bch_file_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct inode_operations bch_dir_inode_operations = {
@@ -1477,6 +1783,8 @@ static const struct inode_operations bch_dir_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct file_operations bch_dir_file_operations = {
@@ -1499,6 +1807,8 @@ static const struct inode_operations bch_symlink_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct inode_operations bch_special_inode_operations = {
@@ -1509,6 +1819,8 @@ static const struct inode_operations bch_special_inode_operations = {
.get_inode_acl = bch2_get_acl,
.set_acl = bch2_set_acl,
#endif
+ .fileattr_get = bch2_fileattr_get,
+ .fileattr_set = bch2_fileattr_set,
};
static const struct address_space_operations bch_address_space_operations = {
@@ -1678,30 +1990,30 @@ retry:
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter1, snapshot);
- bch2_btree_iter_set_snapshot(&iter2, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter1, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter2, snapshot);
ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
if (ret)
goto err;
if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
- bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+ bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
- k = bch2_btree_iter_peek_slot(&iter1);
+ k = bch2_btree_iter_peek_slot(trans, &iter1);
ret = bkey_err(k);
if (ret)
goto err;
if (k.k->type != KEY_TYPE_dirent) {
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
goto err;
}
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
if (ret > 0)
- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
if (ret)
goto err;
@@ -1712,7 +2024,7 @@ retry:
* File with multiple hardlinks and our backref is to the wrong
* directory - linear search:
*/
- for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+ for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) {
if (k.k->p.inode > dir->ei_inode.bi_inum)
break;
@@ -1802,7 +2114,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans,
break;
}
- mapping_set_large_folios(inode->v.i_mapping);
+ mapping_set_folio_min_order(inode->v.i_mapping,
+ get_order(trans->c->opts.block_size));
}
static void bch2_free_inode(struct inode *vinode)
@@ -1866,7 +2179,13 @@ static void bch2_evict_inode(struct inode *vinode)
KEY_TYPE_QUOTA_WARN);
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
KEY_TYPE_QUOTA_WARN);
- bch2_inode_rm(c, inode_inum(inode));
+ int ret = bch2_inode_rm(c, inode_inum(inode));
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu",
+ inode->ei_inum.subvol,
+ inode->ei_inum.inum);
+ bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm);
+ }
/*
* If we are deleting, we need it present in the vfs hash table
@@ -2008,50 +2327,13 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static int bch2_remount(struct super_block *sb, int *flags,
- struct bch_opts opts)
-{
- struct bch_fs *c = sb->s_fs_info;
- int ret = 0;
-
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
- if (opts.read_only != c->opts.read_only) {
- down_write(&c->state_lock);
-
- if (opts.read_only) {
- bch2_fs_read_only(c);
-
- sb->s_flags |= SB_RDONLY;
- } else {
- ret = bch2_fs_read_write(c);
- if (ret) {
- bch_err(c, "error going rw: %i", ret);
- up_write(&c->state_lock);
- ret = -EINVAL;
- goto err;
- }
-
- sb->s_flags &= ~SB_RDONLY;
- }
-
- c->opts.read_only = opts.read_only;
-
- up_write(&c->state_lock);
- }
-
- if (opt_defined(opts, errors))
- c->opts.errors = opts.errors;
-err:
- return bch2_err_class(ret);
-}
-
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
bool first = true;
- for_each_online_member(c, ca) {
+ guard(rcu)();
+ for_each_online_member_rcu(c, ca) {
if (!first)
seq_putc(seq, ':');
first = false;
@@ -2163,7 +2445,7 @@ static int bch2_fs_get_tree(struct fs_context *fc)
struct inode *vinode;
struct bch2_opts_parse *opts_parse = fc->fs_private;
struct bch_opts opts = opts_parse->opts;
- darray_str devs;
+ darray_const_str devs;
darray_fs devs_to_fs = {};
int ret;
@@ -2187,14 +2469,17 @@ static int bch2_fs_get_tree(struct fs_context *fc)
if (!IS_ERR(sb))
goto got_sb;
- c = bch2_fs_open(devs.data, devs.nr, opts);
+ c = bch2_fs_open(&devs, &opts);
ret = PTR_ERR_OR_ZERO(c);
if (ret)
goto err;
+ if (opt_defined(opts, discard))
+ set_bit(BCH_FS_discard_mount_opt_set, &c->flags);
+
/* Some options can't be parsed until after the fs is started: */
opts = bch2_opts_empty();
- ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf);
+ ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false);
if (ret)
goto err_stop_fs;
@@ -2204,6 +2489,14 @@ static int bch2_fs_get_tree(struct fs_context *fc)
if (ret)
goto err_stop_fs;
+ /*
+ * We might be doing a RO mount because other options required it, or we
+ * have no alloc info and it's a small image with no room to regenerate
+ * it
+ */
+ if (c->opts.read_only)
+ fc->sb_flags |= SB_RDONLY;
+
sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c);
ret = PTR_ERR_OR_ZERO(sb);
if (ret)
@@ -2234,7 +2527,12 @@ got_sb:
sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid));
- super_set_sysfs_name_uuid(sb);
+
+ if (c->sb.multi_device)
+ super_set_sysfs_name_uuid(sb);
+ else
+ strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name));
+
sb->s_shrink->seeks = 0;
c->vfs_sb = sb;
strscpy(sb->s_id, c->name, sizeof(sb->s_id));
@@ -2245,14 +2543,15 @@ got_sb:
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- for_each_online_member(c, ca) {
- struct block_device *bdev = ca->disk_sb.bdev;
+ scoped_guard(rcu) {
+ for_each_online_member_rcu(c, ca) {
+ struct block_device *bdev = ca->disk_sb.bdev;
- /* XXX: create an anonymous device for multi device filesystems */
- sb->s_bdev = bdev;
- sb->s_dev = bdev->bd_dev;
- percpu_ref_put(&ca->io_ref);
- break;
+ /* XXX: create an anonymous device for multi device filesystems */
+ sb->s_bdev = bdev;
+ sb->s_dev = bdev->bd_dev;
+ break;
+ }
}
c->dev = sb->s_dev;
@@ -2264,6 +2563,12 @@ got_sb:
sb->s_shrink->seeks = 0;
+#ifdef CONFIG_UNICODE
+ if (bch2_fs_casefold_enabled(c))
+ sb->s_encoding = c->cf_encoding;
+ generic_set_sb_d_ops(sb);
+#endif
+
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");
@@ -2300,7 +2605,8 @@ err_stop_fs:
goto err;
err_put_super:
- __bch2_fs_stop(c);
+ if (!sb->s_root)
+ __bch2_fs_stop(c);
deactivate_locked_super(sb);
goto err;
}
@@ -2343,6 +2649,8 @@ static int bch2_fs_parse_param(struct fs_context *fc,
int ret = bch2_parse_one_mount_opt(c, &opts->opts,
&opts->parse_later, param->key,
param->string);
+ if (ret)
+ pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret));
return bch2_err_class(ret);
}
@@ -2351,8 +2659,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
struct bch2_opts_parse *opts = fc->fs_private;
+ struct bch_fs *c = sb->s_fs_info;
+ int ret = 0;
+
+ opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0);
+
+ if (opts->opts.read_only != c->opts.read_only) {
+ down_write(&c->state_lock);
+
+ if (opts->opts.read_only) {
+ bch2_fs_read_only(c);
+
+ sb->s_flags |= SB_RDONLY;
+ } else {
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+ up_write(&c->state_lock);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ sb->s_flags &= ~SB_RDONLY;
+ }
+
+ c->opts.read_only = opts->opts.read_only;
+
+ up_write(&c->state_lock);
+ }
- return bch2_remount(sb, &fc->sb_flags, opts->opts);
+ if (opt_defined(opts->opts, errors))
+ c->opts.errors = opts->opts.errors;
+err:
+ return bch2_err_class(ret);
}
static const struct fs_context_operations bch2_context_ops = {
@@ -2396,7 +2735,7 @@ static struct file_system_type bcache_fs_type = {
.name = "bcachefs",
.init_fs_context = bch2_init_fs_context,
.kill_sb = bch2_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS,
};
MODULE_ALIAS_FS("bcachefs");
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 8fcf7c8e5ede..15c1e890d299 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -10,10 +10,11 @@
#include "dirent.h"
#include "error.h"
#include "fs.h"
-#include "fs-common.h"
#include "fsck.h"
#include "inode.h"
+#include "io_misc.h"
#include "keylist.h"
+#include "namei.h"
#include "recovery_passes.h"
#include "snapshot.h"
#include "super.h"
@@ -23,21 +24,15 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
- struct bkey_s_c_dirent d)
-{
- return inode->bi_dir == d.k->p.inode &&
- inode->bi_dir_offset == d.k->p.offset;
-}
-
-static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+static int dirent_points_to_inode_nowarn(struct bch_fs *c,
+ struct bkey_s_c_dirent d,
struct bch_inode_unpacked *inode)
{
if (d.v->d_type == DT_SUBVOL
? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
: le64_to_cpu(d.v->d_inum) == inode->bi_inum)
return 0;
- return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
+ return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
}
static void dirent_inode_mismatch_msg(struct printbuf *out,
@@ -56,7 +51,7 @@ static int dirent_points_to_inode(struct bch_fs *c,
struct bkey_s_c_dirent dirent,
struct bch_inode_unpacked *inode)
{
- int ret = dirent_points_to_inode_nowarn(dirent, inode);
+ int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
if (ret) {
struct printbuf buf = PRINTBUF;
dirent_inode_mismatch_msg(&buf, c, dirent, inode);
@@ -116,50 +111,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inode_nr)
- break;
- if (!bkey_is_inode(k.k))
- continue;
- ret = bch2_inode_unpack(k, inode);
- goto found;
- }
- ret = -BCH_ERR_ENOENT_inode;
-found:
- bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, inode_nr, snapshot), 0);
- ret = bkey_err(k);
- if (ret)
- goto err;
-
- ret = bkey_is_inode(k.k)
- ? bch2_inode_unpack(k, inode)
- : -BCH_ERR_ENOENT_inode;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
@@ -179,32 +130,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
return 0;
}
-static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bch_inode_unpacked dir_inode;
- struct bch_hash_info dir_hash_info;
- int ret;
-
- ret = lookup_first_inode(trans, pos.inode, &dir_inode);
- if (ret)
- goto err;
-
- dir_hash_info = bch2_hash_info_init(c, &dir_inode);
-
- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent);
-
- ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
- &dir_hash_info, &iter,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter);
-err:
- bch_err_fn(c, ret);
- return ret;
-}
-
/*
* Find any subvolume associated with a tree of snapshots
* We can't rely on master_subvol - it might have been deleted.
@@ -229,7 +154,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans,
goto found;
}
}
- ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol;
+ ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol);
found:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -242,7 +167,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
{
struct bch_fs *c = trans->c;
struct qstr lostfound_str = QSTR("lost+found");
- struct btree_iter lostfound_iter = { NULL };
+ struct btree_iter lostfound_iter = {};
u64 inum = 0;
unsigned d_type = 0;
int ret;
@@ -287,7 +212,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked root_inode;
struct bch_hash_info root_hash_info;
- ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
+ ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0);
bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
root_inum.inum, subvolid);
if (ret)
@@ -306,14 +231,14 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
if (d_type != DT_DIR) {
bch_err(c, "error looking up lost+found: not a directory");
- return -BCH_ERR_ENOENT_not_directory;
+ return bch_err_throw(c, ENOENT_not_directory);
}
/*
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- ret = lookup_inode(trans, inum, snapshot, lostfound);
+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0);
bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
return ret;
@@ -341,7 +266,7 @@ create_lostfound:
u64 cpu = raw_smp_processor_id();
bch2_inode_init_early(c, lostfound);
- bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
+ bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
lostfound->bi_dir = root_inode.bi_inum;
lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
@@ -351,8 +276,8 @@ create_lostfound:
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
- ret = bch2_btree_iter_traverse(&lostfound_iter);
+ bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(trans, &lostfound_iter);
if (ret)
goto err;
@@ -362,6 +287,7 @@ create_lostfound:
&lostfound_str,
lostfound->bi_inum,
&lostfound->bi_dir_offset,
+ BTREE_UPDATE_internal_snapshot_node|
STR_HASH_must_create) ?:
bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
BTREE_UPDATE_internal_snapshot_node);
@@ -377,7 +303,33 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
return false;
- return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+ /*
+ * Subvolume roots are special: older versions of subvolume roots may be
+ * disconnected, it's only the newest version that matters.
+ *
+ * We only keep a single dirent pointing to a subvolume root, i.e.
+ * older versions of snapshots will not have a different dirent pointing
+ * to the same subvolume root.
+ *
+ * This is because dirents that point to subvolumes are only visible in
+ * the parent subvolume - versioning is not needed - and keeping them
+ * around would break fsck, because when we're crossing subvolumes we
+ * don't have a consistent snapshot ID to do check the inode <-> dirent
+ * relationships.
+ *
+ * Thus, a subvolume root that's been renamed after a snapshot will have
+ * a disconnected older version - that's expected.
+ *
+ * Note that taking a snapshot always updates the root inode (to update
+ * the dirent backpointer), so a subvolume root inode with
+ * BCH_INODE_has_child_snapshot is never visible.
+ */
+ if (inode->bi_subvol &&
+ (inode->bi_flags & BCH_INODE_has_child_snapshot))
+ return false;
+
+ return !bch2_inode_has_backpointer(inode) &&
+ !(inode->bi_flags & BCH_INODE_unlinked);
}
static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
@@ -422,6 +374,18 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (inode->bi_subvol) {
inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+ struct btree_iter subvol_iter;
+ struct bkey_i_subvolume *subvol =
+ bch2_bkey_get_mut_typed(trans, &subvol_iter,
+ BTREE_ID_subvolumes, POS(0, inode->bi_subvol),
+ 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(subvol);
+ if (ret)
+ return ret;
+
+ subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL;
+ bch2_trans_iter_exit(trans, &subvol_iter);
+
u64 root_inum;
ret = subvol_lookup(trans, inode->bi_parent_subvol,
&dirent_snapshot, &root_inum);
@@ -437,6 +401,8 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (ret)
return ret;
+ bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum);
+
lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
/* ensure lost+found inode is also present in inode snapshot */
@@ -450,7 +416,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
return ret;
struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
- struct qstr name = (struct qstr) QSTR(name_buf);
+ struct qstr name = QSTR(name_buf);
inode->bi_dir = lostfound.bi_inum;
@@ -462,6 +428,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
&name,
inode->bi_subvol ?: inode->bi_inum,
&inode->bi_dir_offset,
+ BTREE_UPDATE_internal_snapshot_node|
STR_HASH_must_create);
if (ret) {
bch_err_msg(c, ret, "error creating dirent");
@@ -472,6 +439,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (ret)
return ret;
+ {
+ CLASS(printbuf, buf)();
+ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum,
+ inode->bi_snapshot, NULL, &buf);
+ if (ret)
+ return ret;
+
+ bch_info(c, "reattached at %s", buf.buf);
+ }
+
/*
* Fix up inodes in child snapshots: if they should also be reattached
* update the backpointer field, if they should not be we need to emit
@@ -539,16 +516,24 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
- if (!inode->bi_dir)
+ if (!bch2_inode_has_backpointer(inode))
return 0;
+ u32 snapshot = inode->bi_snapshot;
+
+ if (inode->bi_parent_subvol) {
+ int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot);
+ if (ret)
+ return ret;
+ }
+
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot));
+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
int ret = bkey_err(d) ?:
dirent_points_to_inode(c, d, inode) ?:
- __remove_dirent(trans, d.k->p);
+ bch2_fsck_remove_dirent(trans, d.k->p);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -581,7 +566,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
if (!bch2_snapshot_is_leaf(c, snapshotid)) {
bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
}
/*
@@ -595,12 +580,12 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub
u64 cpu = raw_smp_processor_id();
bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
new_inode.bi_subvol = subvolid;
int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
- bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_btree_iter_traverse(trans, &inode_iter) ?:
bch2_inode_write(trans, &inode_iter, &new_inode);
bch2_trans_iter_exit(trans, &inode_iter);
if (ret)
@@ -665,7 +650,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct btree_iter iter = {};
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0));
+ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
bch2_trans_iter_exit(trans, &iter);
int ret = bkey_err(k);
if (ret)
@@ -685,7 +670,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
struct bch_inode_unpacked new_inode;
bch2_inode_init_early(c, &new_inode);
- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
+ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
new_inode.bi_size = i_size;
new_inode.bi_inum = inum;
new_inode.bi_snapshot = snapshot;
@@ -693,11 +678,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
return __bch2_fsck_write_inode(trans, &new_inode);
}
-struct snapshots_seen {
- struct bpos pos;
- snapshot_id_list ids;
-};
-
static inline void snapshots_seen_exit(struct snapshots_seen *s)
{
darray_exit(&s->ids);
@@ -749,14 +729,8 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
u32 id, u32 ancestor)
{
- ssize_t i;
-
EBUG_ON(id > ancestor);
- /* @ancestor should be the snapshot most recently added to @seen */
- EBUG_ON(ancestor != seen->pos.snapshot);
- EBUG_ON(ancestor != darray_last(seen->ids));
-
if (id == ancestor)
return true;
@@ -772,11 +746,8 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
* numerically, since snapshot ID lists are kept sorted, so if we find
* an id that's an ancestor of @id we're done:
*/
-
- for (i = seen->ids.nr - 2;
- i >= 0 && seen->ids.data[i] >= id;
- --i)
- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]))
+ darray_for_each_reverse(seen->ids, i)
+ if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i))
return false;
return true;
@@ -816,13 +787,14 @@ static int ref_visible2(struct bch_fs *c,
#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \
for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \
- (_i)->snapshot <= (_snapshot); _i++) \
- if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+ (_i)->inode.bi_snapshot <= (_snapshot); _i++) \
+ if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot))
struct inode_walker_entry {
struct bch_inode_unpacked inode;
- u32 snapshot;
+ bool whiteout;
u64 count;
+ u64 i_size;
};
struct inode_walker {
@@ -849,13 +821,20 @@ static struct inode_walker inode_walker_init(void)
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
- struct bch_inode_unpacked u;
-
- return bch2_inode_unpack(inode, &u) ?:
- darray_push(&w->inodes, ((struct inode_walker_entry) {
- .inode = u,
- .snapshot = inode.k->p.snapshot,
+ int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
+ .whiteout = !bkey_is_inode(inode.k),
}));
+ if (ret)
+ return ret;
+
+ struct inode_walker_entry *n = &darray_last(w->inodes);
+ if (!n->whiteout) {
+ return bch2_inode_unpack(inode, &n->inode);
+ } else {
+ n->inode.bi_inum = inode.k->p.offset;
+ n->inode.bi_snapshot = inode.k->p.snapshot;
+ return 0;
+ }
}
static int get_inodes_all_snapshots(struct btree_trans *trans,
@@ -875,13 +854,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
w->recalculate_sums = false;
w->inodes.nr = 0;
- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
+ for_each_btree_key_max_norestart(trans, iter,
+ BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ ret = add_inode(c, w, k);
+ if (ret)
break;
-
- if (bkey_is_inode(k.k))
- add_inode(c, w, k);
}
bch2_trans_iter_exit(trans, &iter);
@@ -893,64 +871,6 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return 0;
}
-static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
-{
- bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
-
- struct inode_walker_entry *i;
- __darray_for_each(w->inodes, i)
- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot))
- goto found;
-
- return NULL;
-found:
- BUG_ON(k.k->p.snapshot > i->snapshot);
-
- if (k.k->p.snapshot != i->snapshot && !is_whiteout) {
- struct inode_walker_entry new = *i;
-
- new.snapshot = k.k->p.snapshot;
- new.count = 0;
-
- struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
- "unexpected because we should always update the inode when we update a key in that inode\n"
- "%s",
- w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf);
- printbuf_exit(&buf);
-
- while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot)
- --i;
-
- size_t pos = i - w->inodes.data;
- int ret = darray_insert_item(&w->inodes, pos, new);
- if (ret)
- return ERR_PTR(ret);
-
- i = w->inodes.data + pos;
- }
-
- return i;
-}
-
-static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
- struct inode_walker *w,
- struct bkey_s_c k)
-{
- if (w->last_pos.inode != k.k->p.inode) {
- int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
- if (ret)
- return ERR_PTR(ret);
- }
-
- w->last_pos = k.k->p;
-
- return lookup_inode_for_snapshot(trans->c, w, k);
-}
-
static int get_visible_inodes(struct btree_trans *trans,
struct inode_walker *w,
struct snapshots_seen *s,
@@ -986,6 +906,89 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
+static struct inode_walker_entry *
+lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
+{
+ struct bch_fs *c = trans->c;
+
+ struct inode_walker_entry *i = darray_find_p(w->inodes, i,
+ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot));
+
+ if (!i)
+ return NULL;
+
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
+ trans, snapshot_key_missing_inode_snapshot,
+ "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+ "unexpected because we should always update the inode when we update a key in that inode\n"
+ "%s",
+ w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
+ (bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ if (!i->whiteout) {
+ struct bch_inode_unpacked new = i->inode;
+ new.bi_snapshot = k.k->p.snapshot;
+ ret = __bch2_fsck_write_inode(trans, &new);
+ } else {
+ struct bkey_i whiteout;
+ bkey_init(&whiteout.k);
+ whiteout.k.type = KEY_TYPE_whiteout;
+ whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot);
+ ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+ &whiteout,
+ BTREE_UPDATE_internal_snapshot_node);
+ }
+
+ if (ret)
+ goto fsck_err;
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0);
+ if (ret)
+ goto fsck_err;
+
+ struct inode_walker_entry new_entry = *i;
+
+ new_entry.inode.bi_snapshot = k.k->p.snapshot;
+ new_entry.count = 0;
+ new_entry.i_size = 0;
+
+ while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
+ --i;
+
+ size_t pos = i - w->inodes.data;
+ ret = darray_insert_item(&w->inodes, pos, new_entry);
+ if (ret)
+ goto fsck_err;
+
+ ret = bch_err_throw(c, transaction_restart_nested);
+ goto fsck_err;
+ }
+
+ printbuf_exit(&buf);
+ return i;
+fsck_err:
+ printbuf_exit(&buf);
+ return ERR_PTR(ret);
+}
+
+static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
+ struct inode_walker *w,
+ struct bkey_s_c k)
+{
+ if (w->last_pos.inode != k.k->p.inode) {
+ int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ w->last_pos = k.k->p;
+
+ return lookup_inode_for_snapshot(trans, w, k);
+}
+
/*
* Prefer to delete the first one, since that will be the one at the wrong
* offset:
@@ -1005,7 +1008,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans,
int ret = 0;
if (d->v.d_type == DT_SUBVOL) {
- BUG();
+ bch_err(trans->c, "%s does not support DT_SUBVOL", __func__);
+ ret = -BCH_ERR_fsck_repair_unimplemented;
} else {
ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
if (ret)
@@ -1061,11 +1065,28 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
+ if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) &&
+ inode->bi_subvol &&
+ (inode->bi_flags & BCH_INODE_has_child_snapshot)) {
+ /* Older version of a renamed subvolume root: we won't have a
+ * correct dirent for it. That's expected, see
+ * inode_should_reattach().
+ *
+ * We don't clear the backpointer field when doing the rename
+ * because there might be arbitrarily many versions in older
+ * snapshots.
+ */
+ inode->bi_dir = 0;
+ inode->bi_dir_offset = 0;
+ *write_inode = true;
+ goto out;
+ }
+
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
(bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) ||
- fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode),
+ fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode),
trans, inode_points_to_wrong_dirent,
"%s",
(printbuf_reset(&buf),
@@ -1081,7 +1102,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
inode->bi_dir_offset = 0;
*write_inode = true;
}
-
+out:
ret = 0;
fsck_err:
bch2_trans_iter_exit(trans, &dirent_iter);
@@ -1090,63 +1111,6 @@ fsck_err:
return ret;
}
-static int get_snapshot_root_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *root,
- u64 inum)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k))
- goto found_root;
- }
- if (ret)
- goto err;
- BUG();
-found_root:
- ret = bch2_inode_unpack(k, root);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-static int check_directory_size(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- struct bkey_s_c inode_k, bool *write_inode)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 new_size = 0;
- int ret;
-
- for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents,
- SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot),
- POS(inode_k.k->p.offset, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
-
- struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(dirent);
-
- new_size += dirent_occupied_size(&name);
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (!ret && inode_u->bi_size != new_size) {
- inode_u->bi_size = new_size;
- *write_inode = true;
- }
-
- return ret;
-}
-
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -1177,27 +1141,31 @@ static int check_inode(struct btree_trans *trans,
goto err;
if (snapshot_root->bi_inum != u.bi_inum) {
- ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
+ ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root);
if (ret)
goto err;
}
- if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
- trans, inode_snapshot_mismatch,
- "inode hash info in different snapshots don't match")) {
- u.bi_hash_seed = snapshot_root->bi_hash_seed;
- SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
- do_update = true;
+ if (u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) {
+ ret = bch2_repair_inode_hash_info(trans, snapshot_root);
+ BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented);
+ if (ret)
+ goto err;
}
- if (u.bi_dir || u.bi_dir_offset) {
+ ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
+ if (ret)
+ goto err;
+
+ if (bch2_inode_has_backpointer(&u)) {
ret = check_inode_dirent_inode(trans, &u, &do_update);
if (ret)
goto err;
}
- if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
+ if (fsck_err_on(bch2_inode_has_backpointer(&u) &&
+ (u.bi_flags & BCH_INODE_unlinked),
trans, inode_unlinked_but_has_dirent,
"inode unlinked but has dirent\n%s",
(printbuf_reset(&buf),
@@ -1224,6 +1192,14 @@ static int check_inode(struct btree_trans *trans,
ret = 0;
}
+ if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size,
+ trans, inode_dir_has_nonzero_i_size,
+ "directory %llu:%u with nonzero i_size %lli",
+ u.bi_inum, u.bi_snapshot, u.bi_size)) {
+ u.bi_size = 0;
+ do_update = true;
+ }
+
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
if (ret < 0)
goto err;
@@ -1335,16 +1311,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_journal_seq = journal_cur_seq(&c->journal);
do_update = true;
}
-
- if (S_ISDIR(u.bi_mode)) {
- ret = check_directory_size(trans, &u, k, &do_update);
-
- fsck_err_on(ret,
- trans, directory_size_mismatch,
- "directory inode %llu:%u with the mismatch directory size",
- u.bi_inum, k.k->p.snapshot);
- ret = 0;
- }
do_update:
if (do_update) {
ret = __bch2_fsck_write_inode(trans, &u);
@@ -1496,6 +1462,7 @@ static int check_key_has_inode(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
+ struct btree_iter iter2 = {};
int ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -1503,40 +1470,107 @@ static int check_key_has_inode(struct btree_trans *trans,
if (k.k->type == KEY_TYPE_whiteout)
goto out;
- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
- ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
- if (ret)
- goto err;
+ bool have_inode = i && !i->whiteout;
- inode->last_pos.inode--;
- ret = -BCH_ERR_transaction_restart_nested;
- goto err;
+ if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes)))
+ goto reconstruct;
+
+ if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode))
+ goto out;
+
+ prt_printf(&buf, ", ");
+
+ bool have_old_inode = false;
+ darray_for_each(inode->inodes, i2)
+ if (!i2->whiteout &&
+ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) &&
+ btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) {
+ prt_printf(&buf, "but found good inode in older snapshot\n");
+ bch2_inode_unpacked_to_text(&buf, &i2->inode);
+ prt_newline(&buf);
+ have_old_inode = true;
+ break;
+ }
+
+ struct bkey_s_c k2;
+ unsigned nr_keys = 0;
+
+ prt_printf(&buf, "found keys:\n");
+
+ for_each_btree_key_max_norestart(trans, iter2, iter->btree_id,
+ SPOS(k.k->p.inode, 0, k.k->p.snapshot),
+ POS(k.k->p.inode, U64_MAX),
+ 0, k2, ret) {
+ nr_keys++;
+ if (nr_keys <= 10) {
+ bch2_bkey_val_to_text(&buf, c, k2);
+ prt_newline(&buf);
+ }
+ if (nr_keys >= 100)
+ break;
}
- if (fsck_err_on(!i,
- trans, key_in_missing_inode,
- "key in missing inode:\n %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
+ if (ret)
+ goto err;
- if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
- trans, key_in_wrong_inode_type,
- "key for wrong inode mode %o:\n %s",
- i->inode.bi_mode,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
- goto delete;
+ if (nr_keys > 100)
+ prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys);
+ else if (nr_keys > 10)
+ prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys);
+
+ if (!have_inode) {
+ if (fsck_err_on(!have_inode,
+ trans, key_in_missing_inode,
+ "key in missing inode%s", buf.buf)) {
+ /*
+ * Maybe a deletion that raced with data move, or something
+ * weird like that? But if we know the inode was deleted, or
+ * it's just a few keys, we can safely delete them.
+ *
+ * If it's many keys, we should probably recreate the inode
+ */
+ if (have_old_inode || nr_keys <= 2)
+ goto delete;
+ else
+ goto reconstruct;
+ }
+ } else {
+ /*
+ * not autofix, this one would be a giant wtf - bit error in the
+ * inode corrupting i_mode?
+ *
+ * may want to try repairing inode instead of deleting
+ */
+ if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode),
+ trans, key_in_wrong_inode_type,
+ "key for wrong inode mode %o%s",
+ i->inode.bi_mode, buf.buf))
+ goto delete;
+ }
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &iter2);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
delete:
+ /*
+ * XXX: print out more info
+ * count up extents for this inode, check if we have different inode in
+ * an older snapshot version, perhaps decide if we want to reconstitute
+ */
ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node);
goto out;
+reconstruct:
+ ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ if (ret)
+ goto err;
+
+ inode->last_pos.inode--;
+ ret = bch_err_throw(c, transaction_restart_nested);
+ goto out;
}
static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
@@ -1549,21 +1583,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
if (i->inode.bi_sectors == i->count)
continue;
- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot);
+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot);
if (w->recalculate_sums)
i->count = count2;
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
i->count = count2;
}
if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
trans, inode_i_sectors_wrong,
"inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
- w->last_pos.inode, i->snapshot,
+ w->last_pos.inode, i->inode.bi_snapshot,
i->inode.bi_sectors, i->count)) {
i->inode.bi_sectors = i->count;
ret = bch2_fsck_write_inode(trans, &i->inode);
@@ -1627,7 +1661,7 @@ static int extent_ends_at(struct bch_fs *c,
sizeof(seen->ids.data[0]) * seen->ids.size,
GFP_KERNEL);
if (!n.seen.ids.data)
- return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
+ return bch_err_throw(c, ENOMEM_fsck_extent_ends_at);
__darray_for_each(extent_ends->e, i) {
if (i->snapshot == k.k->p.snapshot) {
@@ -1652,7 +1686,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
- struct btree_iter iter1, iter2 = { NULL };
+ struct btree_iter iter1, iter2 = {};
struct bkey_s_c k1, k2;
int ret;
@@ -1661,32 +1695,32 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter1, btree, pos1,
BTREE_ITER_all_snapshots|
BTREE_ITER_not_extents);
- k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX));
+ k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX));
ret = bkey_err(k1);
if (ret)
goto err;
- prt_str(&buf, "\n ");
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k1);
if (!bpos_eq(pos1, k1.k->p)) {
- prt_str(&buf, "\n wanted\n ");
+ prt_str(&buf, "\nwanted\n ");
bch2_bpos_to_text(&buf, pos1);
- prt_str(&buf, "\n ");
+ prt_str(&buf, "\n");
bch2_bkey_to_text(&buf, &pos2);
bch_err(c, "%s: error finding first overlapping extent when repairing, got%s",
__func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
+ ret = bch_err_throw(c, internal_fsck_err);
goto err;
}
- bch2_trans_copy_iter(&iter2, &iter1);
+ bch2_trans_copy_iter(trans, &iter2, &iter1);
while (1) {
- bch2_btree_iter_advance(&iter2);
+ bch2_btree_iter_advance(trans, &iter2);
- k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX));
+ k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX));
ret = bkey_err(k2);
if (ret)
goto err;
@@ -1695,18 +1729,18 @@ static int overlapping_extents_found(struct btree_trans *trans,
break;
}
- prt_str(&buf, "\n ");
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k2);
if (bpos_gt(k2.k->p, pos2.p) ||
pos2.size != k2.k->size) {
bch_err(c, "%s: error finding seconding overlapping extent when repairing%s",
__func__, buf.buf);
- ret = -BCH_ERR_internal_fsck_err;
+ ret = bch_err_throw(c, internal_fsck_err);
goto err;
}
- prt_printf(&buf, "\n overwriting %s extent",
+ prt_printf(&buf, "\noverwriting %s extent",
pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
if (fsck_err(trans, extent_overlapping,
@@ -1727,6 +1761,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
+ bch_info(c, "repair ret %s", bch2_err_str(ret));
+
if (ret)
goto err;
@@ -1748,7 +1784,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
* We overwrote the second extent - restart
* check_extent() from the top:
*/
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
}
}
fsck_err:
@@ -1872,24 +1908,24 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ if (i->inode.bi_snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
- if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9;
+
+ if (fsck_err_on(k.k->p.offset > last_block &&
!bkey_extent_is_reservation(k),
trans, extent_past_end_of_inode,
- "extent type past end of inode %llu:%u, i_size %llu\n %s",
- i->inode.bi_inum, i->snapshot, i->inode.bi_size,
+ "extent type past end of inode %llu:%u, i_size %llu\n%s",
+ i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- struct btree_iter iter2;
-
- bch2_trans_copy_iter(&iter2, iter);
- bch2_btree_iter_set_snapshot(&iter2, i->snapshot);
- ret = bch2_btree_iter_traverse(&iter2) ?:
- bch2_btree_delete_at(trans, &iter2,
- BTREE_UPDATE_internal_snapshot_node);
- bch2_trans_iter_exit(trans, &iter2);
+ ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?:
+ bch2_fpunch_snapshot(trans,
+ SPOS(i->inode.bi_inum,
+ last_block,
+ i->inode.bi_snapshot),
+ POS(i->inode.bi_inum, U64_MAX));
if (ret)
goto err;
@@ -1907,8 +1943,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes);
inode->inodes.data && i >= inode->inodes.data;
--i) {
- if (i->snapshot > k.k->p.snapshot ||
- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
+ if (i->whiteout ||
+ i->inode.bi_snapshot > k.k->p.snapshot ||
+ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot))
continue;
i->count += k.k->size;
@@ -1990,26 +2027,34 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
if (i->inode.bi_nlink == i->count)
continue;
- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot);
+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot);
if (count2 < 0)
return count2;
if (i->count != count2) {
bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
- w->last_pos.inode, i->snapshot, i->count, count2);
+ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2);
i->count = count2;
if (i->inode.bi_nlink == i->count)
continue;
}
- if (fsck_err_on(i->inode.bi_nlink != i->count,
- trans, inode_dir_wrong_nlink,
- "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
- w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
- i->inode.bi_nlink = i->count;
- ret = bch2_fsck_write_inode(trans, &i->inode);
- if (ret)
- break;
+ if (i->inode.bi_nlink != i->count) {
+ CLASS(printbuf, buf)();
+
+ lockrestart_do(trans,
+ bch2_inum_snapshot_to_path(trans, w->last_pos.inode,
+ i->inode.bi_snapshot, NULL, &buf));
+
+ if (fsck_err_on(i->inode.bi_nlink != i->count,
+ trans, inode_dir_wrong_nlink,
+ "directory with wrong i_nlink: got %u, should be %llu\n%s",
+ i->inode.bi_nlink, i->count, buf.buf)) {
+ i->inode.bi_nlink = i->count;
+ ret = bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ break;
+ }
}
}
fsck_err:
@@ -2017,176 +2062,13 @@ fsck_err:
return ret;
}
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w)
{
u32 restart_count = trans->restart_count;
return check_subdir_count_notnested(trans, w) ?:
trans_was_restarted(trans, restart_count);
}
-noinline_for_stack
-static int check_dirent_inode_dirent(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target)
-{
- struct bch_fs *c = trans->c;
- struct printbuf buf = PRINTBUF;
- struct btree_iter bp_iter = { NULL };
- int ret = 0;
-
- if (inode_points_to_dirent(target, d))
- return 0;
-
- if (!target->bi_dir &&
- !target->bi_dir_offset) {
- fsck_err_on(S_ISDIR(target->bi_mode),
- trans, inode_dir_missing_backpointer,
- "directory with missing backpointer\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
- trans, inode_unlinked_but_has_dirent,
- "inode unlinked but has dirent\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n"),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf));
-
- target->bi_flags &= ~BCH_INODE_unlinked;
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- return __bch2_fsck_write_inode(trans, target);
- }
-
- if (bch2_inode_should_have_single_bp(target) &&
- !fsck_err(trans, inode_wrong_backpointer,
- "dirent points to inode that does not point back:\n %s",
- (bch2_bkey_val_to_text(&buf, c, d.s_c),
- prt_printf(&buf, "\n "),
- bch2_inode_unpacked_to_text(&buf, target),
- buf.buf)))
- goto err;
-
- struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
- ret = bkey_err(bp_dirent);
- if (ret && !bch2_err_matches(ret, ENOENT))
- goto err;
-
- bool backpointer_exists = !ret;
- ret = 0;
-
- if (fsck_err_on(!backpointer_exists,
- trans, inode_wrong_backpointer,
- "inode %llu:%u has wrong backpointer:\n"
- "got %llu:%llu\n"
- "should be %llu:%llu",
- target->bi_inum, target->bi_snapshot,
- target->bi_dir,
- target->bi_dir_offset,
- d.k->p.inode,
- d.k->p.offset)) {
- target->bi_dir = d.k->p.inode;
- target->bi_dir_offset = d.k->p.offset;
- ret = __bch2_fsck_write_inode(trans, target);
- goto out;
- }
-
- bch2_bkey_val_to_text(&buf, c, d.s_c);
- prt_newline(&buf);
- if (backpointer_exists)
- bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
-
- if (fsck_err_on(backpointer_exists &&
- (S_ISDIR(target->bi_mode) ||
- target->bi_subvol),
- trans, inode_dir_multiple_links,
- "%s %llu:%u with multiple links\n%s",
- S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
- target->bi_inum, target->bi_snapshot, buf.buf)) {
- ret = __remove_dirent(trans, d.k->p);
- goto out;
- }
-
- /*
- * hardlinked file with nlink 0:
- * We're just adjusting nlink here so check_nlinks() will pick
- * it up, it ignores inodes with nlink 0
- */
- if (fsck_err_on(backpointer_exists && !target->bi_nlink,
- trans, inode_multiple_links_but_nlink_0,
- "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
- target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
- target->bi_nlink++;
- target->bi_flags &= ~BCH_INODE_unlinked;
- ret = __bch2_fsck_write_inode(trans, target);
- if (ret)
- goto err;
- }
-out:
-err:
-fsck_err:
- bch2_trans_iter_exit(trans, &bp_iter);
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
-noinline_for_stack
-static int check_dirent_target(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c_dirent d,
- struct bch_inode_unpacked *target)
-{
- struct bch_fs *c = trans->c;
- struct bkey_i_dirent *n;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- ret = check_dirent_inode_dirent(trans, iter, d, target);
- if (ret)
- goto err;
-
- if (fsck_err_on(d.v->d_type != inode_d_type(target),
- trans, dirent_d_type_wrong,
- "incorrect d_type: got %s, should be %s:\n%s",
- bch2_d_type_str(d.v->d_type),
- bch2_d_type_str(inode_d_type(target)),
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
- n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
- ret = PTR_ERR_OR_ZERO(n);
- if (ret)
- goto err;
-
- bkey_reassemble(&n->k_i, d.s_c);
- n->v.d_type = inode_d_type(target);
- if (n->v.d_type == DT_SUBVOL) {
- n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
- n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
- } else {
- n->v.d_inum = cpu_to_le64(target->bi_inum);
- }
-
- ret = bch2_trans_update(trans, iter, &n->k_i, 0);
- if (ret)
- goto err;
-
- d = dirent_i_to_s_c(n);
- }
-err:
-fsck_err:
- printbuf_exit(&buf);
- bch_err_fn(c, ret);
- return ret;
-}
-
/* find a subvolume that's a descendent of @snapshot: */
static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
{
@@ -2263,7 +2145,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
if (!new_parent_subvol) {
bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
- return -BCH_ERR_fsck_repair_unimplemented;
+ return bch_err_throw(c, fsck_repair_unimplemented);
}
struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
@@ -2280,41 +2162,52 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
0, subvolume);
ret = bkey_err(s.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
- return ret;
+ goto err;
if (ret) {
if (fsck_err(trans, dirent_to_missing_subvol,
"dirent points to missing subvolume\n%s",
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
- return __remove_dirent(trans, d.k->p);
+ return bch2_fsck_remove_dirent(trans, d.k->p);
ret = 0;
goto out;
}
- if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
- trans, subvol_fs_path_parent_wrong,
- "subvol with wrong fs_path_parent, should be be %u\n%s",
- parent_subvol,
- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
- struct bkey_i_subvolume *n =
- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
- ret = PTR_ERR_OR_ZERO(n);
+ if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) {
+ printbuf_reset(&buf);
+
+ prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n",
+ parent_subvol);
+
+ ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset,
+ le64_to_cpu(s.v->inode) }, &buf);
if (ret)
goto err;
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, s.s_c);
- n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) {
+ struct bkey_i_subvolume *n =
+ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+ }
}
u64 target_inum = le64_to_cpu(s.v->inode);
u32 target_snapshot = le32_to_cpu(s.v->snapshot);
- ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
+ ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot,
+ &subvol_root, 0);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
if (ret) {
bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -2330,7 +2223,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
goto err;
}
- ret = check_dirent_target(trans, iter, d, &subvol_root);
+ ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true);
if (ret)
goto err;
out:
@@ -2346,7 +2239,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bch_hash_info *hash_info,
struct inode_walker *dir,
struct inode_walker *target,
- struct snapshots_seen *s)
+ struct snapshots_seen *s,
+ bool *need_second_pass)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
@@ -2367,7 +2261,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto out;
if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) {
- ret = check_subdir_count(trans, dir);
+ ret = check_subdir_dirents_count(trans, dir);
if (ret)
goto err;
}
@@ -2381,14 +2275,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- if (!i)
+ if (!i || i->whiteout)
goto out;
if (dir->first_this_inode)
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k);
+ hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL;
+
+ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info,
+ iter, k, need_second_pass);
if (ret < 0)
goto err;
if (ret) {
@@ -2402,6 +2299,44 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ /* check casefold */
+ if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding,
+ trans, dirent_casefold_mismatch,
+ "dirent casefold does not match dir casefold\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
+ subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_parent_subvol)
+ : 0,
+ };
+ u64 target = d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol)
+ : le64_to_cpu(d.v->d_inum);
+ struct qstr name = bch2_dirent_get_name(d);
+
+ struct bkey_i_dirent *new_d =
+ bch2_dirent_create_key(trans, hash_info, dir_inum,
+ d.v->d_type, &name, NULL, target);
+ ret = PTR_ERR_OR_ZERO(new_d);
+ if (ret)
+ goto out;
+
+ new_d->k.p.inode = d.k->p.inode;
+ new_d->k.p.snapshot = d.k->p.snapshot;
+
+ struct btree_iter dup_iter = {};
+ ret = bch2_hash_delete_at(trans,
+ bch2_dirent_hash_desc, hash_info, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_str_hash_repair_key(trans, s,
+ &bch2_dirent_hash_desc, hash_info,
+ iter, bkey_i_to_s_c(&new_d->k_i),
+ &dup_iter, bkey_s_c_null,
+ need_second_pass);
+ goto out;
+ }
+
if (d.v->d_type == DT_SUBVOL) {
ret = check_dirent_to_subvol(trans, iter, d);
if (ret)
@@ -2417,13 +2352,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
- ret = __remove_dirent(trans, d.k->p);
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
if (ret)
goto err;
}
darray_for_each(target->inodes, i) {
- ret = check_dirent_target(trans, iter, d, &i->inode);
+ ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true);
if (ret)
goto err;
}
@@ -2441,7 +2376,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
BTREE_ID_dirents,
SPOS(k.k->p.inode, k.k->p.offset, *i),
BTREE_ITER_intent);
- ret = bch2_btree_iter_traverse(&delete_iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &delete_iter) ?:
bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
hash_info,
&delete_iter,
@@ -2457,14 +2392,15 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
goto err;
- if (d.v->d_type == DT_DIR)
- for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+ for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) {
+ if (d.v->d_type == DT_DIR)
i->count++;
+ i->i_size += bkey_bytes(d.k);
+ }
out:
err:
fsck_err:
printbuf_exit(&buf);
- bch_err_fn(c, ret);
return ret;
}
@@ -2478,16 +2414,31 @@ int bch2_check_dirents(struct bch_fs *c)
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
+ bool need_second_pass = false, did_second_pass = false;
+ int ret;
snapshots_seen_init(&s);
-
- int ret = bch2_trans_run(c,
- for_each_btree_key(trans, iter, BTREE_ID_dirents,
+again:
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
POS(BCACHEFS_ROOT_INO, 0),
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s,
+ &need_second_pass)) ?:
check_subdir_count_notnested(trans, &dir));
+ if (!ret && need_second_pass && !did_second_pass) {
+ bch_info(c, "check_dirents requires second pass");
+ swap(did_second_pass, need_second_pass);
+ goto again;
+ }
+
+ if (!ret && need_second_pass) {
+ bch_err(c, "dirents not repairing");
+ ret = -EINVAL;
+ }
+
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
@@ -2501,16 +2452,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker *inode)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
- int ret;
- ret = bch2_check_key_has_snapshot(trans, iter, k);
+ int ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
return ret;
if (ret)
return 0;
- i = walk_inode(trans, inode, k);
+ struct inode_walker_entry *i = walk_inode(trans, inode, k);
ret = PTR_ERR_OR_ZERO(i);
if (ret)
return ret;
@@ -2519,16 +2468,16 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;
- if (!i)
+ if (!i || i->whiteout)
return 0;
if (inode->first_this_inode)
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k);
- bch_err_fn(c, ret);
- return ret;
+ bool need_second_pass = false;
+ return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info,
+ iter, k, &need_second_pass);
}
/*
@@ -2588,7 +2537,8 @@ static int check_root_trans(struct btree_trans *trans)
goto err;
}
- ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
+ ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot,
+ &root_inode, 0);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -2620,8 +2570,6 @@ int bch2_check_root(struct bch_fs *c)
return ret;
}
-typedef DARRAY(u32) darray_u32;
-
static bool darray_u32_has(darray_u32 *d, u32 v)
{
darray_for_each(*d, i)
@@ -2641,6 +2589,11 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (k.k->type != KEY_TYPE_subvolume)
return 0;
+ subvol_inum start = {
+ .subvol = k.k->p.offset,
+ .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode),
+ };
+
while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
ret = darray_push(&subvol_path, k.k->p.offset);
if (ret)
@@ -2658,7 +2611,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
u32 parent = le32_to_cpu(s.v->fs_path_parent);
if (darray_u32_has(&subvol_path, parent)) {
- if (fsck_err(c, subvol_loop, "subvolume loop"))
+ printbuf_reset(&buf);
+ prt_printf(&buf, "subvolume loop: ");
+
+ ret = bch2_inum_to_path(trans, start, &buf);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, subvol_loop, "%s", buf.buf))
ret = reattach_subvol(trans, s);
break;
}
@@ -2666,7 +2626,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
bch2_trans_iter_exit(trans, &parent_iter);
bch2_trans_iter_init(trans, &parent_iter,
BTREE_ID_subvolumes, POS(0, parent), 0);
- k = bch2_btree_iter_peek_slot(&parent_iter);
+ k = bch2_btree_iter_peek_slot(trans, &parent_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -2674,7 +2634,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
trans, subvol_unreachable,
"unreachable subvolume %s",
- (bch2_bkey_val_to_text(&buf, c, s.s_c),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, s.s_c),
buf.buf))) {
ret = reattach_subvol(trans, s);
break;
@@ -2699,19 +2660,13 @@ int bch2_check_subvolume_structure(struct bch_fs *c)
return ret;
}
-struct pathbuf_entry {
- u64 inum;
- u32 snapshot;
-};
-
-typedef DARRAY(struct pathbuf_entry) pathbuf;
-
-static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p,
+static int bch2_bi_depth_renumber_one(struct btree_trans *trans,
+ u64 inum, u32 snapshot,
u32 new_depth)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
- SPOS(0, p->inum, p->snapshot), 0);
+ SPOS(0, inum, snapshot), 0);
struct bch_inode_unpacked inode;
int ret = bkey_err(k) ?:
@@ -2730,14 +2685,15 @@ err:
return ret;
}
-static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth)
+static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path,
+ u32 snapshot, u32 new_bi_depth)
{
u32 restart_count = trans->restart_count;
int ret = 0;
darray_for_each_reverse(*path, i) {
ret = nested_lockrestart_do(trans,
- bch2_bi_depth_renumber_one(trans, i, new_bi_depth));
+ bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth));
bch_err_fn(trans->c, ret);
if (ret)
break;
@@ -2748,37 +2704,36 @@ static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32
return ret ?: trans_was_restarted(trans, restart_count);
}
-static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
-{
- darray_for_each(*p, i)
- if (i->inum == inum &&
- i->snapshot == snapshot)
- return true;
- return false;
-}
-
static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
struct btree_iter inode_iter = {};
- pathbuf path = {};
+ darray_u64 path = {};
struct printbuf buf = PRINTBUF;
u32 snapshot = inode_k.k->p.snapshot;
bool redo_bi_depth = false;
u32 min_bi_depth = U32_MAX;
int ret = 0;
+ struct bpos start = inode_k.k->p;
+
struct bch_inode_unpacked inode;
ret = bch2_inode_unpack(inode_k, &inode);
if (ret)
return ret;
- while (!inode.bi_subvol) {
+ /*
+ * If we're running full fsck, check_dirents() will have already ran,
+ * and we shouldn't see any missing backpointers here - otherwise that's
+ * handled separately, by check_unreachable_inodes
+ */
+ while (!inode.bi_subvol &&
+ bch2_inode_has_backpointer(&inode)) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
- u32 parent_snapshot = snapshot;
- d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
+ d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot));
ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
goto out;
@@ -2796,15 +2751,10 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
bch2_trans_iter_exit(trans, &dirent_iter);
- ret = darray_push(&path, ((struct pathbuf_entry) {
- .inum = inode.bi_inum,
- .snapshot = snapshot,
- }));
+ ret = darray_push(&path, inode.bi_inum);
if (ret)
return ret;
- snapshot = parent_snapshot;
-
bch2_trans_iter_exit(trans, &inode_iter);
inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
SPOS(0, inode.bi_dir, snapshot), 0);
@@ -2826,22 +2776,28 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
break;
inode = parent_inode;
- snapshot = inode_k.k->p.snapshot;
redo_bi_depth = true;
- if (path_is_dup(&path, inode.bi_inum, snapshot)) {
- /* XXX print path */
- bch_err(c, "directory structure loop");
+ if (darray_find(path, inode.bi_inum)) {
+ printbuf_reset(&buf);
+ prt_printf(&buf, "directory structure loop in snapshot %u: ",
+ snapshot);
+
+ ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf);
+ if (ret)
+ goto out;
- darray_for_each(path, i)
- pr_err("%llu:%u", i->inum, i->snapshot);
- pr_err("%llu:%u", inode.bi_inum, snapshot);
+ if (c->opts.verbose) {
+ prt_newline(&buf);
+ darray_for_each(path, i)
+ prt_printf(&buf, "%llu ", *i);
+ }
- if (fsck_err(trans, dir_loop, "directory structure loop")) {
+ if (fsck_err(trans, dir_loop, "%s", buf.buf)) {
ret = remove_backpointer(trans, &inode);
bch_err_msg(c, ret, "removing dirent");
if (ret)
- break;
+ goto out;
ret = reattach_inode(trans, &inode);
bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
@@ -2855,7 +2811,7 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k)
min_bi_depth = 0;
if (redo_bi_depth)
- ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth);
+ ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth);
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -2872,7 +2828,7 @@ fsck_err:
int bch2_check_directory_structure(struct bch_fs *c)
{
int ret = bch2_trans_run(c,
- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_intent|
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots, k,
@@ -2911,7 +2867,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t,
if (!d) {
bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
new_size);
- return -BCH_ERR_ENOMEM_fsck_add_nlink;
+ return bch_err_throw(c, ENOMEM_fsck_add_nlink);
}
if (t->d)
@@ -3236,7 +3192,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
- darray_str(devs) = {};
+ darray_const_str devs = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -3277,7 +3233,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr);
+ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false);
if (!IS_ERR(optstr))
kfree(optstr);
@@ -3294,7 +3250,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops);
- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+ thr->c = bch2_fs_open(&devs, &thr->opts);
if (!IS_ERR(thr->c) &&
thr->c->opts.errors == BCH_ON_ERROR_panic)
@@ -3331,19 +3287,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
c->opts.fix_errors = FSCK_FIX_ask;
c->opts.fsck = true;
- set_bit(BCH_FS_fsck_running, &c->flags);
+ set_bit(BCH_FS_in_fsck, &c->flags);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
- int ret = bch2_run_online_recovery_passes(c);
+ int ret = bch2_run_online_recovery_passes(c, ~0ULL);
- clear_bit(BCH_FS_fsck_running, &c->flags);
+ clear_bit(BCH_FS_in_fsck, &c->flags);
bch_err_fn(c, ret);
c->stdio = NULL;
c->stdio_filter = NULL;
c->opts.fix_errors = old_fix_errors;
- up(&c->online_fsck_mutex);
+ up(&c->recovery.run_lock);
bch2_ro_ref_put(c);
return ret;
}
@@ -3367,7 +3322,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
if (!bch2_ro_ref_tryget(c))
return -EROFS;
- if (down_trylock(&c->online_fsck_mutex)) {
+ if (down_trylock(&c->recovery.run_lock)) {
bch2_ro_ref_put(c);
return -EAGAIN;
}
@@ -3385,7 +3340,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
ret = PTR_ERR_OR_ZERO(optstr) ?:
- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr);
+ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false);
if (!IS_ERR(optstr))
kfree(optstr);
@@ -3399,7 +3354,7 @@ err:
bch_err_fn(c, ret);
if (thr)
bch2_fsck_thread_exit(&thr->thr);
- up(&c->online_fsck_mutex);
+ up(&c->recovery.run_lock);
bch2_ro_ref_put(c);
}
return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index 574948278cd4..e5fe7cf7b251 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -4,6 +4,12 @@
#include "str_hash.h"
+/* recoverds snapshot IDs of overwrites at @pos */
+struct snapshots_seen {
+ struct bpos pos;
+ snapshot_id_list ids;
+};
+
int bch2_fsck_update_backpointers(struct btree_trans *,
struct snapshots_seen *,
const struct bch_hash_desc,
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 04ec05206f8c..ef4cc7395b86 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -14,6 +14,7 @@
#include "extent_update.h"
#include "fs.h"
#include "inode.h"
+#include "namei.h"
#include "opts.h"
#include "str_hash.h"
#include "snapshot.h"
@@ -37,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = {
#undef x
static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+static int may_delete_deleted_inum(struct btree_trans *, subvol_inum);
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
@@ -240,6 +242,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
u64 v[2];
unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@@ -284,13 +287,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
- unpacked->bi_snapshot = k.k->p.snapshot;
-
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= 0;
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
@@ -309,6 +311,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@@ -326,8 +329,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
- unpacked->bi_snapshot = k.k->p.snapshot;
-
return likely(k.k->type == KEY_TYPE_inode_v3)
? bch2_inode_unpack_v3(k, unpacked)
: bch2_inode_unpack_slowpath(k, unpacked);
@@ -367,6 +368,82 @@ err:
return ret;
}
+int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans,
+ u64 inode_nr, u32 snapshot,
+ struct bch_inode_unpacked *inode,
+ unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, inode_nr, snapshot), flags);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+ ret = bkey_is_inode(k.k)
+ ? bch2_inode_unpack(k, inode)
+ : -BCH_ERR_ENOENT_inode;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+ subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ struct btree_iter iter;
+ int ret;
+
+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+ if (!ret)
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
+}
+
+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
+ struct bch_inode_unpacked *root)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k)) {
+ ret = bch2_inode_unpack(k, root);
+ goto out;
+ }
+ }
+ /* We're only called when we know we have an inode for @inum */
+ BUG_ON(!ret);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
int bch2_inode_write_flags(struct btree_trans *trans,
struct btree_iter *iter,
struct bch_inode_unpacked *inode,
@@ -731,10 +808,9 @@ int bch2_trigger_inode(struct btree_trans *trans,
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
- s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
- if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) {
- struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes };
- int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc);
+ s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) };
+ if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) {
+ int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes);
if (ret)
return ret;
}
@@ -833,7 +909,8 @@ void bch2_inode_init_early(struct bch_fs *c,
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
}
-void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+void bch2_inode_init_late(struct bch_fs *c,
+ struct bch_inode_unpacked *inode_u, u64 now,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{
@@ -857,6 +934,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
BCH_INODE_OPTS()
#undef x
}
+
+ if (!S_ISDIR(mode))
+ inode_u->bi_casefold = 0;
+
+ if (bch2_inode_casefold(c, inode_u))
+ inode_u->bi_flags |= BCH_INODE_has_case_insensitive;
}
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
@@ -864,23 +947,10 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
struct bch_inode_unpacked *parent)
{
bch2_inode_init_early(c, inode_u);
- bch2_inode_init_late(inode_u, bch2_current_time(c),
+ bch2_inode_init_late(c, inode_u, bch2_current_time(c),
uid, gid, mode, rdev, parent);
}
-static inline u32 bkey_generation(struct bkey_s_c k)
-{
- switch (k.k->type) {
- case KEY_TYPE_inode:
- case KEY_TYPE_inode_v2:
- BUG();
- case KEY_TYPE_inode_generation:
- return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
- default:
- return 0;
- }
-}
-
static struct bkey_i_inode_alloc_cursor *
bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max)
{
@@ -954,7 +1024,7 @@ int bch2_inode_create(struct btree_trans *trans,
BTREE_ITER_intent);
struct bkey_s_c k;
again:
- while ((k = bch2_btree_iter_peek(iter)).k &&
+ while ((k = bch2_btree_iter_peek(trans, iter)).k &&
!(ret = bkey_err(k)) &&
bkey_lt(k.k->p, POS(0, max))) {
if (pos < iter->pos.offset)
@@ -965,14 +1035,14 @@ again:
* we've found just one:
*/
pos = iter->pos.offset + 1;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
}
if (!ret && pos < max)
goto found_slot;
if (!ret && start == min)
- ret = -BCH_ERR_ENOSPC_inode_create;
+ ret = bch_err_throw(trans->c, ENOSPC_inode_create);
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -981,12 +1051,12 @@ again:
/* Retry from start */
pos = start = min;
- bch2_btree_iter_set_pos(iter, POS(0, pos));
+ bch2_btree_iter_set_pos(trans, iter, POS(0, pos));
le32_add_cpu(&cursor->v.gen, 1);
goto again;
found_slot:
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
- k = bch2_btree_iter_peek_slot(iter);
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot));
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret) {
bch2_trans_iter_exit(trans, iter);
@@ -1023,9 +1093,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- k = bch2_btree_iter_peek_max(&iter, end);
+ k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1056,24 +1126,28 @@ err:
int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_s_c k;
u32 snapshot;
int ret;
+ ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum));
+ if (ret)
+ goto err2;
+
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
* delete:
*
- * XXX: the dirent could ideally would delete whiteouts when they're no
+ * XXX: the dirent code ideally would delete whiteouts when they're no
* longer needed
*/
ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
if (ret)
- goto err;
+ goto err2;
retry:
bch2_trans_begin(trans);
@@ -1092,7 +1166,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum.inum, snapshot);
- ret = -EIO;
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -1113,38 +1187,6 @@ err2:
return ret;
}
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
- subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- struct btree_iter iter;
- int ret;
-
- ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
- struct bch_inode_unpacked *inode)
-{
- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
-}
-
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
{
if (bi->bi_flags & BCH_INODE_unlinked)
@@ -1198,6 +1240,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
opts->_name##_from_inode = true; \
} else { \
opts->_name = c->opts._name; \
+ opts->_name##_from_inode = false; \
}
BCH_INODE_OPTS()
#undef x
@@ -1217,10 +1260,48 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
return 0;
}
+int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *bi, unsigned v)
+{
+ struct bch_fs *c = trans->c;
+
+#ifndef CONFIG_UNICODE
+ bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE");
+ return -EOPNOTSUPP;
+#endif
+
+ if (c->opts.casefold_disabled)
+ return -EOPNOTSUPP;
+
+ int ret = 0;
+ /* Not supported on individual files. */
+ if (!S_ISDIR(bi->bi_mode))
+ return -EOPNOTSUPP;
+
+ /*
+ * Make sure the dir is empty, as otherwise we'd need to
+ * rehash everything and update the dirent keys.
+ */
+ ret = bch2_empty_dir_trans(trans, inum);
+ if (ret < 0)
+ return ret;
+
+ ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding);
+ if (ret)
+ return ret;
+
+ bch2_check_set_feature(c, BCH_FEATURE_casefolding);
+
+ bi->bi_casefold = v + 1;
+ bi->bi_fields_set |= BIT(Inode_opt_casefold);
+
+ return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi);
+}
+
static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bkey_i_inode_generation delete;
struct bch_inode_unpacked inode_u;
struct bkey_s_c k;
@@ -1255,7 +1336,7 @@ retry:
bch2_fs_inconsistent(c,
"inode %llu:%u not found when deleting",
inum, snapshot);
- ret = -EIO;
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -1319,10 +1400,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
}
-static int may_delete_deleted_inode(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos pos,
- bool *need_another_pass)
+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos,
+ bool from_deleted_inodes)
{
struct bch_fs *c = trans->c;
struct btree_iter inode_iter;
@@ -1336,12 +1415,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (ret)
return ret;
- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
- if (fsck_err_on(!bkey_is_inode(k.k),
+ ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode);
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_missing,
"nonexistent inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
ret = bch2_inode_unpack(k, &inode);
if (ret)
@@ -1349,7 +1430,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (S_ISDIR(inode.bi_mode)) {
ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
- if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+ if (fsck_err_on(from_deleted_inodes &&
+ bch2_err_matches(ret, ENOTEMPTY),
trans, deleted_inode_is_dir,
"non empty directory %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
@@ -1358,17 +1440,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked),
+ ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked);
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_not_unlinked,
"non-deleted inode %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
- if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot)
+ ? 0 : bch_err_throw(c, inode_has_child_snapshot);
+
+ if (fsck_err_on(from_deleted_inodes && ret,
trans, deleted_inode_has_child_snapshots,
"inode with child snapshots %llu:%u in deleted_inodes btree",
pos.offset, pos.snapshot))
goto delete;
+ if (ret)
+ goto out;
ret = bch2_inode_has_child_snapshots(trans, k.k->p);
if (ret < 0)
@@ -1385,19 +1475,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
if (ret)
goto out;
}
+
+ if (!from_deleted_inodes) {
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ bch_err_throw(c, inode_has_child_snapshot);
+ goto out;
+ }
+
goto delete;
}
- if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
- !fsck_err(trans, deleted_inode_but_clean,
- "filesystem marked as clean but have deleted inode %llu:%u",
- pos.offset, pos.snapshot)) {
- ret = 0;
- goto out;
- }
+ if (from_deleted_inodes) {
+ if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
+ !fsck_err(trans, deleted_inode_but_clean,
+ "filesystem marked as clean but have deleted inode %llu:%u",
+ pos.offset, pos.snapshot)) {
+ ret = 0;
+ goto out;
+ }
- ret = 1;
+ ret = 1;
+ }
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1408,12 +1507,19 @@ delete:
goto out;
}
+static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum)
+{
+ u32 snapshot;
+
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false);
+}
+
int bch2_delete_dead_inodes(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- bool need_another_pass;
int ret;
-again:
+
/*
* if we ran check_inodes() unlinked inodes will have already been
* cleaned up but the write buffer will be out of sync; therefore we
@@ -1423,8 +1529,6 @@ again:
if (ret)
goto err;
- need_another_pass = false;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
@@ -1434,7 +1538,7 @@ again:
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
- ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ ret = may_delete_deleted_inode(trans, k.k->p, true);
if (ret > 0) {
bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u",
k.k->p.offset, k.k->p.snapshot);
@@ -1455,10 +1559,8 @@ again:
ret;
}));
-
- if (!ret && need_another_pass)
- goto again;
err:
bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index d2e134528f0e..b8ec3e628d90 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans,
subvol_inum inum, unsigned flags)
{
return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
- return ret;
}
+int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32,
+ struct bch_inode_unpacked *, unsigned);
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+ subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+ struct bch_inode_unpacked *);
+
+int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum,
+ struct bch_inode_unpacked *root);
+
int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
@@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
void bch2_inode_init_early(struct bch_fs *,
struct bch_inode_unpacked *);
-void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
+void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
@@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *,
int bch2_inode_rm(struct bch_fs *, subvol_inum);
-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
- subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
- struct bch_inode_unpacked *);
-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
- struct bch_inode_unpacked *);
-
#define inode_opt_get(_c, _inode, _name) \
((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name)
@@ -243,6 +246,19 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)
}
}
+static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
+{
+ /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
+ return bi->bi_casefold
+ ? bi->bi_casefold - 1
+ : c->opts.casefold;
+}
+
+static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi)
+{
+ return bi->bi_dir || bi->bi_dir_offset;
+}
+
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)
@@ -272,25 +288,29 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
int bch2_inode_nlink_inc(struct bch_inode_unpacked *);
void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
-static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode)
-{
- bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset;
-
- return S_ISDIR(inode->bi_mode) ||
- (!inode->bi_nlink && inode_has_bp);
-}
-
struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
struct bch_inode_unpacked *);
-int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *);
+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
+int bch2_inode_set_casefold(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *, unsigned);
+
+#include "rebalance.h"
static inline struct bch_extent_rebalance
bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode)
{
struct bch_io_opts io_opts;
bch2_inode_opts_get(&io_opts, c, inode);
- return io_opts_to_rebalance_opts(&io_opts);
+ return io_opts_to_rebalance_opts(c, &io_opts);
+}
+
+#define BCACHEFS_ROOT_SUBVOL_INUM \
+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
+
+static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b)
+{
+ return a.subvol == b.subvol && a.inum == b.inum;
}
int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index b99a5bf1a75e..1f00938b1bdc 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -103,7 +103,8 @@ struct bch_inode_generation {
x(bi_parent_subvol, 32) \
x(bi_nocow, 8) \
x(bi_depth, 32) \
- x(bi_inodes_32bit, 8)
+ x(bi_inodes_32bit, 8) \
+ x(bi_casefold, 8)
/* subset of BCH_INODE_FIELDS */
#define BCH_INODE_OPTS() \
@@ -117,7 +118,8 @@ struct bch_inode_generation {
x(background_target, 16) \
x(erasure_code, 16) \
x(nocow, 8) \
- x(inodes_32bit, 8)
+ x(inodes_32bit, 8) \
+ x(casefold, 8)
enum inode_opt_id {
#define x(name, ...) \
@@ -127,6 +129,10 @@ enum inode_opt_id {
Inode_opt_nr,
};
+/*
+ * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive -
+ * for overlayfs
+ */
#define BCH_INODE_FLAGS() \
x(sync, 0) \
x(immutable, 1) \
@@ -137,7 +143,8 @@ enum inode_opt_id {
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
x(backptr_untrusted, 8) \
- x(has_child_snapshot, 9)
+ x(has_child_snapshot, 9) \
+ x(has_case_insensitive, 10)
/* bits 20+ reserved for packed fields below: */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 5353979117b0..07023667a475 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
bch2_bkey_buf_init(&new);
closure_init_stack(&cl);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_slot(trans, iter);
ret = bkey_err(k);
if (ret)
return ret;
@@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
opts.data_replicas,
BCH_WATERMARK_normal, 0, &cl, &wp);
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
if (ret)
goto err;
@@ -115,7 +115,8 @@ err:
bch2_increment_clock(c, sectors_allocated, WRITE);
if (should_print_err(ret)) {
struct printbuf buf = PRINTBUF;
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9);
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9));
prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
@@ -134,6 +135,33 @@ err_noprint:
return ret;
}
+/* For fsck */
+int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end)
+{
+ u32 restart_count = trans->restart_count;
+ struct bch_fs *c = trans->c;
+ struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0);
+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+ struct bkey_i delete;
+
+ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ start, end, 0, k,
+ &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bkey_init(&delete.k);
+ delete.k.p = iter.pos;
+
+ /* create the biggest key we can */
+ bch2_key_resize(&delete.k, max_sectors);
+ bch2_cut_back(end, &delete);
+
+ bch2_extent_trim_atomic(trans, &iter, &delete) ?:
+ bch2_trans_update(trans, &iter, &delete, 0);
+ }));
+
+ bch2_disk_reservation_put(c, &disk_res);
+ return ret ?: trans_was_restarted(trans, restart_count);
+}
+
/*
* Returns -BCH_ERR_transacton_restart if we had to drop locks:
*/
@@ -163,12 +191,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
/*
* peek_max() doesn't have ideal semantics for extents:
*/
- k = bch2_btree_iter_peek_max(iter, end_pos);
+ k = bch2_btree_iter_peek_max(trans, iter, end_pos);
if (!k.k)
break;
@@ -229,7 +257,7 @@ static int truncate_set_isize(struct btree_trans *trans,
u64 new_i_size,
bool warn)
{
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct bch_inode_unpacked inode_u;
int ret;
@@ -398,7 +426,7 @@ case LOGGED_OP_FINSERT_start:
if (ret)
goto err;
} else {
- bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+ bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset));
ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -424,12 +452,12 @@ case LOGGED_OP_FINSERT_shift_extents:
if (ret)
goto btree_err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
- bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot));
k = insert
- ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0))
- : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX));
+ ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0))
+ : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX));
if ((ret = bkey_err(k)))
goto btree_err;
diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h
index 9cb44a7c43c1..b93e4d4b3c0c 100644
--- a/fs/bcachefs/io_misc.h
+++ b/fs/bcachefs/io_misc.h
@@ -5,6 +5,8 @@
int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
u64, struct bch_io_opts, s64 *,
struct write_point_specifier);
+
+int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos);
int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
subvol_inum, u64, s64 *);
int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 8c7b2d3d779d..e0874ad9a6cf 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -9,6 +9,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
@@ -17,6 +18,7 @@
#include "data_update.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "io_read.h"
#include "io_misc.h"
@@ -25,8 +27,22 @@
#include "subvolume.h"
#include "trace.h"
+#include <linux/moduleparam.h>
+#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_read_corrupt_ratio;
+module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(read_corrupt_ratio, "");
+#endif
+
+static bool bch2_poison_extents_on_checksum_error;
+module_param_named(poison_extents_on_checksum_error,
+ bch2_poison_extents_on_checksum_error, bool, 0644);
+MODULE_PARM_DESC(poison_extents_on_checksum_error,
+ "Extents with checksum errors are marked as poisoned - unsafe without read fua support");
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static bool bch2_target_congested(struct bch_fs *c, u16 target)
@@ -40,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
if (!target)
return false;
- rcu_read_lock();
+ guard(rcu)();
devs = bch2_target_to_mask(c, target) ?:
&c->rw_devs[BCH_DATA_user];
@@ -57,9 +73,8 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
total += max(congested, 0LL);
nr++;
}
- rcu_read_unlock();
- return bch2_rand_range(nr * CONGESTED_MAX) < total;
+ return get_random_u32_below(nr * CONGESTED_MAX) < total;
}
#else
@@ -73,17 +88,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
/* Cache promotion on read */
-struct promote_op {
- struct rcu_head rcu;
- u64 start_time;
-
- struct rhash_head hash;
- struct bpos pos;
-
- struct data_update write;
- struct bio_vec bi_inline_vecs[]; /* must be last */
-};
-
static const struct rhashtable_params bch_promote_params = {
.head_offset = offsetof(struct promote_op, hash),
.key_offset = offsetof(struct promote_op, pos),
@@ -96,6 +100,33 @@ static inline bool have_io_error(struct bch_io_failures *failed)
return failed && failed->nr;
}
+static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio)
+{
+ EBUG_ON(rbio->split);
+
+ return rbio->data_update
+ ? container_of(rbio, struct data_update, rbio)
+ : NULL;
+}
+
+static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev)
+{
+ struct data_update *u = rbio_data_update(orig);
+ if (!u)
+ return false;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k));
+ unsigned i = 0;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (ptr->dev == dev &&
+ u->data_opts.rewrite_ptrs & BIT(i))
+ return true;
+ i++;
+ }
+
+ return false;
+}
+
static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
struct bpos pos,
struct bch_io_opts opts,
@@ -105,187 +136,178 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
if (!have_io_error(failed)) {
BUG_ON(!opts.promote_target);
- if (!(flags & BCH_READ_MAY_PROMOTE))
- return -BCH_ERR_nopromote_may_not;
+ if (!(flags & BCH_READ_may_promote))
+ return bch_err_throw(c, nopromote_may_not);
if (bch2_bkey_has_target(c, k, opts.promote_target))
- return -BCH_ERR_nopromote_already_promoted;
+ return bch_err_throw(c, nopromote_already_promoted);
if (bkey_extent_is_unwritten(k))
- return -BCH_ERR_nopromote_unwritten;
+ return bch_err_throw(c, nopromote_unwritten);
if (bch2_target_congested(c, opts.promote_target))
- return -BCH_ERR_nopromote_congested;
+ return bch_err_throw(c, nopromote_congested);
}
if (rhashtable_lookup_fast(&c->promote_table, &pos,
bch_promote_params))
- return -BCH_ERR_nopromote_in_flight;
+ return bch_err_throw(c, nopromote_in_flight);
return 0;
}
-static void promote_free(struct bch_fs *c, struct promote_op *op)
+static noinline void promote_free(struct bch_read_bio *rbio)
{
- int ret;
+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
+ struct bch_fs *c = rbio->c;
+
+ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params);
+ BUG_ON(ret);
+
+ async_object_list_del(c, promote, op->list_idx);
+ async_object_list_del(c, rbio, rbio->list_idx);
bch2_data_update_exit(&op->write);
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
static void promote_done(struct bch_write_op *wop)
{
- struct promote_op *op =
- container_of(wop, struct promote_op, write.op);
- struct bch_fs *c = op->write.op.c;
+ struct promote_op *op = container_of(wop, struct promote_op, write.op);
+ struct bch_fs *c = op->write.rbio.c;
- bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
- op->start_time);
- promote_free(c, op);
+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time);
+ promote_free(&op->write.rbio);
}
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+static void promote_start_work(struct work_struct *work)
{
- struct bio *bio = &op->write.op.wbio.bio;
+ struct promote_op *op = container_of(work, struct promote_op, work);
- trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+ bch2_data_update_read_done(&op->write);
+}
- /* we now own pages: */
- BUG_ON(!rbio->bounce);
- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+static noinline void promote_start(struct bch_read_bio *rbio)
+{
+ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio);
- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
- swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+ trace_and_count(op->write.op.c, io_read_promote, &rbio->bio);
- bch2_data_update_read_done(&op->write, rbio->pick.crc);
+ INIT_WORK(&op->work, promote_start_work);
+ queue_work(rbio->c->write_ref_wq, &op->work);
}
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
- enum btree_id btree_id,
- struct bkey_s_c k,
- struct bpos pos,
- struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
- unsigned sectors,
- struct bch_read_bio **rbio,
- struct bch_io_failures *failed)
+static struct bch_read_bio *__promote_alloc(struct btree_trans *trans,
+ enum btree_id btree_id,
+ struct bkey_s_c k,
+ struct bpos pos,
+ struct extent_ptr_decoded *pick,
+ unsigned sectors,
+ struct bch_read_bio *orig,
+ struct bch_io_failures *failed)
{
struct bch_fs *c = trans->c;
- struct promote_op *op = NULL;
- struct bio *bio;
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return ERR_PTR(-BCH_ERR_nopromote_no_writes);
+ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait };
- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
- if (!op) {
- ret = -BCH_ERR_nopromote_enomem;
- goto err;
- }
+ if (!have_io_error(failed)) {
+ update_opts.target = orig->opts.promote_target;
+ update_opts.extra_replicas = 1;
+ update_opts.write_flags |= BCH_WRITE_cached;
+ update_opts.write_flags |= BCH_WRITE_only_specified_devs;
+ } else {
+ update_opts.target = orig->opts.foreground_target;
- op->start_time = local_clock();
- op->pos = pos;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ unsigned ptr_bit = 1;
+ bkey_for_each_ptr(ptrs, ptr) {
+ if (bch2_dev_io_failures(failed, ptr->dev) &&
+ !ptr_being_rewritten(orig, ptr->dev))
+ update_opts.rewrite_ptrs |= ptr_bit;
+ ptr_bit <<= 1;
+ }
- /*
- * We don't use the mempool here because extents that aren't
- * checksummed or compressed can be too big for the mempool:
- */
- *rbio = kzalloc(sizeof(struct bch_read_bio) +
- sizeof(struct bio_vec) * pages,
- GFP_KERNEL);
- if (!*rbio) {
- ret = -BCH_ERR_nopromote_enomem;
- goto err;
+ if (!update_opts.rewrite_ptrs)
+ return NULL;
}
- rbio_init(&(*rbio)->bio, opts);
- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote))
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
- ret = -BCH_ERR_nopromote_enomem;
- goto err;
+ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL);
+ if (!op) {
+ ret = bch_err_throw(c, nopromote_enomem);
+ goto err_put;
}
- (*rbio)->bounce = true;
- (*rbio)->split = true;
- (*rbio)->kmalloc = true;
+ op->start_time = local_clock();
+ op->pos = pos;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
bch_promote_params)) {
- ret = -BCH_ERR_nopromote_in_flight;
+ ret = bch_err_throw(c, nopromote_in_flight);
goto err;
}
- bio = &op->write.op.wbio.bio;
- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
- struct data_update_opts update_opts = {};
-
- if (!have_io_error(failed)) {
- update_opts.target = opts.promote_target;
- update_opts.extra_replicas = 1;
- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED;
- } else {
- update_opts.target = opts.foreground_target;
-
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- unsigned ptr_bit = 1;
- bkey_for_each_ptr(ptrs, ptr) {
- if (bch2_dev_io_failures(failed, ptr->dev))
- update_opts.rewrite_ptrs |= ptr_bit;
- ptr_bit <<= 1;
- }
- }
+ ret = async_object_list_add(c, promote, op, &op->list_idx);
+ if (ret < 0)
+ goto err_remove_hash;
ret = bch2_data_update_init(trans, NULL, NULL, &op->write,
writepoint_hashed((unsigned long) current),
- opts,
+ &orig->opts,
update_opts,
btree_id, k);
+ op->write.type = BCH_DATA_UPDATE_promote;
/*
* possible errors: -BCH_ERR_nocow_lock_blocked,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
- if (ret) {
- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params));
- goto err;
- }
+ if (ret)
+ goto err_remove_list;
+ rbio_init_fragment(&op->write.rbio.bio, orig);
+ op->write.rbio.bounce = true;
+ op->write.rbio.promote = true;
op->write.op.end_io = promote_done;
- return op;
+ return &op->write.rbio;
+err_remove_list:
+ async_object_list_del(c, promote, op->list_idx);
+err_remove_hash:
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params));
err:
- if (*rbio)
- bio_free_pages(&(*rbio)->bio);
- kfree(*rbio);
- *rbio = NULL;
+ bio_free_pages(&op->write.op.wbio.bio);
/* We may have added to the rhashtable and thus need rcu freeing: */
kfree_rcu(op, rcu);
- bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+err_put:
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
+static struct bch_read_bio *promote_alloc(struct btree_trans *trans,
struct bvec_iter iter,
struct bkey_s_c k,
struct extent_ptr_decoded *pick,
- struct bch_io_opts opts,
unsigned flags,
- struct bch_read_bio **rbio,
+ struct bch_read_bio *orig,
bool *bounce,
bool *read_full,
struct bch_io_failures *failed)
{
+ /*
+ * We're in the retry path, but we don't know what to repair yet, and we
+ * don't want to do a promote here:
+ */
+ if (failed && !failed->nr)
+ return NULL;
+
struct bch_fs *c = trans->c;
/*
* if failed != NULL we're not actually doing a promote, we're
@@ -301,38 +323,65 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
struct bpos pos = promote_full
? bkey_start_pos(k.k)
: POS(k.k->p.inode, iter.bi_sector);
- struct promote_op *promote;
int ret;
- ret = should_promote(c, k, pos, opts, flags, failed);
+ ret = should_promote(c, k, pos, orig->opts, flags, failed);
if (ret)
goto nopromote;
- promote = __promote_alloc(trans,
- k.k->type == KEY_TYPE_reflink_v
- ? BTREE_ID_reflink
- : BTREE_ID_extents,
- k, pos, pick, opts, sectors, rbio, failed);
+ struct bch_read_bio *promote =
+ __promote_alloc(trans,
+ k.k->type == KEY_TYPE_reflink_v
+ ? BTREE_ID_reflink
+ : BTREE_ID_extents,
+ k, pos, pick, sectors, orig, failed);
+ if (!promote)
+ return NULL;
+
ret = PTR_ERR_OR_ZERO(promote);
if (ret)
goto nopromote;
*bounce = true;
*read_full = promote_full;
+
+ if (have_io_error(failed))
+ orig->self_healing = true;
+
return promote;
nopromote:
- trace_read_nopromote(c, ret);
+ trace_io_read_nopromote(c, ret);
return NULL;
}
+void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op)
+{
+ if (!op->write.read_done) {
+ prt_printf(out, "parent read: %px\n", op->write.rbio.parent);
+ printbuf_indent_add(out, 2);
+ bch2_read_bio_to_text(out, op->write.rbio.parent);
+ printbuf_indent_sub(out, 2);
+ }
+
+ bch2_data_update_to_text(out, &op->write);
+}
+
/* Read */
static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
struct bch_read_bio *rbio, struct bpos read_pos)
{
- return bch2_inum_offset_err_msg_trans(trans, out,
- (subvol_inum) { rbio->subvol, read_pos.inode },
- read_pos.offset << 9);
+ int ret = lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, out,
+ (subvol_inum) { rbio->subvol, read_pos.inode },
+ read_pos.offset << 9));
+ if (ret)
+ return ret;
+
+ if (rbio->data_update)
+ prt_str(out, "(internal move) ");
+
+ return 0;
}
static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
@@ -341,10 +390,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out,
bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos));
}
-#define READ_RETRY_AVOID 1
-#define READ_RETRY 2
-#define READ_ERR 3
-
enum rbio_context {
RBIO_CONTEXT_NULL,
RBIO_CONTEXT_HIGHPRI,
@@ -375,20 +420,27 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
BUG_ON(rbio->bounce && !rbio->split);
- if (rbio->promote)
- promote_free(rbio->c, rbio->promote);
- rbio->promote = NULL;
-
- if (rbio->bounce)
- bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+ if (rbio->have_ioref) {
+ struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
+ }
if (rbio->split) {
struct bch_read_bio *parent = rbio->parent;
- if (rbio->kmalloc)
- kfree(rbio);
- else
+ if (unlikely(rbio->promote)) {
+ if (!rbio->bio.bi_status)
+ promote_start(rbio);
+ else
+ promote_free(rbio);
+ } else {
+ async_object_list_del(rbio->c, rbio, rbio->list_idx);
+
+ if (rbio->bounce)
+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
bio_put(&rbio->bio);
+ }
rbio = parent;
}
@@ -405,64 +457,125 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
if (rbio->start_time)
bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
rbio->start_time);
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ if (rbio->list_idx)
+ async_object_list_del(rbio->c, rbio, rbio->list_idx);
+#endif
bio_endio(&rbio->bio);
}
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter,
- struct bch_io_failures *failed,
- unsigned flags)
+static void get_rbio_extent(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ struct bkey_buf *sk)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
- struct bkey_buf sk;
struct bkey_s_c k;
- int ret;
+ int ret = lockrestart_do(trans,
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
+ rbio->data_btree, rbio->data_pos, 0)));
+ if (ret)
+ return;
+
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ bkey_for_each_ptr(ptrs, ptr)
+ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) {
+ bch2_bkey_buf_reassemble(sk, trans->c, k);
+ break;
+ }
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
+ bch2_trans_iter_exit(trans, &iter);
+}
- bch2_bkey_buf_init(&sk);
+static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio,
+ enum btree_id btree, struct bkey_s_c read_k)
+{
+ if (!bch2_poison_extents_on_checksum_error)
+ return 0;
+
+ struct bch_fs *c = trans->c;
+
+ struct data_update *u = rbio_data_update(rbio);
+ if (u)
+ read_k = bkey_i_to_s_c(u->k.k);
+
+ u64 flags = bch2_bkey_extent_flags(read_k);
+ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k),
+ BTREE_ITER_intent);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (!bkey_and_val_eq(k, read_k))
+ goto out;
+
+ struct bkey_i *new = bch2_trans_kmalloc(trans,
+ bkey_bytes(k.k) + sizeof(struct bch_extent_flags));
+ ret = PTR_ERR_OR_ZERO(new) ?:
+ (bkey_reassemble(new, k), 0) ?:
+ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?:
+ bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_trans_commit(trans, NULL, NULL, 0);
+
+ /*
+ * Propagate key change back to data update path, in particular so it
+ * knows the extent has been poisoned and it's safe to change the
+ * checksum
+ */
+ if (u && !ret)
+ bch2_bkey_buf_copy(&u->k, c, new);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
- bch2_trans_iter_init(trans, &iter, rbio->data_btree,
- rbio->read_pos, BTREE_ITER_slots);
+static noinline int bch2_read_retry_nodecode(struct btree_trans *trans,
+ struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter,
+ struct bch_io_failures *failed,
+ unsigned flags)
+{
+ struct data_update *u = container_of(rbio, struct data_update, rbio);
retry:
bch2_trans_begin(trans);
- rbio->bio.bi_status = 0;
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = lockrestart_do(trans,
+ bkey_err(k = bch2_bkey_get_iter(trans, &iter,
+ u->btree_id, bkey_start_pos(&u->k.k->k),
+ 0)));
if (ret)
goto err;
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- if (!bch2_bkey_matches_ptr(c, k,
- rbio->pick.ptr,
- rbio->data_pos.offset -
- rbio->pick.crc.offset)) {
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) {
/* extent we wanted to read no longer exists: */
- rbio->hole = true;
- goto out;
+ rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten);
+ goto err;
}
ret = __bch2_read_extent(trans, rbio, bvec_iter,
- rbio->read_pos,
- rbio->data_btree,
- k, 0, failed, flags);
- if (ret == READ_RETRY)
- goto retry;
- if (ret)
- goto err;
-out:
- bch2_rbio_done(rbio);
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- bch2_bkey_buf_exit(&sk, c);
- return;
+ bkey_start_pos(&u->k.k->k),
+ u->btree_id,
+ bkey_i_to_s_c(u->k.k),
+ 0, failed, flags, -1);
err:
- rbio->bio.bi_status = BLK_STS_IOERR;
- goto out;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+ bch2_err_matches(ret, BCH_ERR_data_read_retry))
+ goto retry;
+
+ if (ret) {
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ rbio->ret = ret;
+ }
+
+ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1);
+ return ret;
}
static void bch2_rbio_retry(struct work_struct *work)
@@ -478,68 +591,108 @@ static void bch2_rbio_retry(struct work_struct *work)
};
struct bch_io_failures failed = { .nr = 0 };
- trace_and_count(c, read_retry, &rbio->bio);
+ struct btree_trans *trans = bch2_trans_get(c);
- if (rbio->retry == READ_RETRY_AVOID)
- bch2_mark_io_failure(&failed, &rbio->pick);
+ struct bkey_buf sk;
+ bch2_bkey_buf_init(&sk);
+ bkey_init(&sk.k->k);
+
+ trace_io_read_retry(&rbio->bio);
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_retry],
+ bvec_iter_sectors(rbio->bvec_iter));
+
+ get_rbio_extent(trans, rbio, &sk);
+
+ if (!bkey_deleted(&sk.k->k) &&
+ bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
+ bch2_mark_io_failure(&failed, &rbio->pick,
+ rbio->ret == -BCH_ERR_data_read_retry_csum_err);
+
+ if (!rbio->split) {
+ rbio->bio.bi_status = 0;
+ rbio->ret = 0;
+ }
- rbio->bio.bi_status = 0;
+ unsigned subvol = rbio->subvol;
+ struct bpos read_pos = rbio->read_pos;
rbio = bch2_rbio_free(rbio);
- flags |= BCH_READ_IN_RETRY;
- flags &= ~BCH_READ_MAY_PROMOTE;
+ flags |= BCH_READ_in_retry;
+ flags &= ~BCH_READ_may_promote;
+ flags &= ~BCH_READ_last_fragment;
+ flags |= BCH_READ_must_clone;
- if (flags & BCH_READ_NODECODE) {
- bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
- } else {
- flags &= ~BCH_READ_LAST_FRAGMENT;
- flags |= BCH_READ_MUST_CLONE;
+ int ret = rbio->data_update
+ ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags)
+ : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags);
- __bch2_read(c, rbio, iter, inum, &failed, flags);
+ if (ret) {
+ rbio->ret = ret;
+ rbio->bio.bi_status = BLK_STS_IOERR;
}
-}
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
- blk_status_t error)
-{
- rbio->retry = retry;
+ if (failed.nr || ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
- if (rbio->flags & BCH_READ_IN_RETRY)
- return;
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf,
+ (subvol_inum) { subvol, read_pos.inode },
+ read_pos.offset << 9));
+ if (rbio->data_update)
+ prt_str(&buf, "(internal move) ");
- if (retry == READ_ERR) {
- rbio = bch2_rbio_free(rbio);
+ prt_str(&buf, "data read error, ");
+ if (!ret) {
+ prt_str(&buf, "successful retry");
+ if (rbio->self_healing)
+ prt_str(&buf, ", self healing");
+ } else
+ prt_str(&buf, bch2_err_str(ret));
+ prt_newline(&buf);
- rbio->bio.bi_status = error;
- bch2_rbio_done(rbio);
- } else {
- bch2_rbio_punt(rbio, bch2_rbio_retry,
- RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+
+ if (!bkey_deleted(&sk.k->k)) {
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
+ prt_newline(&buf);
+ }
+
+ bch2_io_failures_to_text(&buf, c, &failed);
+
+ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
}
+
+ bch2_rbio_done(rbio);
+ bch2_bkey_buf_exit(&sk, c);
+ bch2_trans_put(trans);
}
-static void bch2_read_io_err(struct work_struct *work)
+static void bch2_rbio_error(struct bch_read_bio *rbio,
+ int ret, blk_status_t blk_error)
{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bio *bio = &rbio->bio;
- struct bch_fs *c = rbio->c;
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- struct printbuf buf = PRINTBUF;
+ BUG_ON(ret >= 0);
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
+ rbio->ret = ret;
+ rbio->bio.bi_status = blk_error;
- if (ca) {
- bch2_io_error(ca, BCH_MEMBER_ERROR_read);
- bch_err_ratelimited(ca, "%s", buf.buf);
+ bch2_rbio_parent(rbio)->saw_error = true;
+
+ if (rbio->flags & BCH_READ_in_retry)
+ return;
+
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) {
+ bch2_rbio_punt(rbio, bch2_rbio_retry,
+ RBIO_CONTEXT_UNBOUND, system_unbound_wq);
} else {
- bch_err_ratelimited(c, "%s", buf.buf);
- }
+ rbio = bch2_rbio_free(rbio);
- printbuf_exit(&buf);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+ rbio->ret = ret;
+ rbio->bio.bi_status = blk_error;
+
+ bch2_rbio_done(rbio);
+ }
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
@@ -605,33 +758,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
__bch2_rbio_narrow_crcs(trans, rbio));
}
-static void bch2_read_csum_err(struct work_struct *work)
-{
- struct bch_read_bio *rbio =
- container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
- struct bch_extent_crc_unpacked crc = rbio->pick.crc;
- struct nonce nonce = extent_nonce(rbio->version, crc);
- struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- struct printbuf buf = PRINTBUF;
-
- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
- prt_str(&buf, "data ");
- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
-
- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
- if (ca) {
- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
- bch_err_ratelimited(ca, "%s", buf.buf);
- } else {
- bch_err_ratelimited(c, "%s", buf.buf);
- }
-
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
- printbuf_exit(&buf);
-}
-
static void bch2_read_decompress_err(struct work_struct *work)
{
struct bch_read_bio *rbio =
@@ -648,7 +774,7 @@ static void bch2_read_decompress_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -668,7 +794,7 @@ static void bch2_read_decrypt_err(struct work_struct *work)
else
bch_err_ratelimited(c, "%s", buf.buf);
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
@@ -678,9 +804,11 @@ static void __bch2_read_endio(struct work_struct *work)
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
- struct bio *src = &rbio->bio;
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->bvec_iter;
+ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
+ struct bch_read_bio *parent = bch2_rbio_parent(rbio);
+ struct bio *src = &rbio->bio;
+ struct bio *dst = &parent->bio;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
unsigned nofs_flags;
@@ -698,8 +826,26 @@ static void __bch2_read_endio(struct work_struct *work)
src->bi_iter = rbio->bvec_iter;
}
+ bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio);
+
csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
- if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+ bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io;
+
+ /*
+ * Checksum error: if the bio wasn't bounced, we may have been
+ * reading into buffers owned by userspace (that userspace can
+ * scribble over) - retry the read, bouncing it this time:
+ */
+ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) {
+ rbio->flags |= BCH_READ_must_bounce;
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace,
+ BLK_STS_IOERR);
+ goto out;
+ }
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good)
goto csum_err;
/*
@@ -712,32 +858,40 @@ static void __bch2_read_endio(struct work_struct *work)
if (unlikely(rbio->narrow_crcs))
bch2_rbio_narrow_crcs(rbio);
- if (rbio->flags & BCH_READ_NODECODE)
- goto nodecode;
+ if (likely(!parent->data_update)) {
+ /* Adjust crc to point to subset of data we want: */
+ crc.offset += rbio->offset_into_extent;
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- /* Adjust crc to point to subset of data we want: */
- crc.offset += rbio->offset_into_extent;
- crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
+ if (crc_is_compressed(crc)) {
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
- if (crc_is_compressed(crc)) {
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+ !c->opts.no_data_io)
+ goto decompression_err;
+ } else {
+ /* don't need to decrypt the entire bio: */
+ nonce = nonce_add(nonce, crc.offset << 9);
+ bio_advance(src, crc.offset << 9);
- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
- !c->opts.no_data_io)
- goto decompression_err;
- } else {
- /* don't need to decrypt the entire bio: */
- nonce = nonce_add(nonce, crc.offset << 9);
- bio_advance(src, crc.offset << 9);
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+ src->bi_iter.bi_size = dst_iter.bi_size;
- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
- src->bi_iter.bi_size = dst_iter.bi_size;
+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+ if (ret)
+ goto decrypt_err;
- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
- if (ret)
- goto decrypt_err;
+ if (rbio->bounce) {
+ struct bvec_iter src_iter = src->bi_iter;
+
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ }
+ }
+ } else {
+ if (rbio->split)
+ rbio->parent->pick = rbio->pick;
if (rbio->bounce) {
struct bvec_iter src_iter = src->bi_iter;
@@ -754,12 +908,9 @@ static void __bch2_read_endio(struct work_struct *work)
ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (ret)
goto decrypt_err;
-
- promote_start(rbio->promote, rbio);
- rbio->promote = NULL;
}
-nodecode:
- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+
+ if (likely(!(rbio->flags & BCH_READ_in_retry))) {
rbio = bch2_rbio_free(rbio);
bch2_rbio_done(rbio);
}
@@ -767,18 +918,7 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
- /*
- * Checksum error: if the bio wasn't bounced, we may have been
- * reading into buffers owned by userspace (that userspace can
- * scribble over) - retry the read, bouncing it this time:
- */
- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_MUST_BOUNCE;
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
- goto out;
- }
-
- bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
goto out;
decompression_err:
bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
@@ -797,27 +937,25 @@ static void bch2_read_endio(struct bio *bio)
struct workqueue_struct *wq = NULL;
enum rbio_context context = RBIO_CONTEXT_NULL;
- if (rbio->have_ioref) {
- bch2_latency_acct(ca, rbio->submit_time, READ);
- percpu_ref_put(&ca->io_ref);
- }
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rbio->submit_time, !bio->bi_status);
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
if (unlikely(bio->bi_status)) {
- bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
return;
}
- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) ||
(ca && dev_ptr_stale(ca, &rbio->pick.ptr))) {
- trace_and_count(c, read_reuse_race, &rbio->bio);
+ trace_and_count(c, io_read_reuse_race, &rbio->bio);
- if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+ if (rbio->flags & BCH_READ_retry_if_stale)
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN);
else
- bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN);
return;
}
@@ -856,7 +994,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
prt_printf(&buf, "memory gen: %u", gen);
- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter)));
if (!ret) {
prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
@@ -883,15 +1021,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent,
- struct bch_io_failures *failed, unsigned flags)
+ struct bch_io_failures *failed, unsigned flags, int dev)
{
struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
- struct promote_op *promote = NULL;
bool bounce = false, read_full = false, narrow_crcs = false;
struct bpos data_pos = bkey_start_pos(k.k);
- int pick_ret;
+ struct data_update *u = rbio_data_update(orig);
+ int ret = 0;
if (bkey_extent_is_inline_data(k.k)) {
unsigned bytes = min_t(unsigned, iter.bi_size,
@@ -902,19 +1040,35 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
swap(iter.bi_size, bytes);
bio_advance_iter(&orig->bio, &iter, bytes);
zero_fill_bio_iter(&orig->bio, iter);
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_inline],
+ bvec_iter_sectors(iter));
goto out_read_done;
}
+
+ if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) &&
+ !orig->data_update)
+ return bch_err_throw(c, extent_poisoned);
retry_pick:
- pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+ ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev);
/* hole or reservation - just zero fill: */
- if (!pick_ret)
+ if (!ret)
goto hole;
- if (unlikely(pick_ret < 0)) {
+ if (unlikely(ret < 0)) {
+ if (ret == -BCH_ERR_data_read_csum_err) {
+ int ret2 = maybe_poison_extent(trans, orig, data_btree, k);
+ if (ret2) {
+ ret = ret2;
+ goto err;
+ }
+
+ trace_and_count(c, io_read_fail_and_poison, &orig->bio);
+ }
+
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
- prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret));
+ prt_printf(&buf, "%s\n ", bch2_err_str(ret));
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
@@ -922,7 +1076,8 @@ retry_pick:
goto err;
}
- if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) {
+ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) &&
+ !c->chacha20_key_set) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, orig, read_pos);
prt_printf(&buf, "attempting to read encrypted data without encryption key\n ");
@@ -930,10 +1085,12 @@ retry_pick:
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
+ ret = bch_err_throw(c, data_read_no_encryption_key);
goto err;
}
- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
+ BCH_DEV_READ_REF_io_read);
/*
* Stale dirty pointers are treated as IO errors, but @failed isn't
@@ -941,62 +1098,58 @@ retry_pick:
* retry path, don't check here, it'll be caught in bch2_read_endio()
* and we'll end up in the retry path:
*/
- if ((flags & BCH_READ_IN_RETRY) &&
+ if ((flags & BCH_READ_in_retry) &&
!pick.ptr.cached &&
ca &&
unlikely(dev_ptr_stale(ca, &pick.ptr))) {
read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
- bch2_mark_io_failure(failed, &pick);
- percpu_ref_put(&ca->io_ref);
+ bch2_mark_io_failure(failed, &pick, false);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read);
goto retry_pick;
}
- /*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
- bch2_trans_unlock(trans);
+ if (likely(!u)) {
+ if (!(flags & BCH_READ_last_fragment) ||
+ bio_flagged(&orig->bio, BIO_CHAIN))
+ flags |= BCH_READ_must_clone;
+
+ narrow_crcs = !(flags & BCH_READ_in_retry) &&
+ bch2_can_narrow_extent_crcs(k, pick.crc);
+
+ if (narrow_crcs && (flags & BCH_READ_user_mapped))
+ flags |= BCH_READ_must_bounce;
- if (flags & BCH_READ_NODECODE) {
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+ if (crc_is_compressed(pick.crc) ||
+ (pick.crc.csum_type != BCH_CSUM_none &&
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+ (flags & BCH_READ_user_mapped)) ||
+ (flags & BCH_READ_must_bounce)))) {
+ read_full = true;
+ bounce = true;
+ }
+ } else {
/*
* can happen if we retry, and the extent we were going to read
* has been merged in the meantime:
*/
- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) {
+ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) {
if (ca)
- percpu_ref_put(&ca->io_ref);
- goto hole;
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_io_read);
+ rbio->ret = bch_err_throw(c, data_read_buffer_too_small);
+ goto out_read_done;
}
iter.bi_size = pick.crc.compressed_size << 9;
- goto get_bio;
- }
-
- if (!(flags & BCH_READ_LAST_FRAGMENT) ||
- bio_flagged(&orig->bio, BIO_CHAIN))
- flags |= BCH_READ_MUST_CLONE;
-
- narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
- bch2_can_narrow_extent_crcs(k, pick.crc);
-
- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
- flags |= BCH_READ_MUST_BOUNCE;
-
- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
- if (crc_is_compressed(pick.crc) ||
- (pick.crc.csum_type != BCH_CSUM_none &&
- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
- (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
- (flags & BCH_READ_USER_MAPPED)) ||
- (flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
- bounce = true;
}
if (orig->opts.promote_target || have_io_error(failed))
- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
- &rbio, &bounce, &read_full, failed);
+ rbio = promote_alloc(trans, iter, k, &pick, flags, orig,
+ &bounce, &read_full, failed);
if (!read_full) {
EBUG_ON(crc_is_compressed(pick.crc));
@@ -1015,7 +1168,7 @@ retry_pick:
pick.crc.offset = 0;
pick.crc.live_size = bvec_iter_sectors(iter);
}
-get_bio:
+
if (rbio) {
/*
* promote already allocated bounce rbio:
@@ -1030,17 +1183,16 @@ get_bio:
} else if (bounce) {
unsigned sectors = pick.crc.compressed_size;
- rbio = rbio_init(bio_alloc_bioset(NULL,
+ rbio = rbio_init_fragment(bio_alloc_bioset(NULL,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
0,
GFP_NOFS,
&c->bio_read_split),
- orig->opts);
+ orig);
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
rbio->bounce = true;
- rbio->split = true;
- } else if (flags & BCH_READ_MUST_CLONE) {
+ } else if (flags & BCH_READ_must_clone) {
/*
* Have to clone if there were any splits, due to error
* reporting issues (if a split errored, and retrying didn't
@@ -1049,11 +1201,10 @@ get_bio:
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
&c->bio_read_split),
- orig->opts);
+ orig);
rbio->bio.bi_iter = iter;
- rbio->split = true;
} else {
rbio = orig;
rbio->bio.bi_iter = iter;
@@ -1062,68 +1213,66 @@ get_bio:
EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
- rbio->c = c;
rbio->submit_time = local_clock();
- if (rbio->split)
- rbio->parent = orig;
- else
+ if (!rbio->split)
rbio->end_io = orig->bio.bi_end_io;
rbio->bvec_iter = iter;
rbio->offset_into_extent= offset_into_extent;
rbio->flags = flags;
rbio->have_ioref = ca != NULL;
rbio->narrow_crcs = narrow_crcs;
- rbio->hole = 0;
- rbio->retry = 0;
+ rbio->ret = 0;
rbio->context = 0;
- /* XXX: only initialize this if needed */
- rbio->devs_have = bch2_bkey_devs(k);
rbio->pick = pick;
rbio->subvol = orig->subvol;
rbio->read_pos = read_pos;
rbio->data_btree = data_btree;
rbio->data_pos = data_pos;
rbio->version = k.k->bversion;
- rbio->promote = promote;
INIT_WORK(&rbio->work, NULL);
- if (flags & BCH_READ_NODECODE)
- orig->pick = pick;
-
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
+ async_object_list_add(c, rbio, rbio, &rbio->list_idx);
+
if (rbio->bounce)
- trace_and_count(c, read_bounce, &rbio->bio);
+ trace_and_count(c, io_read_bounce, &rbio->bio);
- this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ if (!u)
+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+ else
+ this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio));
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
/*
* If it's being moved internally, we don't want to flag it as a cache
* hit:
*/
- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+ if (ca && pick.ptr.cached && !u)
bch2_bucket_io_time_reset(trans, pick.ptr.dev,
PTR_BUCKET_NR(ca, &pick.ptr), READ);
- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) {
bio_inc_remaining(&orig->bio);
- trace_and_count(c, read_split, &orig->bio);
+ trace_and_count(c, io_read_split, &orig->bio);
}
- if (!rbio->pick.idx) {
- if (unlikely(!rbio->have_ioref)) {
- struct printbuf buf = PRINTBUF;
- bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
- prt_printf(&buf, "no device to read from:\n ");
- bch2_bkey_val_to_text(&buf, c, k);
-
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ /*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+ if (!(flags & BCH_READ_in_retry))
+ bch2_trans_unlock(trans);
+ else
+ bch2_trans_unlock_long(trans);
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ if (likely(!rbio->pick.do_ec_reconstruct)) {
+ if (unlikely(!rbio->have_ioref)) {
+ bch2_rbio_error(rbio,
+ -BCH_ERR_data_read_retry_device_offline,
+ BLK_STS_IOERR);
goto out;
}
@@ -1132,10 +1281,10 @@ get_bio:
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (unlikely(c->opts.no_data_io)) {
- if (likely(!(flags & BCH_READ_IN_RETRY)))
+ if (likely(!(flags & BCH_READ_in_retry)))
bio_endio(&rbio->bio);
} else {
- if (likely(!(flags & BCH_READ_IN_RETRY)))
+ if (likely(!(flags & BCH_READ_in_retry)))
submit_bio(&rbio->bio);
else
submit_bio_wait(&rbio->bio);
@@ -1149,70 +1298,76 @@ get_bio:
} else {
/* Attempting reconstruct read: */
if (bch2_ec_read_extent(trans, rbio, k)) {
- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err,
+ BLK_STS_IOERR);
goto out;
}
- if (likely(!(flags & BCH_READ_IN_RETRY)))
+ if (likely(!(flags & BCH_READ_in_retry)))
bio_endio(&rbio->bio);
}
out:
- if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ if (likely(!(flags & BCH_READ_in_retry))) {
return 0;
} else {
+ bch2_trans_unlock(trans);
+
int ret;
rbio->context = RBIO_CONTEXT_UNBOUND;
bch2_read_endio(&rbio->bio);
- ret = rbio->retry;
+ ret = rbio->ret;
rbio = bch2_rbio_free(rbio);
- if (ret == READ_RETRY_AVOID) {
- bch2_mark_io_failure(failed, &pick);
- ret = READ_RETRY;
- }
-
- if (!ret)
- goto out_read_done;
+ if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
+ bch2_mark_io_failure(failed, &pick,
+ ret == -BCH_ERR_data_read_retry_csum_err);
return ret;
}
err:
- if (flags & BCH_READ_IN_RETRY)
- return READ_ERR;
+ if (flags & BCH_READ_in_retry)
+ return ret;
- orig->bio.bi_status = BLK_STS_IOERR;
+ orig->bio.bi_status = BLK_STS_IOERR;
+ orig->ret = ret;
goto out_read_done;
hole:
+ this_cpu_add(c->counters[BCH_COUNTER_io_read_hole],
+ bvec_iter_sectors(iter));
/*
- * won't normally happen in the BCH_READ_NODECODE
- * (bch2_move_extent()) path, but if we retry and the extent we wanted
- * to read no longer exists we have to signal that:
+ * won't normally happen in the data update (bch2_move_extent()) path,
+ * but if we retry and the extent we wanted to read no longer exists we
+ * have to signal that:
*/
- if (flags & BCH_READ_NODECODE)
- orig->hole = true;
+ if (u)
+ orig->ret = bch_err_throw(c, data_read_key_overwritten);
zero_fill_bio_iter(&orig->bio, iter);
out_read_done:
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if ((flags & BCH_READ_last_fragment) &&
+ !(flags & BCH_READ_in_retry))
bch2_rbio_done(orig);
return 0;
}
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, subvol_inum inum,
- struct bch_io_failures *failed, unsigned flags)
+int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, subvol_inum inum,
+ struct bch_io_failures *failed,
+ struct bkey_buf *prev_read,
+ unsigned flags)
{
- struct btree_trans *trans = bch2_trans_get(c);
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_buf sk;
struct bkey_s_c k;
+ enum btree_id data_btree;
int ret;
- BUG_ON(flags & BCH_READ_NODECODE);
+ EBUG_ON(rbio->data_update);
bch2_bkey_buf_init(&sk);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
@@ -1220,7 +1375,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
BTREE_ITER_slots);
while (1) {
- enum btree_id data_btree = BTREE_ID_extents;
+ data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1229,12 +1384,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
- bch2_btree_iter_set_pos(&iter,
+ bch2_btree_iter_set_pos(trans, &iter,
POS(inum.inum, bvec_iter.bi_sector));
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -1252,6 +1407,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
k = bkey_i_to_s_c(sk.k);
+ if (unlikely(flags & BCH_READ_in_retry)) {
+ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k)))
+ failed->nr = 0;
+ bch2_bkey_buf_copy(prev_read, c, sk.k);
+ }
+
/*
* With indirect extents, the amount of data to read is the min
* of the original extent and the indirect extent:
@@ -1262,42 +1423,91 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
swap(bvec_iter.bi_size, bytes);
if (bvec_iter.bi_size == bytes)
- flags |= BCH_READ_LAST_FRAGMENT;
+ flags |= BCH_READ_last_fragment;
ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
data_btree, k,
- offset_into_extent, failed, flags);
+ offset_into_extent, failed, flags, -1);
+ swap(bvec_iter.bi_size, bytes);
+
if (ret)
goto err;
- if (flags & BCH_READ_LAST_FRAGMENT)
+ if (flags & BCH_READ_last_fragment)
break;
- swap(bvec_iter.bi_size, bytes);
bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
err:
+ if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace)
+ flags |= BCH_READ_must_bounce;
+
if (ret &&
!bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
- ret != READ_RETRY &&
- ret != READ_RETRY_AVOID)
+ !bch2_err_matches(ret, BCH_ERR_data_read_retry))
break;
}
- bch2_trans_iter_exit(trans, &iter);
+ if (unlikely(ret)) {
+ if (ret != -BCH_ERR_extent_poisoned) {
+ struct printbuf buf = PRINTBUF;
+ lockrestart_do(trans,
+ bch2_inum_offset_err_msg_trans(trans, &buf, inum,
+ bvec_iter.bi_sector << 9));
+ prt_printf(&buf, "data read error: %s", bch2_err_str(ret));
+ bch_err_ratelimited(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
- if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9);
- prt_printf(&buf, "read error %i from btree lookup", ret);
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ rbio->bio.bi_status = BLK_STS_IOERR;
+ rbio->ret = ret;
- rbio->bio.bi_status = BLK_STS_IOERR;
- bch2_rbio_done(rbio);
+ if (!(flags & BCH_READ_in_retry))
+ bch2_rbio_done(rbio);
}
- bch2_trans_put(trans);
+ bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
+ return ret;
+}
+
+static const char * const bch2_read_bio_flags[] = {
+#define x(n) #n,
+ BCH_READ_FLAGS()
+#undef x
+ NULL
+};
+
+void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio)
+{
+ u64 now = local_clock();
+ prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0);
+ prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0);
+
+ if (!rbio->split)
+ prt_printf(out, "end_io:\t%ps\n", rbio->end_io);
+ else
+ prt_printf(out, "parent:\t%px\n", rbio->parent);
+
+ prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io);
+
+ prt_printf(out, "promote:\t%u\n", rbio->promote);
+ prt_printf(out, "bounce:\t%u\n", rbio->bounce);
+ prt_printf(out, "split:\t%u\n", rbio->split);
+ prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref);
+ prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs);
+ prt_printf(out, "context:\t%u\n", rbio->context);
+
+ int ret = READ_ONCE(rbio->ret);
+ if (ret < 0)
+ prt_printf(out, "ret:\t%s\n", bch2_err_str(ret));
+ else
+ prt_printf(out, "ret:\t%i\n", ret);
+
+ prt_printf(out, "flags:\t");
+ bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags);
+ prt_newline(out);
+
+ bch2_bio_to_text(out, &rbio->bio);
}
void bch2_fs_io_read_exit(struct bch_fs *c)
@@ -1306,20 +1516,28 @@ void bch2_fs_io_read_exit(struct bch_fs *c)
rhashtable_destroy(&c->promote_table);
bioset_exit(&c->bio_read_split);
bioset_exit(&c->bio_read);
+ mempool_exit(&c->bio_bounce_pages);
}
int bch2_fs_io_read_init(struct bch_fs *c)
{
+ if (mempool_init_page_pool(&c->bio_bounce_pages,
+ max_t(unsigned,
+ c->opts.btree_node_size,
+ c->opts.encoded_extent_max) /
+ PAGE_SIZE, 0))
+ return bch_err_throw(c, ENOMEM_bio_bounce_pages_init);
+
if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_init;
+ return bch_err_throw(c, ENOMEM_bio_read_init);
if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS))
- return -BCH_ERR_ENOMEM_bio_read_split_init;
+ return bch_err_throw(c, ENOMEM_bio_read_split_init);
if (rhashtable_init(&c->promote_table, &bch_promote_params))
- return -BCH_ERR_ENOMEM_promote_table_init;
+ return bch_err_throw(c, ENOMEM_promote_table_init);
return 0;
}
diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h
index a82e8a94ccb6..9c5ddbf861b3 100644
--- a/fs/bcachefs/io_read.h
+++ b/fs/bcachefs/io_read.h
@@ -3,6 +3,8 @@
#define _BCACHEFS_IO_READ_H
#include "bkey_buf.h"
+#include "btree_iter.h"
+#include "extents_types.h"
#include "reflink.h"
struct bch_read_bio {
@@ -35,19 +37,22 @@ struct bch_read_bio {
u16 flags;
union {
struct {
- u16 bounce:1,
+ u16 data_update:1,
+ promote:1,
+ bounce:1,
split:1,
- kmalloc:1,
have_ioref:1,
narrow_crcs:1,
- hole:1,
- retry:2,
+ saw_error:1,
+ self_healing:1,
context:2;
};
u16 _state;
};
-
- struct bch_devs_list devs_have;
+ s16 ret;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
struct extent_ptr_decoded pick;
@@ -65,8 +70,6 @@ struct bch_read_bio {
struct bpos data_pos;
struct bversion version;
- struct promote_op *promote;
-
struct bch_io_opts opts;
struct work_struct work;
@@ -89,6 +92,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
return 0;
*data_btree = BTREE_ID_reflink;
+
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter,
offset_into_extent,
@@ -100,72 +105,111 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans,
if (bkey_deleted(k.k)) {
bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_missing_indirect_extent;
+ return bch_err_throw(c, missing_indirect_extent);
}
- bch2_bkey_buf_reassemble(extent, trans->c, k);
+ bch2_bkey_buf_reassemble(extent, c, k);
bch2_trans_iter_exit(trans, &iter);
return 0;
}
+#define BCH_READ_FLAGS() \
+ x(retry_if_stale) \
+ x(may_promote) \
+ x(user_mapped) \
+ x(last_fragment) \
+ x(must_bounce) \
+ x(must_clone) \
+ x(in_retry)
+
+enum __bch_read_flags {
+#define x(n) __BCH_READ_##n,
+ BCH_READ_FLAGS()
+#undef x
+};
+
enum bch_read_flags {
- BCH_READ_RETRY_IF_STALE = 1 << 0,
- BCH_READ_MAY_PROMOTE = 1 << 1,
- BCH_READ_USER_MAPPED = 1 << 2,
- BCH_READ_NODECODE = 1 << 3,
- BCH_READ_LAST_FRAGMENT = 1 << 4,
-
- /* internal: */
- BCH_READ_MUST_BOUNCE = 1 << 5,
- BCH_READ_MUST_CLONE = 1 << 6,
- BCH_READ_IN_RETRY = 1 << 7,
+#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n),
+ BCH_READ_FLAGS()
+#undef x
};
int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
struct bvec_iter, struct bpos, enum btree_id,
struct bkey_s_c, unsigned,
- struct bch_io_failures *, unsigned);
+ struct bch_io_failures *, unsigned, int);
static inline void bch2_read_extent(struct btree_trans *trans,
struct bch_read_bio *rbio, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
unsigned offset_into_extent, unsigned flags)
{
- __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
- data_btree, k, offset_into_extent, NULL, flags);
+ int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+ data_btree, k, offset_into_extent, NULL, flags, -1);
+ /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */
+ WARN(ret, "unhandled error from __bch2_read_extent()");
}
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
- subvol_inum, struct bch_io_failures *, unsigned flags);
+int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter,
+ subvol_inum,
+ struct bch_io_failures *, struct bkey_buf *, unsigned flags);
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
subvol_inum inum)
{
- struct bch_io_failures failed = { .nr = 0 };
-
BUG_ON(rbio->_state);
- rbio->c = c;
- rbio->start_time = local_clock();
rbio->subvol = inum.subvol;
- __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_MAY_PROMOTE|
- BCH_READ_USER_MAPPED);
+ bch2_trans_run(c,
+ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL,
+ BCH_READ_retry_if_stale|
+ BCH_READ_may_promote|
+ BCH_READ_user_mapped));
+}
+
+static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio,
+ struct bch_read_bio *orig)
+{
+ struct bch_read_bio *rbio = to_rbio(bio);
+
+ rbio->c = orig->c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->split = true;
+ rbio->parent = orig;
+ rbio->opts = orig->opts;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ rbio->list_idx = 0;
+#endif
+ return rbio;
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
- struct bch_io_opts opts)
+ struct bch_fs *c,
+ struct bch_io_opts opts,
+ bio_end_io_t end_io)
{
struct bch_read_bio *rbio = to_rbio(bio);
- rbio->_state = 0;
- rbio->promote = NULL;
- rbio->opts = opts;
+ rbio->start_time = local_clock();
+ rbio->c = c;
+ rbio->_state = 0;
+ rbio->flags = 0;
+ rbio->ret = 0;
+ rbio->opts = opts;
+ rbio->bio.bi_end_io = end_io;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ rbio->list_idx = 0;
+#endif
return rbio;
}
+struct promote_op;
+void bch2_promote_op_to_text(struct printbuf *, struct promote_op *);
+void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *);
+
void bch2_fs_io_read_exit(struct bch_fs *);
int bch2_fs_io_read_init(struct bch_fs *);
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index dd508d93e9fc..88b1eec8eff3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -6,6 +6,7 @@
#include "bcachefs.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
@@ -15,6 +16,7 @@
#include "compress.h"
#include "debug.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
@@ -34,6 +36,12 @@
#include <linux/random.h>
#include <linux/sched/mm.h>
+#ifdef CONFIG_BCACHEFS_DEBUG
+static unsigned bch2_write_corrupt_ratio;
+module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644);
+MODULE_PARM_DESC(write_corrupt_ratio, "");
+#endif
+
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
@@ -162,9 +170,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
*i_sectors_delta = 0;
*disk_sectors_delta = 0;
- bch2_trans_copy_iter(&iter, extent_iter);
+ bch2_trans_copy_iter(trans, &iter, extent_iter);
- for_each_btree_key_max_continue_norestart(iter,
+ for_each_btree_key_max_continue_norestart(trans, iter,
new->k.p, BTREE_ITER_slots, old, ret) {
s64 sectors = min(new->k.p.offset, old.k->p.offset) -
max(bkey_start_offset(&new->k),
@@ -249,10 +257,35 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
}
if (i_sectors_delta) {
+ s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors);
+ if (unlikely(bi_sectors + i_sectors_delta < 0)) {
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+ prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0",
+ extent_iter->pos.inode, bi_sectors, i_sectors_delta);
+
+ bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf);
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ if (i_sectors_delta < 0)
+ i_sectors_delta = -bi_sectors;
+ else
+ i_sectors_delta = 0;
+ }
+
le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
inode_update_flags = 0;
}
+ /*
+ * extents, dirents and xattrs updates require that an inode update also
+ * happens - to ensure that if a key exists in one of those btrees with
+ * a given snapshot ID an inode is also present - so we may have to skip
+ * the nojournal optimization:
+ */
if (inode->k.p.snapshot != iter.snapshot) {
inode->k.p.snapshot = iter.snapshot;
inode_update_flags = 0;
@@ -286,7 +319,7 @@ int bch2_extent_update(struct btree_trans *trans,
* path already traversed at iter->pos because
* bch2_trans_extent_update() will use it to attempt extent merging
*/
- ret = __bch2_btree_iter_traverse(iter);
+ ret = __bch2_btree_iter_traverse(trans, iter);
if (ret)
return ret;
@@ -331,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans,
if (i_sectors_delta_total)
*i_sectors_delta_total += i_sectors_delta;
- bch2_btree_iter_set_pos(iter, next_pos);
+ bch2_btree_iter_set_pos(trans, iter, next_pos);
return 0;
}
@@ -370,11 +403,10 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_slots|BTREE_ITER_intent);
- ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?:
- bch2_extent_update(trans, inum, &iter, sk.k,
+ ret = bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
- op->flags & BCH_WRITE_CHECK_ENOSPC);
+ op->flags & BCH_WRITE_check_enospc);
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -396,19 +428,36 @@ static int bch2_write_index_default(struct bch_write_op *op)
/* Writes */
-static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
- u64 offset)
+void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...)
{
- bch2_inum_offset_err_msg(op->c, out,
- (subvol_inum) { op->subvol, op->pos.inode, },
- offset << 9);
- prt_printf(out, "write error%s: ",
- op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
-}
+ struct printbuf buf = PRINTBUF;
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
-{
- __bch2_write_op_error(out, op, op->pos.offset);
+ if (op->subvol) {
+ bch2_inum_offset_err_msg(op->c, &buf,
+ (subvol_inum) { op->subvol, op->pos.inode, },
+ offset << 9);
+ } else {
+ struct bpos pos = op->pos;
+ pos.offset = offset;
+ bch2_inum_snap_offset_err_msg(op->c, &buf, pos);
+ }
+
+ prt_str(&buf, "write error: ");
+
+ va_list args;
+ va_start(args, fmt);
+ prt_vprintf(&buf, fmt, args);
+ va_end(args);
+
+ if (op->flags & BCH_WRITE_move) {
+ struct data_update *u = container_of(op, struct data_update, op);
+
+ prt_printf(&buf, "\n from internal move ");
+ bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k));
+ }
+
+ bch_err_ratelimited(op->c, "%s", buf.buf);
+ printbuf_exit(&buf);
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -418,15 +467,28 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
struct bch_write_bio *n;
+ unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE;
+ unsigned ref_idx = type == BCH_DATA_btree
+ ? BCH_DEV_READ_REF_btree_node_write
+ : BCH_DEV_WRITE_REF_io_write;
BUG_ON(c->opts.nochanges);
+ const struct bch_extent_ptr *last = NULL;
+ bkey_for_each_ptr(ptrs, ptr)
+ last = ptr;
+
bkey_for_each_ptr(ptrs, ptr) {
+ /*
+ * XXX: btree writes should be using io_ref[WRITE], but we
+ * aren't retrying failed btree writes yet (due to device
+ * removal/ro):
+ */
struct bch_dev *ca = nocow
? bch2_dev_have_ref(c, ptr->dev)
- : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE);
+ : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx);
- if (to_entry(ptr + 1) < ptrs.end) {
+ if (ptr != last) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set));
n->bio.bi_end_io = wbio->bio.bi_end_io;
@@ -483,18 +545,20 @@ static void bch2_write_done(struct closure *cl)
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
bch2_disk_reservation_put(c, &op->res);
- if (!(op->flags & BCH_WRITE_MOVE))
- bch2_write_ref_put(c, BCH_WRITE_REF_write);
+ if (!(op->flags & BCH_WRITE_move))
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
EBUG_ON(cl->parent);
closure_debug_destroy(cl);
+ async_object_list_del(c, write_op, op->list_idx);
if (op->end_io)
op->end_io(op);
}
static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{
+ struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_i *src, *dst = keys->keys, *n;
@@ -506,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
test_bit(ptr->dev, op->failed.d));
if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
- return -EIO;
+ return bch_err_throw(c, data_write_io);
}
if (dst != src)
@@ -529,7 +593,7 @@ static void __bch2_write_index(struct bch_write_op *op)
unsigned dev;
int ret = 0;
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
ret = bch2_write_drop_io_error_ptrs(op);
if (ret)
goto err;
@@ -538,7 +602,7 @@ static void __bch2_write_index(struct bch_write_op *op)
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
- ret = !(op->flags & BCH_WRITE_MOVE)
+ ret = !(op->flags & BCH_WRITE_move)
? bch2_write_index_default(op)
: bch2_data_update_index_update(op);
@@ -550,11 +614,8 @@ static void __bch2_write_index(struct bch_write_op *op)
if (unlikely(ret && !bch2_err_matches(ret, EROFS))) {
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
- struct printbuf buf = PRINTBUF;
- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
}
if (ret)
@@ -563,21 +624,29 @@ static void __bch2_write_index(struct bch_write_op *op)
out:
/* If some a bucket wasn't written, we can't erasure code it: */
for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
- bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+ bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io);
bch2_open_buckets_put(c, &op->open_buckets);
return;
err:
keys->top = keys->keys;
op->error = ret;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
goto out;
}
static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{
if (state != wp->state) {
+ struct task_struct *p = current;
u64 now = ktime_get_ns();
+ u64 runtime = p->se.sum_exec_runtime +
+ (now - p->se.exec_start);
+
+ if (state == WRITE_POINT_runnable)
+ wp->last_runtime = runtime;
+ else if (wp->state == WRITE_POINT_runnable)
+ wp->time[WRITE_POINT_running] += runtime - wp->last_runtime;
if (wp->last_state_change &&
time_after64(now, wp->last_state_change))
@@ -591,7 +660,7 @@ static inline void wp_update_state(struct write_point *wp, bool running)
{
enum write_point_state state;
- state = running ? WRITE_POINT_running :
+ state = running ? WRITE_POINT_runnable:
!list_empty(&wp->writes) ? WRITE_POINT_waiting_io
: WRITE_POINT_stopped;
@@ -605,8 +674,8 @@ static CLOSURE_CALLBACK(bch2_write_index)
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
- if ((op->flags & BCH_WRITE_SUBMITTED) &&
- (op->flags & BCH_WRITE_MOVE))
+ if ((op->flags & BCH_WRITE_submitted) &&
+ (op->flags & BCH_WRITE_move))
bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
spin_lock_irqsave(&wp->writes_lock, flags);
@@ -644,11 +713,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
if (!op)
break;
- op->flags |= BCH_WRITE_IN_WORKER;
+ op->flags |= BCH_WRITE_in_worker;
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_SUBMITTED))
+ if (!(op->flags & BCH_WRITE_submitted))
__bch2_write(op);
else
bch2_write_done(&op->cl);
@@ -666,13 +735,24 @@ static void bch2_write_endio(struct bio *bio)
? bch2_dev_have_ref(c, wbio->dev)
: NULL;
- if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
- op->pos.inode,
- wbio->inode_offset << 9,
- "data write error: %s",
- bch2_blk_status_to_str(bio->bi_status))) {
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ wbio->submit_time, !bio->bi_status);
+
+ if (unlikely(bio->bi_status)) {
+ if (ca)
+ bch_err_inum_offset_ratelimited(ca,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
+ else
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ wbio->inode_offset << 9,
+ "data write error: %s",
+ bch2_blk_status_to_str(bio->bi_status));
set_bit(wbio->dev, op->failed.d);
- op->flags |= BCH_WRITE_IO_ERROR;
+ op->flags |= BCH_WRITE_io_error;
}
if (wbio->nocow) {
@@ -682,10 +762,9 @@ static void bch2_write_endio(struct bio *bio)
set_bit(wbio->dev, op->devs_need_flush->d);
}
- if (wbio->have_ioref) {
- bch2_latency_acct(ca, wbio->submit_time, WRITE);
- percpu_ref_put(&ca->io_ref);
- }
+ if (wbio->have_ioref)
+ enumerated_ref_put(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
@@ -719,7 +798,10 @@ static void init_append_extent(struct bch_write_op *op,
bch2_extent_crc_append(&e->k_i, crc);
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
- op->flags & BCH_WRITE_CACHED);
+ op->flags & BCH_WRITE_cached);
+
+ if (!(op->flags & BCH_WRITE_move))
+ bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i);
bch2_keylist_push(&op->insert_keys);
}
@@ -779,7 +861,6 @@ static int bch2_write_rechecksum(struct bch_fs *c,
{
struct bio *bio = &op->wbio.bio;
struct bch_extent_crc_unpacked new_crc;
- int ret;
/* bch2_rechecksum_bio() can't encrypt or decrypt data: */
@@ -787,10 +868,10 @@ static int bch2_write_rechecksum(struct bch_fs *c,
bch2_csum_type_is_encryption(new_csum_type))
new_csum_type = op->crc.csum_type;
- ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
- NULL, &new_crc,
- op->crc.offset, op->crc.live_size,
- new_csum_type);
+ int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+ NULL, &new_crc,
+ op->crc.offset, op->crc.live_size,
+ new_csum_type);
if (ret)
return ret;
@@ -800,44 +881,12 @@ static int bch2_write_rechecksum(struct bch_fs *c,
return 0;
}
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
- struct bch_fs *c = op->c;
- struct nonce nonce = extent_nonce(op->version, op->crc);
- struct bch_csum csum;
- int ret;
-
- if (!bch2_csum_type_is_encryption(op->crc.csum_type))
- return 0;
-
- /*
- * If we need to decrypt data in the write path, we'll no longer be able
- * to verify the existing checksum (poly1305 mac, in this case) after
- * it's decrypted - this is the last point we'll be able to reverify the
- * checksum:
- */
- csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return -EIO;
-
- ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
- op->crc.csum_type = 0;
- op->crc.csum = (struct bch_csum) { 0, 0 };
- return ret;
-}
-
-static enum prep_encoded_ret {
- PREP_ENCODED_OK,
- PREP_ENCODED_ERR,
- PREP_ENCODED_CHECKSUM_ERR,
- PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
struct bio *bio = &op->wbio.bio;
-
- if (!(op->flags & BCH_WRITE_DATA_ENCODED))
- return PREP_ENCODED_OK;
+ struct bch_csum csum;
+ int ret = 0;
BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
@@ -848,12 +897,13 @@ static enum prep_encoded_ret {
(op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
op->incompressible)) {
if (!crc_is_compressed(op->crc) &&
- op->csum_type != op->crc.csum_type &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ op->csum_type != op->crc.csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
- return PREP_ENCODED_DO_WRITE;
+ return 1;
}
/*
@@ -861,20 +911,24 @@ static enum prep_encoded_ret {
* is, we have to decompress it:
*/
if (crc_is_compressed(op->crc)) {
- struct bch_csum csum;
-
- if (bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
-
/* Last point we can still verify checksum: */
- csum = bch2_checksum_bio(c, op->crc.csum_type,
- extent_nonce(op->version, op->crc),
- bio);
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ goto csum_err;
+
+ if (bch2_csum_type_is_encryption(op->crc.csum_type)) {
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
+
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
- if (bch2_bio_uncompress_inplace(op, bio))
- return PREP_ENCODED_ERR;
+ ret = bch2_bio_uncompress_inplace(op, bio);
+ if (ret)
+ return ret;
}
/*
@@ -886,22 +940,44 @@ static enum prep_encoded_ret {
* If the data is checksummed and we're only writing a subset,
* rechecksum and adjust bio to point to currently live data:
*/
- if ((op->crc.live_size != op->crc.uncompressed_size ||
- op->crc.csum_type != op->csum_type) &&
- bch2_write_rechecksum(c, op, op->csum_type) &&
- !c->opts.no_data_io)
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (op->crc.live_size != op->crc.uncompressed_size ||
+ op->crc.csum_type != op->csum_type) {
+ ret = bch2_write_rechecksum(c, op, op->csum_type);
+ if (ret)
+ return ret;
+ }
/*
* If we want to compress the data, it has to be decrypted:
*/
- if ((op->compression_opt ||
- bch2_csum_type_is_encryption(op->crc.csum_type) !=
- bch2_csum_type_is_encryption(op->csum_type)) &&
- bch2_write_decrypt(op))
- return PREP_ENCODED_CHECKSUM_ERR;
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) &&
+ (op->compression_opt || op->crc.csum_type != op->csum_type)) {
+ struct nonce nonce = extent_nonce(op->version, op->crc);
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio);
+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+ goto csum_err;
- return PREP_ENCODED_OK;
+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio);
+ if (ret)
+ return ret;
+
+ op->crc.csum_type = 0;
+ op->crc.csum = (struct bch_csum) { 0, 0 };
+ }
+
+ return 0;
+csum_err:
+ bch2_write_op_error(op, op->pos.offset,
+ "error verifying existing checksum while moving existing data (memory corruption?)\n"
+ " expected %0llx:%0llx got %0llx:%0llx type %s",
+ op->crc.csum.hi,
+ op->crc.csum.lo,
+ csum.hi,
+ csum.lo,
+ op->crc.csum_type < BCH_CSUM_NR
+ ? __bch2_csum_types[op->crc.csum_type]
+ : "(unknown)");
+ return bch_err_throw(c, data_write_csum);
}
static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
@@ -916,43 +992,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
bool page_alloc_failed = false;
int ret, more = 0;
+ if (op->incompressible)
+ op->compression_opt = 0;
+
BUG_ON(!bio_sectors(src));
ec_buf = bch2_writepoint_ec_buf(c, wp);
- switch (bch2_write_prep_encoded_data(op, wp)) {
- case PREP_ENCODED_OK:
- break;
- case PREP_ENCODED_ERR:
- ret = -EIO;
- goto err;
- case PREP_ENCODED_CHECKSUM_ERR:
- goto csum_err;
- case PREP_ENCODED_DO_WRITE:
- /* XXX look for bug here */
- if (ec_buf) {
- dst = bch2_write_bio_alloc(c, wp, src,
- &page_alloc_failed,
- ec_buf);
- bio_copy_data(dst, src);
- bounce = true;
+ if (unlikely(op->flags & BCH_WRITE_data_encoded)) {
+ ret = bch2_write_prep_encoded_data(op, wp);
+ if (ret < 0)
+ goto err;
+ if (ret) {
+ if (ec_buf) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bio_copy_data(dst, src);
+ bounce = true;
+ }
+ init_append_extent(op, wp, op->version, op->crc);
+ goto do_write;
}
- init_append_extent(op, wp, op->version, op->crc);
- goto do_write;
}
if (ec_buf ||
op->compression_opt ||
(op->csum_type &&
- !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+ !(op->flags & BCH_WRITE_pages_stable)) ||
(bch2_csum_type_is_encryption(op->csum_type) &&
- !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+ !(op->flags & BCH_WRITE_pages_owned))) {
dst = bch2_write_bio_alloc(c, wp, src,
&page_alloc_failed,
ec_buf);
bounce = true;
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+ unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio);
+ if (!bounce && write_corrupt_ratio) {
+ dst = bch2_write_bio_alloc(c, wp, src,
+ &page_alloc_failed,
+ ec_buf);
+ bounce = true;
+ }
+#endif
saved_iter = dst->bi_iter;
do {
@@ -966,7 +1050,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
break;
BUG_ON(op->compression_opt &&
- (op->flags & BCH_WRITE_DATA_ENCODED) &&
+ (op->flags & BCH_WRITE_data_encoded) &&
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_opt && !bounce);
@@ -1004,7 +1088,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
}
}
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+ if ((op->flags & BCH_WRITE_data_encoded) &&
!crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
@@ -1022,12 +1106,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
* data can't be modified (by userspace) while it's in
* flight.
*/
- if (bch2_rechecksum_bio(c, src, version, op->crc,
+ ret = bch2_rechecksum_bio(c, src, version, op->crc,
&crc, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->csum_type))
- goto csum_err;
+ op->csum_type);
+ if (ret)
+ goto err;
/*
* rchecksum_bio sets compression_type on crc from op->crc,
* this isn't always correct as sometimes we're changing
@@ -1036,13 +1121,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
crc.compression_type = compression_type;
crc.nonce = nonce;
} else {
- if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- bch2_rechecksum_bio(c, src, version, op->crc,
+ if ((op->flags & BCH_WRITE_data_encoded) &&
+ (ret = bch2_rechecksum_bio(c, src, version, op->crc,
NULL, &op->crc,
src_len >> 9,
bio_sectors(src) - (src_len >> 9),
- op->crc.csum_type))
- goto csum_err;
+ op->crc.csum_type)))
+ goto err;
crc.compressed_size = dst_len >> 9;
crc.uncompressed_size = src_len >> 9;
@@ -1062,6 +1147,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
init_append_extent(op, wp, version, crc);
+#ifdef CONFIG_BCACHEFS_DEBUG
+ if (write_corrupt_ratio) {
+ swap(dst->bi_iter.bi_size, dst_len);
+ bch2_maybe_corrupt_bio(dst, write_corrupt_ratio);
+ swap(dst->bi_iter.bi_size, dst_len);
+ }
+#endif
+
if (dst != src)
bio_advance(dst, dst_len);
bio_advance(src, src_len);
@@ -1093,16 +1186,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
do_write:
*_dst = dst;
return more;
-csum_err:
- {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- ret = -EIO;
err:
if (to_wbio(dst)->bounce)
bch2_bio_free_pages_pool(c, dst);
@@ -1126,16 +1209,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
e = bkey_s_c_to_extent(k);
- rcu_read_lock();
+ guard(rcu)();
extent_for_each_ptr_decode(e, p, entry) {
- if (crc_is_encoded(p.crc) || p.has_ec) {
- rcu_read_unlock();
+ if (crc_is_encoded(p.crc) || p.has_ec)
return false;
- }
replicas += bch2_extent_ptr_durability(c, &p);
}
- rcu_read_unlock();
return replicas >= op->opts.data_replicas;
}
@@ -1180,39 +1260,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
+ int ret = 0;
for_each_keylist_key(&op->insert_keys, orig) {
- int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
+ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
-
- if (ret && !bch2_err_matches(ret, EROFS)) {
- struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
-
- struct printbuf buf = PRINTBUF;
- __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
- prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
-
- if (ret) {
- op->error = ret;
+ if (ret)
break;
- }
}
bch2_trans_put(trans);
+
+ if (ret && !bch2_err_matches(ret, EROFS)) {
+ struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+ bch2_write_op_error(op, bkey_start_offset(&insert->k),
+ "btree update error: %s", bch2_err_str(ret));
+ }
+
+ if (ret)
+ op->error = ret;
}
static void __bch2_nocow_write_done(struct bch_write_op *op)
{
- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
- op->error = -EIO;
- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+ if (unlikely(op->flags & BCH_WRITE_io_error)) {
+ op->error = bch_err_throw(op->c, data_write_io);
+ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten))
bch2_nocow_write_convert_unwritten(op);
}
@@ -1241,7 +1318,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct bucket_to_lock *stale_at;
int stale, ret;
- if (op->flags & BCH_WRITE_MOVE)
+ if (op->flags & BCH_WRITE_move)
return;
darray_init(&buckets);
@@ -1265,7 +1342,7 @@ retry:
if (ret)
break;
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
break;
@@ -1284,7 +1361,8 @@ retry:
/* Get iorefs before dropping btree locks: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
+ BCH_DEV_WRITE_REF_io_write);
if (unlikely(!ca))
goto err_get_ioref;
@@ -1299,7 +1377,7 @@ retry:
}), GFP_KERNEL|__GFP_NOFAIL);
if (ptr->unwritten)
- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+ op->flags |= BCH_WRITE_convert_unwritten;
}
/* Unlock before taking nocow locks, doing IO: */
@@ -1307,7 +1385,7 @@ retry:
bch2_trans_unlock(trans);
bch2_cut_front(op->pos, op->insert_keys.top);
- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+ if (op->flags & BCH_WRITE_convert_unwritten)
bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
darray_for_each(buckets, i) {
@@ -1332,7 +1410,7 @@ retry:
wbio_init(bio)->put_bio = true;
bio->bi_opf = op->wbio.bio.bi_opf;
} else {
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
}
op->pos.offset += bio_sectors(bio);
@@ -1342,13 +1420,14 @@ retry:
bio->bi_private = &op->cl;
bio->bi_opf |= REQ_OP_WRITE;
closure_get(&op->cl);
+
bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
op->insert_keys.top, true);
bch2_keylist_push(&op->insert_keys);
- if (op->flags & BCH_WRITE_SUBMITTED)
+ if (op->flags & BCH_WRITE_submitted)
break;
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
}
out:
bch2_trans_iter_exit(trans, &iter);
@@ -1360,21 +1439,18 @@ err:
darray_exit(&buckets);
if (ret) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
op->error = ret;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
}
/* fallback to cow write path? */
- if (!(op->flags & BCH_WRITE_SUBMITTED)) {
+ if (!(op->flags & BCH_WRITE_submitted)) {
closure_sync(&op->cl);
__bch2_nocow_write_done(op);
op->insert_keys.top = op->insert_keys.keys;
- } else if (op->flags & BCH_WRITE_SYNC) {
+ } else if (op->flags & BCH_WRITE_sync) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
@@ -1388,7 +1464,8 @@ err:
return;
err_get_ioref:
darray_for_each(buckets, i)
- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref);
+ enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_io_write);
/* Fall back to COW path: */
goto out;
@@ -1404,10 +1481,10 @@ err_bucket_stale:
"pointer to invalid bucket in nocow path on device %llu\n %s",
stale_at->b.inode,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
- ret = -EIO;
+ ret = bch_err_throw(c, data_write_invalid_ptr);
} else {
/* We can retry this: */
- ret = -BCH_ERR_transaction_restart;
+ ret = bch_err_throw(c, transaction_restart);
}
printbuf_exit(&buf);
@@ -1426,7 +1503,7 @@ static void __bch2_write(struct bch_write_op *op)
if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
bch2_nocow_write(op);
- if (op->flags & BCH_WRITE_SUBMITTED)
+ if (op->flags & BCH_WRITE_submitted)
goto out_nofs_restore;
}
again:
@@ -1456,7 +1533,7 @@ again:
ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached),
op->write_point,
&op->devs_have,
op->nr_replicas,
@@ -1479,16 +1556,12 @@ again:
bch2_alloc_sectors_done_inlined(c, wp);
err:
if (ret <= 0) {
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_submitted;
if (unlikely(ret < 0)) {
- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
- bch_err_ratelimited(c, "%s", buf.buf);
- printbuf_exit(&buf);
- }
+ if (!(op->flags & BCH_WRITE_alloc_nowait))
+ bch2_write_op_error(op, op->pos.offset,
+ "%s(): %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1514,14 +1587,14 @@ err:
* synchronously here if we weren't able to submit all of the IO at
* once, as that signals backpressure to the caller.
*/
- if ((op->flags & BCH_WRITE_SYNC) ||
- (!(op->flags & BCH_WRITE_SUBMITTED) &&
- !(op->flags & BCH_WRITE_IN_WORKER))) {
+ if ((op->flags & BCH_WRITE_sync) ||
+ (!(op->flags & BCH_WRITE_submitted) &&
+ !(op->flags & BCH_WRITE_in_worker))) {
bch2_wait_on_allocator(c, &op->cl);
__bch2_write_index(op);
- if (!(op->flags & BCH_WRITE_SUBMITTED))
+ if (!(op->flags & BCH_WRITE_submitted))
goto again;
bch2_write_done(&op->cl);
} else {
@@ -1542,8 +1615,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
memset(&op->failed, 0, sizeof(op->failed));
- op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
- op->flags |= BCH_WRITE_SUBMITTED;
+ op->flags |= BCH_WRITE_wrote_data_inline;
+ op->flags |= BCH_WRITE_submitted;
bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
@@ -1606,8 +1679,10 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- op->flags |= BCH_WRITE_ALLOC_NOWAIT;
+ async_object_list_add(c, write_op, op, &op->list_idx);
+
+ if (op->flags & BCH_WRITE_only_specified_devs)
+ op->flags |= BCH_WRITE_alloc_nowait;
op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
@@ -1615,26 +1690,24 @@ CLOSURE_CALLBACK(bch2_write)
wbio_init(bio)->put_bio = false;
if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
- struct printbuf buf = PRINTBUF;
- bch2_write_op_error(&buf, op);
- prt_printf(&buf, "misaligned write");
- printbuf_exit(&buf);
- op->error = -EIO;
+ bch2_write_op_error(op, op->pos.offset, "misaligned write");
+ op->error = bch_err_throw(c, data_write_misaligned);
goto err;
}
if (c->opts.nochanges) {
- op->error = -BCH_ERR_erofs_no_writes;
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
- if (!(op->flags & BCH_WRITE_MOVE) &&
- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
- op->error = -BCH_ERR_erofs_no_writes;
+ if (!(op->flags & BCH_WRITE_move) &&
+ !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) {
+ op->error = bch_err_throw(c, erofs_no_writes);
goto err;
}
- this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+ if (!(op->flags & BCH_WRITE_move))
+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
bch2_increment_clock(c, bio_sectors(bio), WRITE);
data_len = min_t(u64, bio->bi_iter.bi_size,
@@ -1652,6 +1725,7 @@ err:
bch2_disk_reservation_put(c, &op->res);
closure_debug_destroy(&op->cl);
+ async_object_list_del(c, write_op, op->list_idx);
if (op->end_io)
op->end_io(op);
}
@@ -1665,27 +1739,33 @@ static const char * const bch2_write_flags[] = {
void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{
- prt_str(out, "pos: ");
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
+
+ prt_printf(out, "pos:\t");
bch2_bpos_to_text(out, op->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_str(out, "started: ");
+ prt_printf(out, "started:\t");
bch2_pr_time_units(out, local_clock() - op->start_time);
prt_newline(out);
- prt_str(out, "flags: ");
+ prt_printf(out, "flags:\t");
prt_bitflags(out, bch2_write_flags, op->flags);
prt_newline(out);
- prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas);
+ prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required);
+
+ prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl));
+ prt_printf(out, "ret\t%s\n", bch2_err_str(op->error));
printbuf_indent_sub(out, 2);
}
void bch2_fs_io_write_exit(struct bch_fs *c)
{
- mempool_exit(&c->bio_bounce_pages);
bioset_exit(&c->replica_set);
bioset_exit(&c->bio_write);
}
@@ -1694,14 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c)
{
if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) ||
bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0))
- return -BCH_ERR_ENOMEM_bio_write_init;
-
- if (mempool_init_page_pool(&c->bio_bounce_pages,
- max_t(unsigned,
- c->opts.btree_node_size,
- c->opts.encoded_extent_max) /
- PAGE_SIZE, 0))
- return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+ return bch_err_throw(c, ENOMEM_bio_write_init);
return 0;
}
diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h
index b4626013abc8..2c0a8f35ee1f 100644
--- a/fs/bcachefs/io_write.h
+++ b/fs/bcachefs/io_write.h
@@ -11,45 +11,11 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *, bool);
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
-
-#define BCH_WRITE_FLAGS() \
- x(ALLOC_NOWAIT) \
- x(CACHED) \
- x(DATA_ENCODED) \
- x(PAGES_STABLE) \
- x(PAGES_OWNED) \
- x(ONLY_SPECIFIED_DEVS) \
- x(WROTE_DATA_INLINE) \
- x(FROM_INTERNAL) \
- x(CHECK_ENOSPC) \
- x(SYNC) \
- x(MOVE) \
- x(IN_WORKER) \
- x(SUBMITTED) \
- x(IO_ERROR) \
- x(CONVERT_UNWRITTEN)
-
-enum __bch_write_flags {
-#define x(f) __BCH_WRITE_##f,
- BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
- BCH_WRITE_FLAGS()
-#undef x
-};
+__printf(3, 4)
+void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...);
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h
index 6e878a6f2f0b..5da4eb8bb6f6 100644
--- a/fs/bcachefs/io_write_types.h
+++ b/fs/bcachefs/io_write_types.h
@@ -13,6 +13,34 @@
#include <linux/llist.h>
#include <linux/workqueue.h>
+#define BCH_WRITE_FLAGS() \
+ x(alloc_nowait) \
+ x(cached) \
+ x(data_encoded) \
+ x(pages_stable) \
+ x(pages_owned) \
+ x(only_specified_devs) \
+ x(wrote_data_inline) \
+ x(check_enospc) \
+ x(sync) \
+ x(move) \
+ x(in_worker) \
+ x(submitted) \
+ x(io_error) \
+ x(convert_unwritten)
+
+enum __bch_write_flags {
+#define x(f) __BCH_WRITE_##f,
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+ BCH_WRITE_FLAGS()
+#undef x
+};
+
struct bch_write_bio {
struct_group(wbio,
struct bch_fs *c;
@@ -43,6 +71,10 @@ struct bch_write_op {
void (*end_io)(struct bch_write_op *);
u64 start_time;
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
+ unsigned list_idx;
+#endif
+
unsigned written; /* sectors */
u16 flags;
s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
@@ -64,7 +96,7 @@ struct bch_write_op {
struct bpos pos;
struct bversion version;
- /* For BCH_WRITE_DATA_ENCODED: */
+ /* For BCH_WRITE_data_encoded: */
struct bch_extent_crc_unpacked crc;
struct write_point_specifier write_point;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index cb2c3722f674..ddfeb0dafc9d 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -12,6 +12,7 @@
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
@@ -20,13 +21,6 @@
#include "journal_seq_blacklist.h"
#include "trace.h"
-static const char * const bch2_journal_errors[] = {
-#define x(n) #n,
- JOURNAL_ERRORS()
-#undef x
- NULL
-};
-
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -56,14 +50,20 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
prt_printf(out, "seq:\t%llu\n", seq);
printbuf_indent_add(out, 2);
- prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i));
+ if (!buf->write_started)
+ prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK));
- prt_printf(out, "size:\t");
- prt_human_readable_u64(out, vstruct_bytes(buf->data));
- prt_newline(out);
+ struct closure *cl = &buf->io;
+ int r = atomic_read(&cl->remaining);
+ prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK);
- prt_printf(out, "expires:\t");
- prt_printf(out, "%li jiffies\n", buf->expires - jiffies);
+ if (buf->data) {
+ prt_printf(out, "size:\t");
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+ }
+
+ prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies);
prt_printf(out, "flags:\t");
if (buf->noflush)
@@ -87,6 +87,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6
static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
{
+ lockdep_assert_held(&j->lock);
+ out->atomic++;
+
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
@@ -95,6 +98,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
seq++)
bch2_journal_buf_to_text(out, j, seq);
prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
+
+ --out->atomic;
}
static inline struct journal_buf *
@@ -104,10 +109,8 @@ journal_seq_to_buf(struct journal *j, u64 seq)
EBUG_ON(seq > journal_cur_seq(j));
- if (journal_seq_unwritten(j, seq)) {
+ if (journal_seq_unwritten(j, seq))
buf = j->buf + (seq & JOURNAL_BUF_MASK);
- EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
- }
return buf;
}
@@ -139,8 +142,10 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
bool stuck = false;
struct printbuf buf = PRINTBUF;
- if (!(error == JOURNAL_ERR_journal_full ||
- error == JOURNAL_ERR_journal_pin_full) ||
+ buf.atomic++;
+
+ if (!(error == -BCH_ERR_journal_full ||
+ error == -BCH_ERR_journal_pin_full) ||
nr_unwritten_journal_entries(j) ||
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim)
return stuck;
@@ -164,12 +169,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
return stuck;
}
j->err_seq = journal_cur_seq(j);
- spin_unlock(&j->lock);
- bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)",
- bch2_journal_errors[error]);
- bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "%s", buf.buf);
+ __bch2_journal_debug_to_text(&buf, j);
+ spin_unlock(&j->lock);
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"),
+ bch2_err_str(error));
+ bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_reset(&buf);
bch2_journal_pins_to_text(&buf, j);
@@ -195,7 +200,8 @@ void bch2_journal_do_writes(struct journal *j)
if (w->write_started)
continue;
- if (!journal_state_count(j->reservations, idx)) {
+ if (!journal_state_seq_count(j, j->reservations, seq)) {
+ j->seq_write_started = seq;
w->write_started = true;
closure_call(&w->io, bch2_journal_write, j->wq, NULL);
}
@@ -276,7 +282,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
- BUG_ON(sectors > buf->sectors);
+ if (unlikely(sectors > buf->sectors)) {
+ struct printbuf err = PRINTBUF;
+ err.atomic++;
+
+ prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
+ sectors, buf->sectors);
+ prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
+ le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
+ j->cur_entry_u64s,
+ c->block_bits);
+ prt_printf(&err, "fatal error - emergency read only");
+ bch2_journal_halt_locked(j);
+
+ bch_err(c, "%s", err.buf);
+ printbuf_exit(&err);
+ return;
+ }
+
buf->sectors = sectors;
/*
@@ -306,16 +329,23 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
bch2_journal_space_available(j);
- __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
+ __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq));
}
-void bch2_journal_halt(struct journal *j)
+void bch2_journal_halt_locked(struct journal *j)
{
- spin_lock(&j->lock);
+ lockdep_assert_held(&j->lock);
+
__journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
+}
+
+void bch2_journal_halt(struct journal *j)
+{
+ spin_lock(&j->lock);
+ bch2_journal_halt_locked(j);
spin_unlock(&j->lock);
}
@@ -367,26 +397,41 @@ static int journal_entry_open(struct journal *j)
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
if (j->blocked)
- return JOURNAL_ERR_blocked;
+ return bch_err_throw(c, journal_blocked);
if (j->cur_entry_error)
return j->cur_entry_error;
- if (bch2_journal_error(j))
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ int ret = bch2_journal_error(j);
+ if (unlikely(ret))
+ return ret;
if (!fifo_free(&j->pin))
- return JOURNAL_ERR_journal_pin_full;
+ return bch_err_throw(c, journal_pin_full);
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf))
- return JOURNAL_ERR_max_in_flight;
+ return bch_err_throw(c, journal_max_in_flight);
+
+ if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR)
+ return bch_err_throw(c, journal_max_open);
+
+ if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) {
+ bch_err(c, "cannot start: journal seq overflow");
+ if (bch2_fs_emergency_read_only_locked(c))
+ bch_err(c, "fatal error - emergency read only");
+ return bch_err_throw(c, journal_shutdown);
+ }
- if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX,
- c, "cannot start: journal seq overflow"))
- return JOURNAL_ERR_insufficient_devices; /* -EROFS */
+ if (!j->free_buf && !buf->data)
+ return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */
BUG_ON(!j->cur_entry_sectors);
+ if (!buf->data) {
+ swap(buf->data, j->free_buf);
+ swap(buf->buf_size, j->free_buf_size);
+ }
+
buf->expires =
(journal_cur_seq(j) == j->flushed_seq_ondisk
? jiffies
@@ -402,7 +447,7 @@ static int journal_entry_open(struct journal *j)
u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= (ssize_t) j->early_journal_entries.nr)
- return JOURNAL_ERR_journal_full;
+ return bch_err_throw(c, journal_full);
if (fifo_empty(&j->pin) && j->reclaim_thread)
wake_up_process(j->reclaim_thread);
@@ -414,6 +459,14 @@ static int journal_entry_open(struct journal *j)
atomic64_inc(&j->seq);
journal_pin_list_init(fifo_push_ref(&j->pin), 1);
+ if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) {
+ bch_err(c, "attempting to open blacklisted journal seq %llu",
+ journal_cur_seq(j));
+ if (bch2_fs_emergency_read_only_locked(c))
+ bch_err(c, "fatal error - emergency read only");
+ return bch_err_throw(c, journal_shutdown);
+ }
+
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
@@ -451,7 +504,7 @@ static int journal_entry_open(struct journal *j)
new.idx++;
BUG_ON(journal_state_count(new, new.idx));
- BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK));
+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK));
journal_state_inc(&new);
@@ -501,6 +554,33 @@ static void journal_write_work(struct work_struct *work)
spin_unlock(&j->lock);
}
+static void journal_buf_prealloc(struct journal *j)
+{
+ if (j->free_buf &&
+ j->free_buf_size >= j->buf_size_want)
+ return;
+
+ unsigned buf_size = j->buf_size_want;
+
+ spin_unlock(&j->lock);
+ void *buf = kvmalloc(buf_size, GFP_NOFS);
+ spin_lock(&j->lock);
+
+ if (buf &&
+ (!j->free_buf ||
+ buf_size > j->free_buf_size)) {
+ swap(buf, j->free_buf);
+ swap(buf_size, j->free_buf_size);
+ }
+
+ if (unlikely(buf)) {
+ spin_unlock(&j->lock);
+ /* kvfree can sleep */
+ kvfree(buf);
+ spin_lock(&j->lock);
+ }
+}
+
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned flags)
{
@@ -512,25 +592,28 @@ retry:
if (journal_res_get_fast(j, res, flags))
return 0;
- if (bch2_journal_error(j))
- return -BCH_ERR_erofs_journal_err;
+ ret = bch2_journal_error(j);
+ if (unlikely(ret))
+ return ret;
if (j->blocked)
- return -BCH_ERR_journal_res_get_blocked;
+ return bch_err_throw(c, journal_blocked);
if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
- ret = JOURNAL_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
can_discard = j->can_discard;
goto out;
}
if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
- ret = JOURNAL_ERR_max_in_flight;
+ ret = bch_err_throw(c, journal_max_in_flight);
goto out;
}
spin_lock(&j->lock);
+ journal_buf_prealloc(j);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call bch2_journal_entry_close()
@@ -553,25 +636,48 @@ retry:
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
- ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
+ ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open;
unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
out:
- if (ret == JOURNAL_ERR_retry)
- goto retry;
- if (!ret)
+ if (likely(!ret))
return 0;
+ if (ret == -BCH_ERR_journal_retry_open)
+ goto retry;
if (journal_error_check_stuck(j, ret, flags))
- ret = -BCH_ERR_journal_res_get_blocked;
+ ret = bch_err_throw(c, journal_stuck);
+
+ if (ret == -BCH_ERR_journal_max_in_flight &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) &&
+ trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
- if (ret == JOURNAL_ERR_max_in_flight &&
- track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+ bch2_printbuf_make_room(&buf, 4096);
+ spin_lock(&j->lock);
+ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+ bch2_journal_bufs_to_text(&buf, j);
+ spin_unlock(&j->lock);
+
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ count_event(c, journal_entry_full);
+ }
+
+ if (ret == -BCH_ERR_journal_max_open &&
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) &&
+ trace_journal_entry_full_enabled()) {
struct printbuf buf = PRINTBUF;
+
+ bch2_printbuf_make_room(&buf, 4096);
+
+ spin_lock(&j->lock);
prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
bch2_journal_bufs_to_text(&buf, j);
+ spin_unlock(&j->lock);
+
trace_journal_entry_full(c, buf.buf);
printbuf_exit(&buf);
count_event(c, journal_entry_full);
@@ -581,8 +687,8 @@ out:
* Journal is full - can't rely on reclaim from work item due to
* freezing:
*/
- if ((ret == JOURNAL_ERR_journal_full ||
- ret == JOURNAL_ERR_journal_pin_full) &&
+ if ((ret == -BCH_ERR_journal_full ||
+ ret == -BCH_ERR_journal_pin_full) &&
!(flags & JOURNAL_RES_GET_NONBLOCK)) {
if (can_discard) {
bch2_journal_do_discards(j);
@@ -595,16 +701,15 @@ out:
}
}
- return ret == JOURNAL_ERR_insufficient_devices
- ? -BCH_ERR_erofs_journal_err
- : -BCH_ERR_journal_res_get_blocked;
+ return ret;
}
static unsigned max_dev_latency(struct bch_fs *c)
{
u64 nsecs = 0;
- for_each_rw_member(c, ca)
+ guard(rcu)();
+ for_each_rw_member_rcu(c, ca)
nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
return nsecs_to_jiffies(nsecs);
@@ -627,7 +732,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
int ret;
if (closure_wait_event_timeout(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK),
HZ))
return ret;
@@ -641,19 +746,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
remaining_wait = max(0, remaining_wait - HZ);
if (closure_wait_event_timeout(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK),
remaining_wait))
return ret;
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
- bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
- buf.buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret));
printbuf_exit(&buf);
closure_wait_event(&j->async_wait,
- (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
@@ -674,7 +779,6 @@ void bch2_journal_entry_res_resize(struct journal *j,
goto out;
j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
- smp_mb();
state = READ_ONCE(j->reservations);
if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
@@ -708,6 +812,7 @@ out:
int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
struct closure *parent)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret = 0;
@@ -723,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
/* Recheck under lock: */
if (j->err_seq && seq >= j->err_seq) {
- ret = -BCH_ERR_journal_flush_err;
+ ret = bch_err_throw(c, journal_flush_err);
goto out;
}
@@ -783,6 +888,7 @@ recheck_need_open:
}
buf->must_flush = true;
+ j->flushing_seq = max(j->flushing_seq, seq);
if (parent && !closure_wait(&buf->wait, parent))
BUG();
@@ -892,11 +998,11 @@ int bch2_journal_meta(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal))
- return -EROFS;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal))
+ return bch_err_throw(c, erofs_no_writes);
int ret = __bch2_journal_meta(j);
- bch2_write_ref_put(c, BCH_WRITE_REF_journal);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal);
return ret;
}
@@ -937,7 +1043,8 @@ static void __bch2_journal_block(struct journal *j)
new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL;
} while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v));
- journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL)
+ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset);
}
}
@@ -975,10 +1082,11 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou
if (open && !*blocked) {
__bch2_journal_block(j);
+ s.v = atomic64_read_acquire(&j->reservations.counter);
*blocked = true;
}
- ret = journal_state_count(s, idx) > open
+ ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open
? ERR_PTR(-EAGAIN)
: buf;
break;
@@ -1007,8 +1115,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j,
/* allocate journal on a device: */
-static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
- bool new_fs, struct closure *cl)
+static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr,
+ bool new_fs, struct closure *cl)
{
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
@@ -1025,7 +1133,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL);
new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL);
if (!bu || !ob || !new_buckets || !new_bucket_seq) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
goto err_free;
}
@@ -1136,26 +1244,20 @@ err_free:
return ret;
}
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
- unsigned nr)
+static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr, bool new_fs)
{
struct journal_device *ja = &ca->journal;
- struct closure cl;
int ret = 0;
+ struct closure cl;
closure_init_stack(&cl);
- down_write(&c->state_lock);
-
/* don't handle reducing nr of buckets yet: */
if (nr < ja->nr)
- goto unlock;
+ return 0;
- while (ja->nr < nr) {
+ while (!ret && ja->nr < nr) {
struct disk_reservation disk_res = { 0, 0, 0 };
/*
@@ -1168,35 +1270,118 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
* filesystem-wide allocation will succeed, this is a device
* specific allocation - we can hang here:
*/
+ if (!new_fs) {
+ ret = bch2_disk_reservation_get(c, &disk_res,
+ bucket_to_sector(ca, nr - ja->nr), 1, 0);
+ if (ret)
+ break;
+ }
- ret = bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0);
- if (ret)
- break;
+ ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl);
- ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+ if (ret == -BCH_ERR_bucket_alloc_blocked ||
+ ret == -BCH_ERR_open_buckets_empty)
+ ret = 0; /* wait and retry */
bch2_disk_reservation_put(c, &disk_res);
+ bch2_wait_on_allocator(c, &cl);
+ }
- closure_sync(&cl);
+ return ret;
+}
- if (ret && ret != -BCH_ERR_bucket_alloc_blocked)
- break;
- }
+/*
+ * Allocate more journal space at runtime - not currently making use if it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+ unsigned nr)
+{
+ down_write(&c->state_lock);
+ int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false);
+ up_write(&c->state_lock);
bch_err_fn(c, ret);
-unlock:
- up_write(&c->state_lock);
return ret;
}
+int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b)
+{
+ struct bch_fs *c = ca->fs;
+ struct journal *j = &c->journal;
+ struct journal_device *ja = &ca->journal;
+
+ guard(mutex)(&c->sb_lock);
+ unsigned pos;
+ for (pos = 0; pos < ja->nr; pos++)
+ if (ja->buckets[pos] == b)
+ break;
+
+ if (pos == ja->nr) {
+ bch_err(ca, "journal bucket %llu not found when deleting", b);
+ return -EINVAL;
+ }
+
+ u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);;
+ if (!new_buckets)
+ return bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
+
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
+ memmove(&new_buckets[pos],
+ &new_buckets[pos + 1],
+ (ja->nr - 1 - pos) * sizeof(new_buckets[0]));
+
+ int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?:
+ bch2_write_super(c);
+ if (ret) {
+ kfree(new_buckets);
+ return ret;
+ }
+
+ scoped_guard(spinlock, &j->lock) {
+ if (pos < ja->discard_idx)
+ --ja->discard_idx;
+ if (pos < ja->dirty_idx_ondisk)
+ --ja->dirty_idx_ondisk;
+ if (pos < ja->dirty_idx)
+ --ja->dirty_idx;
+ if (pos < ja->cur_idx)
+ --ja->cur_idx;
+
+ ja->nr--;
+
+ memmove(&ja->buckets[pos],
+ &ja->buckets[pos + 1],
+ (ja->nr - pos) * sizeof(ja->buckets[0]));
+
+ memmove(&ja->bucket_seq[pos],
+ &ja->bucket_seq[pos + 1],
+ (ja->nr - pos) * sizeof(ja->bucket_seq[0]));
+
+ bch2_journal_space_available(j);
+ }
+
+ kfree(new_buckets);
+ return 0;
+}
+
int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
{
+ struct bch_fs *c = ca->fs;
+
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal)))
+ return 0;
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_err(c, "cannot allocate journal, filesystem is an unresized image file");
+ return bch_err_throw(c, erofs_filesystem_full);
+ }
+
unsigned nr;
int ret;
if (dynamic_fault("bcachefs:add:journal_alloc")) {
- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets;
+ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets);
goto err;
}
@@ -1212,7 +1397,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs)
min(1 << 13,
(1 << 24) / ca->mi.bucket_size));
- ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL);
+ ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs);
err:
bch_err_fn(ca, ret);
return ret;
@@ -1220,13 +1405,14 @@ err:
int bch2_fs_journal_alloc(struct bch_fs *c)
{
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) {
if (ca->journal.nr)
continue;
int ret = bch2_dev_journal_alloc(ca, true);
if (ret) {
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_fs_journal_alloc);
return ret;
}
}
@@ -1289,50 +1475,58 @@ void bch2_fs_journal_stop(struct journal *j)
clear_bit(JOURNAL_running, &j->flags);
}
-int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
+int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
- u64 last_seq = cur_seq, nr, seq;
+
+ /*
+ *
+ * XXX pick most recent non blacklisted sequence number
+ */
+
+ cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c));
if (cur_seq >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
return -EINVAL;
}
- genradix_for_each_reverse(&c->journal_entries, iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
+ /* Clean filesystem? */
+ if (!last_seq)
+ last_seq = cur_seq;
- last_seq = le64_to_cpu(i->j.last_seq);
- break;
- }
+ u64 nr = cur_seq - last_seq;
- nr = cur_seq - last_seq;
+ /*
+ * Extra fudge factor, in case we crashed when the journal pin fifo was
+ * nearly or completely full. We'll need to be able to open additional
+ * journal entries (at least a few) in order for journal replay to get
+ * going:
+ */
+ nr += nr / 4;
- if (nr + 1 > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
- }
+ nr = max(nr, JOURNAL_PIN);
+ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+ if (!j->pin.data) {
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+ return bch_err_throw(c, ENOMEM_journal_pin_fifo);
}
j->replay_journal_seq = last_seq;
j->replay_journal_seq_end = cur_seq;
j->last_seq_ondisk = last_seq;
j->flushed_seq_ondisk = cur_seq - 1;
+ j->seq_write_started = cur_seq - 1;
j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
+ u64 seq;
fifo_for_each_entry_ptr(p, &j->pin, seq)
journal_pin_list_init(p, 1);
@@ -1364,19 +1558,29 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
-
- set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
- j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
- j->reservations.unwritten_idx++;
+ j->reservations.idx = journal_cur_seq(j);
c->last_bucket_seq_cleanup = journal_cur_seq(j);
+ spin_unlock(&j->lock);
+
+ return 0;
+}
+void bch2_journal_set_replay_done(struct journal *j)
+{
+ /*
+ * journal_space_available must happen before setting JOURNAL_running
+ * JOURNAL_running must happen before JOURNAL_replay_done
+ */
+ spin_lock(&j->lock);
bch2_journal_space_available(j);
- spin_unlock(&j->lock);
- return bch2_journal_reclaim_start(j);
+ set_bit(JOURNAL_need_flush_write, &j->flags);
+ set_bit(JOURNAL_running, &j->flags);
+ set_bit(JOURNAL_replay_done, &j->flags);
+ spin_unlock(&j->lock);
}
/* init/exit: */
@@ -1398,6 +1602,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca)
int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
{
+ struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets =
bch2_sb_field_get(sb, journal);
@@ -1417,15 +1622,15 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->bucket_seq)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
- ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+ ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
nr_bvecs), GFP_KERNEL);
if (!ja->bio[i])
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
ja->bio[i]->ca = ca;
ja->bio[i]->buf_idx = i;
@@ -1434,7 +1639,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
if (!ja->buckets)
- return -BCH_ERR_ENOMEM_dev_journal_init;
+ return bch_err_throw(c, ENOMEM_dev_journal_init);
if (journal_buckets_v2) {
unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
@@ -1461,10 +1666,11 @@ void bch2_fs_journal_exit(struct journal *j)
for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
kvfree(j->buf[i].data);
+ kvfree(j->free_buf);
free_fifo(&j->pin);
}
-int bch2_fs_journal_init(struct journal *j)
+void bch2_fs_journal_init_early(struct journal *j)
{
static struct lock_class_key res_key;
@@ -1483,24 +1689,24 @@ int bch2_fs_journal_init(struct journal *j)
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+}
- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
- return -BCH_ERR_ENOMEM_journal_pin_fifo;
+int bch2_fs_journal_init(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
- for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
- j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
- if (!j->buf[i].data)
- return -BCH_ERR_ENOMEM_journal_buf;
- j->buf[i].idx = i;
- }
+ j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN;
+ j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL);
+ if (!j->free_buf)
+ return bch_err_throw(c, ENOMEM_journal_buf);
- j->pin.front = j->pin.back = 1;
+ for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+ j->buf[i].idx = i;
j->wq = alloc_workqueue("bcachefs_journal",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
if (!j->wq)
- return -BCH_ERR_ENOMEM_fs_other_alloc;
+ return bch_err_throw(c, ENOMEM_fs_other_alloc);
return 0;
}
@@ -1524,7 +1730,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
printbuf_tabstop_push(out, 28);
out->atomic++;
- rcu_read_lock();
+ guard(rcu)();
s = READ_ONCE(j->reservations);
prt_printf(out, "flags:\t");
@@ -1543,6 +1749,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "average write size:\t");
prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
prt_newline(out);
+ prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0);
prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked);
@@ -1550,7 +1757,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
prt_printf(out, "blocked:\t%u\n", j->blocked);
prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error));
prt_printf(out, "current entry:\t");
switch (s.cur_entry_offset) {
@@ -1614,8 +1821,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required);
- rcu_read_unlock();
-
--out->atomic;
}
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index dccddd5420ad..977907038d98 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j)
closure_wake_up(&j->async_wait);
}
-static inline struct journal_buf *journal_cur_buf(struct journal *j)
-{
- return j->buf + j->reservations.idx;
-}
-
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
@@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j)
return j->seq_ondisk + 1;
}
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
+{
+ unsigned idx = (journal_cur_seq(j) &
+ JOURNAL_BUF_MASK &
+ ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx;
+
+ return j->buf + idx;
+}
+
static inline int journal_state_count(union journal_res_state s, int idx)
{
switch (idx) {
@@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx)
BUG();
}
+static inline int journal_state_seq_count(struct journal *j,
+ union journal_res_state s, u64 seq)
+{
+ if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR)
+ return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK);
+ else
+ return 0;
+}
+
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
@@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
static inline struct jset_entry *
journal_res_entry(struct journal *j, struct journal_res *res)
{
- return vstruct_idx(j->buf[res->idx].data, res->offset);
+ return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset);
}
static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
@@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *);
void bch2_journal_do_writes(struct journal *);
void bch2_journal_buf_put_final(struct journal *, u64);
-static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+static inline void __bch2_journal_buf_put(struct journal *j, u64 seq)
{
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;
s = journal_state_buf_put(j, idx);
@@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
bch2_journal_buf_put_final(j, seq);
}
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+static inline void bch2_journal_buf_put(struct journal *j, u64 seq)
{
+ unsigned idx = seq & JOURNAL_STATE_BUF_MASK;
union journal_res_state s;
s = journal_state_buf_put(j, idx);
@@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j,
BCH_JSET_ENTRY_btree_keys,
0, 0, 0);
- bch2_journal_buf_put(j, res->idx, res->seq);
+ bch2_journal_buf_put(j, res->seq);
res->ref = 0;
}
@@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j,
/*
* Check if there is still room in the current journal
- * entry:
+ * entry, smp_rmb() guarantees that reads from reservations.counter
+ * occur before accessing cur_entry_u64s:
*/
+ smp_rmb();
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
@@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j,
&old.v, new.v));
res->ref = true;
- res->idx = old.idx;
res->offset = old.cur_entry_offset;
- res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
+ res->seq = journal_cur_seq(j);
+ res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK;
return 1;
}
@@ -390,6 +407,7 @@ out:
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,
NULL, _THIS_IP_);
EBUG_ON(!res->ref);
+ BUG_ON(!res->seq);
}
return 0;
}
@@ -408,6 +426,7 @@ int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64, u64);
int bch2_journal_meta(struct journal *);
+void bch2_journal_halt_locked(struct journal *);
void bch2_journal_halt(struct journal *);
static inline int bch2_journal_error(struct journal *j)
@@ -418,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
-static inline void bch2_journal_set_replay_done(struct journal *j)
-{
- BUG_ON(!test_bit(JOURNAL_running, &j->flags));
- set_bit(JOURNAL_replay_done, &j->flags);
-}
-
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
@@ -431,19 +444,22 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
- unsigned nr);
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned);
+int bch2_dev_journal_bucket_delete(struct bch_dev *, u64);
+
int bch2_dev_journal_alloc(struct bch_dev *, bool);
int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
-int bch2_fs_journal_start(struct journal *, u64);
+int bch2_fs_journal_start(struct journal *, u64, u64);
+void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
+void bch2_fs_journal_init_early(struct journal *);
int bch2_fs_journal_init(struct journal *);
#endif /* _BCACHEFS_JOURNAL_H */
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 11c39e0c34f4..9e028dbcc3d0 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -19,6 +19,7 @@
#include <linux/ioprio.h>
#include <linux/string_choices.h>
+#include <linux/sched/sysctl.h>
void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
{
@@ -48,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
}
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
+{
+ struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
+ prt_printf(out, "%s %u:%u:%u (sector %llu)",
+ ca ? ca->name : "(invalid dev)",
+ p->dev, p->bucket, p->bucket_offset, p->sector);
+ bch2_dev_put(ca);
+}
+
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
{
darray_for_each(j->ptrs, i) {
if (i != j->ptrs.data)
prt_printf(out, " ");
- prt_printf(out, "%u:%u:%u (sector %llu)",
- i->dev, i->bucket, i->bucket_offset, i->sector);
+ bch2_journal_ptr_to_text(out, c, i);
}
}
-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
- struct journal_replay *j)
+static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
{
- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
-
- bch2_journal_ptrs_to_text(out, c, j);
-
- for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+ for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
struct jset_entry_datetime *datetime =
container_of(entry, struct jset_entry_datetime, entry);
bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
@@ -74,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+ struct journal_replay *j)
+{
+ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+ bch2_journal_datetime_to_text(out, &j->j);
+ prt_char(out, ' ');
+ bch2_journal_ptrs_to_text(out, c, j);
+}
+
static struct nonce journal_nonce(const struct jset *jset)
{
return (struct nonce) {{
@@ -148,6 +160,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
struct printbuf buf = PRINTBUF;
int ret = JOURNAL_ENTRY_ADD_OK;
+ if (last_seq && c->opts.journal_rewind)
+ last_seq = min(last_seq, c->opts.journal_rewind);
+
if (!c->journal.oldest_seq_found_ondisk ||
le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);
@@ -187,7 +202,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
GFP_KERNEL);
if (!_i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
/*
* Duplicate journal entries? If so we want the one that didn't have a
@@ -214,12 +229,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
fsck_err_on(same_device,
c, journal_entry_dup_same_device,
- "duplicate journal entry on same device\n %s",
+ "duplicate journal entry on same device\n%s",
buf.buf);
fsck_err_on(not_identical,
c, journal_entry_replicas_data_mismatch,
- "found duplicate but non identical journal entries\n %s",
+ "found duplicate but non identical journal entries\n%s",
buf.buf);
if (entry_ptr.csum_good && !identical)
@@ -230,7 +245,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
replace:
i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
- return -BCH_ERR_ENOMEM_journal_entry_add;
+ return bch_err_throw(c, ENOMEM_journal_entry_add);
darray_init(&i->ptrs);
i->csum_good = entry_ptr.csum_good;
@@ -308,9 +323,9 @@ static void journal_entry_err_msg(struct printbuf *out,
break; \
case WRITE: \
bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \
- bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
- if (bch2_fs_inconsistent(c)) { \
- ret = -BCH_ERR_fsck_errors_not_fixed; \
+ if (bch2_fs_inconsistent(c, \
+ "corrupt metadata before write: %s\n", _buf.buf)) {\
+ ret = bch_err_throw(c, fsck_errors_not_fixed); \
goto fsck_err; \
} \
break; \
@@ -417,6 +432,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
bool first = true;
jset_entry_for_each_key(entry, k) {
+ /* We may be called on entries that haven't been validated: */
+ if (!k->k.u64s)
+ break;
+
if (!first) {
prt_newline(out);
bch2_prt_jset_entry_type(out, entry->type);
@@ -764,6 +783,23 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_log_bkey_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ struct bkey_validate_context from)
+{
+ from.flags = 0;
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, from);
+}
+
+static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
@@ -987,19 +1023,19 @@ struct journal_read_buf {
size_t size;
};
-static int journal_read_buf_realloc(struct journal_read_buf *b,
+static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
size_t new_size)
{
void *n;
/* the bios are sized for this many pages, max: */
if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
new_size = roundup_pow_of_two(new_size);
n = kvmalloc(new_size, GFP_KERNEL);
if (!n)
- return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
+ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);
kvfree(b->data);
b->data = n;
@@ -1019,7 +1055,6 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
- struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -1035,26 +1070,32 @@ reread:
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
if (!bio)
- return -BCH_ERR_ENOMEM_journal_read_bucket;
+ return bch_err_throw(c, ENOMEM_journal_read_bucket);
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
+ u64 submit_time = local_clock();
ret = submit_bio_wait(bio);
kfree(bio);
- if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
- "journal read error: sector %llu",
- offset) ||
- bch2_meta_read_fault("journal")) {
+ if (!ret && bch2_meta_read_fault("journal"))
+ ret = bch_err_throw(c, EIO_fault_injected);
+
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ submit_time, !ret);
+
+ if (ret) {
+ bch_err_dev_ratelimited(ca,
+ "journal read error: sector %llu", offset);
/*
* We don't error out of the recovery process
* here, since the relevant journal entry may be
* found on a different device, and missing or
* no journal entries will be handled later
*/
- goto out;
+ return 0;
}
j = buf->data;
@@ -1068,15 +1109,15 @@ reread:
break;
case JOURNAL_ENTRY_REREAD:
if (vstruct_bytes(j) > buf->size) {
- ret = journal_read_buf_realloc(buf,
+ ret = journal_read_buf_realloc(c, buf,
vstruct_bytes(j));
if (ret)
- goto err;
+ return ret;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- goto out;
+ return 0;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@@ -1085,7 +1126,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
- goto err;
+ return ret;
}
if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
@@ -1102,21 +1143,22 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- goto out;
+ return 0;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
struct bch_csum csum;
csum_good = jset_csum_good(c, j, &csum);
- if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "%s",
- (printbuf_reset(&err),
- prt_str(&err, "journal "),
- bch2_csum_err_msg(&err, csum_type, j->csum, csum),
- err.buf)))
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);
+
+ if (!csum_good) {
+ /*
+ * Don't print an error here, we'll print the error
+ * later if we need this journal entry
+ */
saw_bad = true;
+ }
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
@@ -1126,6 +1168,7 @@ reread:
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
.csum_good = csum_good,
+ .csum = csum,
.dev = ca->dev_idx,
.bucket = bucket,
.bucket_offset = offset -
@@ -1140,7 +1183,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- goto err;
+ return ret;
}
next_block:
pr_debug("next");
@@ -1149,11 +1192,7 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
-out:
- ret = 0;
-err:
- printbuf_exit(&err);
- return ret;
+ return 0;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1170,7 +1209,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
if (!ja->nr)
goto out;
- ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
+ ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
if (ret)
goto err;
@@ -1192,7 +1231,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
kvfree(buf.data);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
closure_return(cl);
return;
err:
@@ -1202,13 +1241,105 @@ err:
goto out;
}
+noinline_for_stack
+static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
+{
+ struct printbuf buf = PRINTBUF;
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
+ bool have_good = false;
+
+ prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
+ bch2_journal_datetime_to_text(&buf, &j->j);
+ prt_newline(&buf);
+
+ darray_for_each(j->ptrs, ptr)
+ if (!ptr->csum_good) {
+ bch2_journal_ptr_to_text(&buf, c, ptr);
+ prt_char(&buf, ' ');
+ bch2_csum_to_text(&buf, csum_type, ptr->csum);
+ prt_newline(&buf);
+ } else {
+ have_good = true;
+ }
+
+ prt_printf(&buf, "should be ");
+ bch2_csum_to_text(&buf, csum_type, j->j.csum);
+
+ if (have_good)
+ prt_printf(&buf, "\n(had good copy on another device)");
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+}
+
+noinline_for_stack
+static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq)
+{
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ struct genradix_iter radix_iter;
+ struct journal_replay *i, **_i, *prev = NULL;
+ u64 seq = start_seq;
+
+ genradix_for_each(&c->journal_entries, radix_iter, _i) {
+ i = *_i;
+
+ if (journal_replay_ignore(i))
+ continue;
+
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
+
+ while (seq < le64_to_cpu(i->j.seq)) {
+ while (seq < le64_to_cpu(i->j.seq) &&
+ bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ if (seq == le64_to_cpu(i->j.seq))
+ break;
+
+ u64 missing_start = seq;
+
+ while (seq < le64_to_cpu(i->j.seq) &&
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
+ seq++;
+
+ u64 missing_end = seq - 1;
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+ missing_start, missing_end,
+ start_seq, end_seq);
+
+ prt_printf(&buf, "\nprev at ");
+ if (prev) {
+ bch2_journal_ptrs_to_text(&buf, c, prev);
+ prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ } else
+ prt_printf(&buf, "(none)");
+
+ prt_printf(&buf, "\nnext at ");
+ bch2_journal_ptrs_to_text(&buf, c, i);
+ prt_printf(&buf, ", continue?");
+
+ fsck_err(c, journal_entries_missing, "%s", buf.buf);
+ }
+
+ prev = i;
+ seq++;
+ }
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
u64 *start_seq)
{
struct journal_list jlist;
- struct journal_replay *i, **_i, *prev = NULL;
+ struct journal_replay *i, **_i;
struct genradix_iter radix_iter;
struct printbuf buf = PRINTBUF;
bool degraded = false, last_write_torn = false;
@@ -1227,7 +1358,8 @@ int bch2_journal_read(struct bch_fs *c,
if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro) &&
- percpu_ref_tryget(&ca->io_ref))
+ enumerated_ref_tryget(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_journal_read))
closure_call(&ca->journal.read,
bch2_journal_read_device,
system_unbound_wq,
@@ -1236,7 +1368,8 @@ int bch2_journal_read(struct bch_fs *c,
degraded = true;
}
- closure_sync(&jlist.cl);
+ while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
+ ;
if (jlist.ret)
return jlist.ret;
@@ -1297,14 +1430,24 @@ int bch2_journal_read(struct bch_fs *c,
return 0;
}
- bch_info(c, "journal read done, replaying entries %llu-%llu",
- *last_seq, *blacklist_seq - 1);
+ printbuf_reset(&buf);
+ prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
- if (*start_seq != *blacklist_seq)
- bch_info(c, "dropped unflushed entries %llu-%llu",
- *blacklist_seq, *start_seq - 1);
+ /*
+ * Drop blacklisted entries and entries older than last_seq (or start of
+ * journal rewind:
+ */
+ u64 drop_before = *last_seq;
+ if (c->opts.journal_rewind) {
+ drop_before = min(drop_before, c->opts.journal_rewind);
+ prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
+ }
- /* Drop blacklisted entries and entries older than last_seq: */
+ *last_seq = drop_before;
+ if (*start_seq != *blacklist_seq)
+ prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
+ bch_info(c, "%s", buf.buf);
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
@@ -1312,7 +1455,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
seq = le64_to_cpu(i->j.seq);
- if (seq < *last_seq) {
+ if (seq < drop_before) {
journal_replay_free(c, i, false);
continue;
}
@@ -1325,59 +1468,12 @@ int bch2_journal_read(struct bch_fs *c,
}
}
- /* Check for missing entries: */
- seq = *last_seq;
- genradix_for_each(&c->journal_entries, radix_iter, _i) {
- i = *_i;
-
- if (journal_replay_ignore(i))
- continue;
-
- BUG_ON(seq > le64_to_cpu(i->j.seq));
-
- while (seq < le64_to_cpu(i->j.seq)) {
- u64 missing_start, missing_end;
- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (seq == le64_to_cpu(i->j.seq))
- break;
-
- missing_start = seq;
-
- while (seq < le64_to_cpu(i->j.seq) &&
- !bch2_journal_seq_is_blacklisted(c, seq, false))
- seq++;
-
- if (prev) {
- bch2_journal_ptrs_to_text(&buf1, c, prev);
- prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
- } else
- prt_printf(&buf1, "(none)");
- bch2_journal_ptrs_to_text(&buf2, c, i);
-
- missing_end = seq - 1;
- fsck_err(c, journal_entries_missing,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
- " prev at %s\n"
- " next at %s, continue?",
- missing_start, missing_end,
- *last_seq, *blacklist_seq - 1,
- buf1.buf, buf2.buf);
-
- printbuf_exit(&buf1);
- printbuf_exit(&buf2);
- }
-
- prev = i;
- seq++;
- }
+ ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
+ if (ret)
+ goto err;
genradix_for_each(&c->journal_entries, radix_iter, _i) {
- struct bch_replicas_padded replicas = {
+ union bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_devs = 0,
.e.nr_required = 1,
@@ -1387,15 +1483,15 @@ int bch2_journal_read(struct bch_fs *c,
if (journal_replay_ignore(i))
continue;
- darray_for_each(i->ptrs, ptr) {
- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
-
- if (!ptr->csum_good)
- bch_err_dev_offset(ca, ptr->sector,
- "invalid journal checksum, seq %llu%s",
- le64_to_cpu(i->j.seq),
- i->csum_good ? " (had good copy on another device)" : "");
- }
+ /*
+ * Don't print checksum errors until we know we're going to use
+ * a given journal entry:
+ */
+ darray_for_each(i->ptrs, ptr)
+ if (!ptr->csum_good) {
+ bch2_journal_print_checksum_error(c, i);
+ break;
+ }
ret = jset_validate(c,
bch2_dev_have_ref(c, i->ptrs.data[0].dev),
@@ -1417,7 +1513,7 @@ int bch2_journal_read(struct bch_fs *c,
!bch2_replicas_marked(c, &replicas.e) &&
(le64_to_cpu(i->j.seq) == *last_seq ||
fsck_err(c, journal_entry_replicas_not_marked,
- "superblock not marked as containing replicas for journal entry %llu\n %s",
+ "superblock not marked as containing replicas for journal entry %llu\n%s",
le64_to_cpu(i->j.seq), buf.buf))) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
@@ -1434,10 +1530,11 @@ fsck_err:
static void journal_advance_devs_to_next_bucket(struct journal *j,
struct dev_alloc_list *devs,
- unsigned sectors, u64 seq)
+ unsigned sectors, __le64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ guard(rcu)();
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
@@ -1471,7 +1568,8 @@ static void __journal_write_alloc(struct journal *j,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
darray_for_each(*devs, i) {
- struct bch_dev *ca = rcu_dereference(c->devs[*i]);
+ struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
+ BCH_DEV_WRITE_REF_journal_write);
if (!ca)
continue;
@@ -1485,8 +1583,10 @@ static void __journal_write_alloc(struct journal *j,
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
- sectors > ja->sectors_free)
+ sectors > ja->sectors_free) {
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
continue;
+ }
bch2_dev_stripe_increment(ca, &j->wp.stripe);
@@ -1509,15 +1609,8 @@ static void __journal_write_alloc(struct journal *j,
}
}
-/**
- * journal_write_alloc - decide where to write next journal entry
- *
- * @j: journal object
- * @w: journal buf (entry to be written)
- *
- * Returns: 0 on success, or -EROFS on failure
- */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+ unsigned *replicas)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
@@ -1525,29 +1618,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
- unsigned replicas = 0, replicas_want =
- READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_need = min_t(unsigned, replicas_want,
READ_ONCE(c->opts.metadata_replicas_required));
bool advance_done = false;
- rcu_read_lock();
-
- /* We might run more than once if we have to stop and do discards: */
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
- bkey_for_each_ptr(ptrs, p) {
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
- if (ca)
- replicas += ca->mi.durability;
- }
-
retry_target:
devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
+ bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
retry_alloc:
- __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
+ __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
- if (likely(replicas >= replicas_want))
+ if (likely(*replicas >= replicas_want))
goto done;
if (!advance_done) {
@@ -1556,18 +1638,26 @@ retry_alloc:
goto retry_alloc;
}
- if (replicas < replicas_want && target) {
+ if (*replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
advance_done = false;
goto retry_target;
}
done:
- rcu_read_unlock();
-
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
+#if 0
+ /*
+ * XXX: we need a way to alert the user when we go degraded for any
+ * reason
+ */
+ if (*replicas < min(replicas_want,
+ dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
+ }
+#endif
+
+ return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@@ -1600,18 +1690,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
kvfree(new_buf);
}
-static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
-{
- return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
-}
-
static CLOSURE_CALLBACK(journal_write_done)
{
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_replicas_padded replicas;
- union journal_res_state old, new;
+ union bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
@@ -1620,17 +1704,28 @@ static CLOSURE_CALLBACK(journal_write_done)
: j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
- bch_err(c, "unable to write journal to sufficient devices");
- err = -EIO;
+ err = bch_err_throw(c, journal_write_err);
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
- if (bch2_mark_replicas(c, &replicas.e))
- err = -EIO;
+ err = bch2_mark_replicas(c, &replicas.e);
}
- if (err)
- bch2_fatal_error(c);
+ if (err && !bch2_journal_error(j)) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ if (err == -BCH_ERR_journal_write_err)
+ prt_printf(&buf, "unable to write journal to sufficient devices\n");
+ else
+ prt_printf(&buf, "journal write error marking replicas: %s\n",
+ bch2_err_str(err));
+
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
closure_debug_destroy(cl);
@@ -1641,7 +1736,23 @@ static CLOSURE_CALLBACK(journal_write_done)
j->err_seq = seq;
w->write_done = true;
+ if (!j->free_buf || j->free_buf_size < w->buf_size) {
+ swap(j->free_buf, w->data);
+ swap(j->free_buf_size, w->buf_size);
+ }
+
+ if (w->data) {
+ void *buf = w->data;
+ w->data = NULL;
+ w->buf_size = 0;
+
+ spin_unlock(&j->lock);
+ kvfree(buf);
+ spin_lock(&j->lock);
+ }
+
bool completed = false;
+ bool do_discards = false;
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
@@ -1650,13 +1761,13 @@ static CLOSURE_CALLBACK(journal_write_done)
if (!w->write_done)
break;
- if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
+ if (!j->err_seq && !w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
- bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
+ do_discards = true;
}
j->seq_ondisk = seq;
@@ -1671,16 +1782,6 @@ static CLOSURE_CALLBACK(journal_write_done)
if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
- old.v = atomic64_read(&j->reservations.counter);
- do {
- new.v = old.v;
- BUG_ON(journal_state_count(new, new.unwritten_idx));
- BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
-
- new.unwritten_idx++;
- } while (!atomic64_try_cmpxchg(&j->reservations.counter,
- &old.v, new.v));
-
closure_wake_up(&w->wait);
completed = true;
}
@@ -1695,7 +1796,7 @@ static CLOSURE_CALLBACK(journal_write_done)
}
if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
- new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
+ j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
long delta = buf->expires - jiffies;
@@ -1715,6 +1816,9 @@ static CLOSURE_CALLBACK(journal_write_done)
*/
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
+
+ if (do_discards)
+ bch2_do_discards(c);
}
static void journal_write_endio(struct bio *bio)
@@ -1724,20 +1828,23 @@ static void journal_write_endio(struct bio *bio)
struct journal *j = &ca->fs->journal;
struct journal_buf *w = j->buf + jbio->buf_idx;
- if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
+ jbio->submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca,
"error writing journal entry %llu: %s",
le64_to_cpu(w->data->seq),
- bch2_blk_status_to_str(bio->bi_status)) ||
- bch2_meta_write_fault("journal")) {
- unsigned long flags;
+ bch2_blk_status_to_str(bio->bi_status));
+ unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
spin_unlock_irqrestore(&j->err_lock, flags);
}
closure_put(&w->io);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
}
static CLOSURE_CALLBACK(journal_write_submit)
@@ -1748,18 +1855,17 @@ static CLOSURE_CALLBACK(journal_write_submit)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
- if (!ca) {
- /* XXX: fix this */
- bch_err(c, "missing device for journal write\n");
- continue;
- }
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
struct journal_device *ja = &ca->journal;
- struct bio *bio = &ja->bio[w->idx]->bio;
+ struct journal_bio *jbio = ja->bio[w->idx];
+ struct bio *bio = &jbio->bio;
+
+ jbio->submit_time = local_clock();
+
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
@@ -1791,6 +1897,10 @@ static CLOSURE_CALLBACK(journal_write_preflush)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ /*
+ * Wait for previous journal writes to comelete; they won't necessarily
+ * be flushed if they're still in flight
+ */
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
spin_lock(&j->lock);
if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
@@ -1803,8 +1913,9 @@ static CLOSURE_CALLBACK(journal_write_preflush)
}
if (w->separate_flush) {
- for_each_rw_member(c, ca) {
- percpu_ref_get(&ca->io_ref);
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
+ enumerated_ref_get(&ca->io_ref[WRITE],
+ BCH_DEV_WRITE_REF_journal_write);
struct journal_device *ja = &ca->journal;
struct bio *bio = &ja->bio[w->idx]->bio;
@@ -1831,9 +1942,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
struct jset_entry *start, *end;
struct jset *jset = w->data;
struct journal_keys_to_wb wb = { NULL };
- unsigned sectors, bytes, u64s;
+ unsigned u64s;
unsigned long btree_roots_have = 0;
- bool validate_before_checksum = false;
u64 seq = le64_to_cpu(jset->seq);
int ret;
@@ -1916,8 +2026,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
le32_add_cpu(&jset->u64s, u64s);
- sectors = vstruct_sectors(jset, c->block_bits);
- bytes = vstruct_bytes(jset);
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
if (sectors > w->sectors) {
bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
@@ -1926,6 +2035,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
return -EINVAL;
}
+ return 0;
+}
+
+static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct jset *jset = w->data;
+ u64 seq = le64_to_cpu(jset->seq);
+ bool validate_before_checksum = false;
+ int ret = 0;
+
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = cpu_to_le32(c->sb.version);
@@ -1948,7 +2068,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
jset->encrypted_start,
vstruct_end(jset) - (void *) jset->encrypted_start);
- if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
+ if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
return ret;
jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1958,6 +2078,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
(ret = jset_validate(c, NULL, jset, 0, WRITE)))
return ret;
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
+ unsigned bytes = vstruct_bytes(jset);
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
return 0;
}
@@ -1984,7 +2106,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
* write anything at all.
*/
if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
- return -EIO;
+ return error;
if (error ||
w->noflush ||
@@ -2013,13 +2135,10 @@ CLOSURE_CALLBACK(bch2_journal_write)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_replicas_padded replicas;
- unsigned nr_rw_members = 0;
+ union bch_replicas_padded replicas;
+ unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
int ret;
- for_each_rw_member(c, ca)
- nr_rw_members++;
-
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
BUG_ON(!w->write_started);
BUG_ON(w->write_allocated);
@@ -2033,7 +2152,8 @@ CLOSURE_CALLBACK(bch2_journal_write)
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
- if (ret)
+
+ if (unlikely(ret))
goto err;
mutex_lock(&j->buf_lock);
@@ -2041,43 +2161,34 @@ CLOSURE_CALLBACK(bch2_journal_write)
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
- if (ret)
- goto err;
- j->entry_bytes_written += vstruct_bytes(w->data);
+ if (unlikely(ret))
+ goto err;
+ unsigned replicas_allocated = 0;
while (1) {
- spin_lock(&j->lock);
- ret = journal_write_alloc(j, w);
+ ret = journal_write_alloc(j, w, &replicas_allocated);
if (!ret || !j->can_discard)
break;
- spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
- if (ret && !bch2_journal_error(j)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
+ if (unlikely(ret))
+ goto err_allocate_write;
- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
- le64_to_cpu(w->data->seq),
- vstruct_sectors(w->data, c->block_bits),
- bch2_err_str(ret));
- __bch2_journal_debug_to_text(&buf, j);
- spin_unlock(&j->lock);
- bch2_print_string_as_lines(KERN_ERR, buf.buf);
- printbuf_exit(&buf);
- }
- if (ret)
+ ret = bch2_journal_write_checksum(j, w);
+ if (unlikely(ret))
goto err;
+ spin_lock(&j->lock);
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
*/
w->sectors = 0;
w->write_allocated = true;
+ j->entry_bytes_written += vstruct_bytes(w->data);
/*
* journal entry has been compacted and allocated, recalculate space
@@ -2089,9 +2200,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
- if (c->opts.nochanges)
- goto no_io;
-
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@@ -2102,15 +2210,33 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ if (c->opts.nochanges)
+ goto no_io;
+
if (!JSET_NO_FLUSH(w->data))
continue_at(cl, journal_write_preflush, j->wq);
else
continue_at(cl, journal_write_submit, j->wq);
return;
-no_io:
- continue_at(cl, journal_write_done, j->wq);
- return;
+err_allocate_write:
+ if (!bch2_journal_error(j)) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_journal_debug_to_text(&buf, j);
+ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
+ le64_to_cpu(w->data->seq),
+ vstruct_sectors(w->data, c->block_bits),
+ bch2_err_str(ret));
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
err:
bch2_fatal_error(c);
+no_io:
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
+ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
+ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
+ }
+
continue_at(cl, journal_write_done, j->wq);
}
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index 12b39fcb4424..6fa82c4050fe 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *);
struct journal_ptr {
bool csum_good;
+ struct bch_csum csum;
u8 dev;
u32 bucket;
u32 bucket_offset;
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 6a9cefb635d6..0042d43b8e57 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -17,6 +17,8 @@
#include <linux/kthread.h>
#include <linux/sched/mm.h>
+static bool __should_discard_bucket(struct journal *, struct journal_device *);
+
/* Free space calculations: */
static unsigned journal_space_from(struct journal_device *ja,
@@ -81,18 +83,20 @@ static struct journal_space
journal_dev_space_available(struct journal *j, struct bch_dev *ca,
enum journal_space_from from)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_device *ja = &ca->journal;
unsigned sectors, buckets, unwritten;
+ unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c));
u64 seq;
if (from == journal_space_total)
return (struct journal_space) {
- .next_entry = ca->mi.bucket_size,
- .total = ca->mi.bucket_size * ja->nr,
+ .next_entry = bucket_size_aligned,
+ .total = bucket_size_aligned * ja->nr,
};
buckets = bch2_journal_dev_buckets_available(j, ja, from);
- sectors = ja->sectors_free;
+ sectors = round_down(ja->sectors_free, block_sectors(c));
/*
* We that we don't allocate the space for a journal entry
@@ -107,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
continue;
/* entry won't fit on this device, skip: */
- if (unwritten > ca->mi.bucket_size)
+ if (unwritten > bucket_size_aligned)
continue;
if (unwritten >= sectors) {
@@ -117,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
}
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
sectors -= unwritten;
@@ -125,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
if (sectors < ca->mi.bucket_size && buckets) {
buckets--;
- sectors = ca->mi.bucket_size;
+ sectors = bucket_size_aligned;
}
return (struct journal_space) {
.next_entry = sectors,
- .total = sectors + buckets * ca->mi.bucket_size,
+ .total = sectors + buckets * bucket_size_aligned,
};
}
@@ -144,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr ||
!ca->mi.durability)
@@ -162,12 +165,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
array_insert_item(dev_space, nr_devs, pos, space);
}
- rcu_read_unlock();
if (nr_devs < nr_devs_want)
return (struct journal_space) { 0, 0 };
/*
+ * It's possible for bucket size to be misaligned w.r.t. the filesystem
+ * block size:
+ */
+ min_bucket_size = round_down(min_bucket_size, block_sectors(c));
+
+ /*
* We sorted largest to smallest, and we want the smallest out of the
* @nr_devs_want largest devices:
*/
@@ -187,8 +195,8 @@ void bch2_journal_space_available(struct journal *j)
int ret = 0;
lockdep_assert_held(&j->lock);
+ guard(rcu)();
- rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
@@ -203,30 +211,28 @@ void bch2_journal_space_available(struct journal *j)
ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
- if (ja->discard_idx != ja->dirty_idx_ondisk)
- can_discard = true;
+ can_discard |= __should_discard_bucket(j, ja);
max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
nr_online++;
}
- rcu_read_unlock();
j->can_discard = can_discard;
if (nr_online < metadata_replicas_required(c)) {
- struct printbuf buf = PRINTBUF;
- buf.atomic++;
- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
- "rw journal devs:", nr_online, metadata_replicas_required(c));
-
- rcu_read_lock();
- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
- prt_printf(&buf, " %s", ca->name);
- rcu_read_unlock();
-
- bch_err(c, "%s", buf.buf);
- printbuf_exit(&buf);
- ret = JOURNAL_ERR_insufficient_devices;
+ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
+ "rw journal devs:", nr_online, metadata_replicas_required(c));
+
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
+ prt_printf(&buf, " %s", ca->name);
+
+ bch_err(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+ ret = bch_err_throw(c, insufficient_journal_devices);
goto out;
}
@@ -240,7 +246,7 @@ void bch2_journal_space_available(struct journal *j)
total = j->space[journal_space_total].total;
if (!j->space[journal_space_discarded].next_entry)
- ret = JOURNAL_ERR_journal_full;
+ ret = bch_err_throw(c, journal_full);
if ((j->space[journal_space_clean_ondisk].next_entry <
j->space[journal_space_clean_ondisk].total) &&
@@ -252,7 +258,9 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
- j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
+ j->cur_entry_sectors = !ret
+ ? j->space[journal_space_discarded].next_entry
+ : 0;
j->cur_entry_error = ret;
if (!ret)
@@ -261,12 +269,19 @@ out:
/* Discards - last part of journal reclaim: */
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static bool __should_discard_bucket(struct journal *j, struct journal_device *ja)
{
- bool ret;
+ unsigned min_free = max(4, ja->nr / 8);
+ return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) <
+ min_free &&
+ ja->discard_idx != ja->dirty_idx_ondisk;
+}
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
spin_lock(&j->lock);
- ret = ja->discard_idx != ja->dirty_idx_ondisk;
+ bool ret = __should_discard_bucket(j, ja);
spin_unlock(&j->lock);
return ret;
@@ -282,12 +297,12 @@ void bch2_journal_do_discards(struct journal *j)
mutex_lock(&j->discard_lock);
- for_each_rw_member(c, ca) {
+ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
if (!c->opts.nochanges &&
- ca->mi.discard &&
+ bch2_discard_opt_enabled(c, ca) &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
@@ -384,12 +399,16 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+ journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
- fn == bch2_btree_node_flush1)
- return JOURNAL_PIN_TYPE_btree;
- else if (fn == bch2_btree_key_cache_journal_flush)
+ fn == bch2_btree_node_flush1) {
+ unsigned idx = fn == bch2_btree_node_flush1;
+ struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+ return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+ } else if (fn == bch2_btree_key_cache_journal_flush)
return JOURNAL_PIN_TYPE_key_cache;
else
return JOURNAL_PIN_TYPE_other;
@@ -441,7 +460,7 @@ void bch2_journal_pin_copy(struct journal *j,
bool reclaim = __journal_pin_drop(j, dst);
- bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -465,7 +484,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
bool reclaim = __journal_pin_drop(j, pin);
- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -587,7 +606,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@@ -608,9 +627,10 @@ static u64 journal_seq_to_flush(struct journal *j)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 seq_to_flush = 0;
- spin_lock(&j->lock);
+ guard(spinlock)(&j->lock);
+ guard(rcu)();
- for_each_rw_member(c, ca) {
+ for_each_rw_member_rcu(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -620,20 +640,15 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
- nr_buckets = min(nr_buckets, ja->nr);
-
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max(seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
}
/* Also flush if the pin fifo is more than half full */
- seq_to_flush = max_t(s64, seq_to_flush,
- (s64) journal_cur_seq(j) -
- (j->pin.size >> 1));
- spin_unlock(&j->lock);
-
- return seq_to_flush;
+ return max_t(s64, seq_to_flush,
+ (s64) journal_cur_seq(j) -
+ (j->pin.size >> 1));
}
/**
@@ -641,7 +656,6 @@ static u64 journal_seq_to_flush(struct journal *j)
* @j: journal object
* @direct: direct or background reclaim?
* @kicked: requested to run since we last ran?
- * Returns: 0 on success, or -EIO if the journal has been shutdown
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
@@ -681,11 +695,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
if (kthread && kthread_should_stop())
break;
- if (bch2_journal_error(j)) {
- ret = -EIO;
+ ret = bch2_journal_error(j);
+ if (ret)
break;
- }
+ /* XXX shove journal discards off to another thread */
bch2_journal_do_discards(j);
seq_to_flush = journal_seq_to_flush(j);
@@ -869,18 +883,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
- BIT(JOURNAL_PIN_TYPE_key_cache)|
- BIT(JOURNAL_PIN_TYPE_other))) {
- *did_work = true;
- goto unlock;
- }
-
- if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
- BIT(JOURNAL_PIN_TYPE_btree))) {
- *did_work = true;
- goto unlock;
- }
+ for (int type = JOURNAL_PIN_TYPE_NR - 1;
+ type >= 0;
+ --type)
+ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+ *did_work = true;
+ goto unlock;
+ }
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);
@@ -953,7 +962,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
spin_lock(&j->lock);
while (!ret) {
- struct bch_replicas_padded replicas;
+ union bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
if (seq >= j->pin.back)
diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c
index 62b910f2fb27..0cb9b93f13e7 100644
--- a/fs/bcachefs/journal_sb.c
+++ b/fs/bcachefs/journal_sb.c
@@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
(sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
if (!j)
- return -BCH_ERR_ENOSPC_sb_journal;
+ return bch_err_throw(c, ENOSPC_sb_journal);
bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 1f25c111c54c..af4fe416d9ec 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
sb_blacklist_u64s(nr + 1));
if (!bl) {
- ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
+ ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist);
goto out;
}
@@ -130,6 +130,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
return true;
}
+u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c)
+{
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
+
+ if (!t || !t->nr)
+ return 0;
+
+ return t->entries[eytzinger0_last(t->nr)].end - 1;
+}
+
int bch2_blacklist_table_initialize(struct bch_fs *c)
{
struct bch_sb_field_journal_seq_blacklist *bl =
@@ -142,7 +152,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
if (!t)
- return -BCH_ERR_ENOMEM_blacklist_table_init;
+ return bch_err_throw(c, ENOMEM_blacklist_table_init);
t->nr = nr;
@@ -231,15 +241,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c)
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
- unsigned i;
- for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr);
- src < bl->start + nr;
- src++, i = eytzinger0_next(i, nr)) {
+ src = bl->start;
+ eytzinger0_for_each(i, nr) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk)
*dst++ = *src;
+ src++;
}
unsigned new_nr = dst - bl->start;
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
index d47636f96fdc..f06942ccfcdd 100644
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
@@ -12,6 +12,7 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
}
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
+u64 bch2_journal_last_blacklisted_seq(struct bch_fs *);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
int bch2_blacklist_table_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 3ba433a48eb8..51104bbb99da 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -12,7 +12,11 @@
/* btree write buffer steals 8 bits for its own purposes: */
#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1)
-#define JOURNAL_BUF_BITS 2
+#define JOURNAL_STATE_BUF_BITS 2
+#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS)
+#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1)
+
+#define JOURNAL_BUF_BITS 4
#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
@@ -53,7 +57,10 @@ struct journal_buf {
*/
enum journal_pin_type {
- JOURNAL_PIN_TYPE_btree,
+ JOURNAL_PIN_TYPE_btree3,
+ JOURNAL_PIN_TYPE_btree2,
+ JOURNAL_PIN_TYPE_btree1,
+ JOURNAL_PIN_TYPE_btree0,
JOURNAL_PIN_TYPE_key_cache,
JOURNAL_PIN_TYPE_other,
JOURNAL_PIN_TYPE_NR,
@@ -79,7 +86,6 @@ struct journal_entry_pin {
struct journal_res {
bool ref;
- u8 idx;
u16 u64s;
u32 offset;
u64 seq;
@@ -95,9 +101,8 @@ union journal_res_state {
};
struct {
- u64 cur_entry_offset:20,
+ u64 cur_entry_offset:22,
idx:2,
- unwritten_idx:2,
buf0_count:10,
buf1_count:10,
buf2_count:10,
@@ -107,13 +112,13 @@ union journal_res_state {
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
-#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
/*
* We stash some journal state as sentinal values in cur_entry_offset:
* note - cur_entry_offset is in units of u64s
*/
-#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1)
#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
@@ -146,28 +151,10 @@ enum journal_flags {
#undef x
};
-/* Reasons we may fail to get a journal reservation: */
-#define JOURNAL_ERRORS() \
- x(ok) \
- x(retry) \
- x(blocked) \
- x(max_in_flight) \
- x(journal_full) \
- x(journal_pin_full) \
- x(journal_stuck) \
- x(insufficient_devices)
-
-enum journal_errors {
-#define x(n) JOURNAL_ERR_##n,
- JOURNAL_ERRORS()
-#undef x
-};
-
-typedef DARRAY(u64) darray_u64;
-
struct journal_bio {
struct bch_dev *ca;
unsigned buf_idx;
+ u64 submit_time;
struct bio bio;
};
@@ -196,7 +183,7 @@ struct journal {
* 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
* insufficient devices:
*/
- enum journal_errors cur_entry_error;
+ int cur_entry_error;
unsigned cur_entry_offset_if_blocked;
unsigned buf_size_want;
@@ -217,6 +204,8 @@ struct journal {
* other is possibly being written out.
*/
struct journal_buf buf[JOURNAL_BUF_NR];
+ void *free_buf;
+ unsigned free_buf_size;
spinlock_t lock;
@@ -234,9 +223,11 @@ struct journal {
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
+ u64 seq_write_started;
/* seq, last_seq from the most recent journal entry successfully written */
u64 seq_ondisk;
u64 flushed_seq_ondisk;
+ u64 flushing_seq;
u64 last_seq_ondisk;
u64 err_seq;
u64 last_empty_seq;
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index ce794d55818f..57b5b3263b08 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -6,6 +6,7 @@
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
+#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
@@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time
return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set);
}
-int bch2_lru_change(struct btree_trans *trans,
- u16 lru_id, u64 dev_bucket,
- u64 old_time, u64 new_time)
+int __bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
{
if (old_time == new_time)
return 0;
@@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = {
};
int bch2_lru_check_set(struct btree_trans *trans,
- u16 lru_id, u64 time,
+ u16 lru_id,
+ u64 dev_bucket,
+ u64 time,
struct bkey_s_c referring_k,
struct bkey_buf *last_flushed)
{
@@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans,
struct btree_iter lru_iter;
struct bkey_s_c lru_k =
bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
- lru_pos(lru_id,
- bucket_to_u64(referring_k.k->p),
- time), 0);
+ lru_pos(lru_id, dev_bucket, time), 0);
int ret = bkey_err(lru_k);
if (ret)
return ret;
@@ -100,11 +101,10 @@ int bch2_lru_check_set(struct btree_trans *trans,
goto err;
if (fsck_err(trans, alloc_key_to_missing_lru_entry,
- "missing %s lru entry\n"
- " %s",
+ "missing %s lru entry\n%s",
bch2_lru_types[lru_type(lru_k)],
(bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) {
- ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time);
+ ret = bch2_lru_set(trans, lru_id, dev_bucket, time);
if (ret)
goto err;
}
@@ -116,57 +116,79 @@ fsck_err:
return ret;
}
+static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k)
+{
+ enum bch_lru_type type = lru_type(lru_k);
+
+ switch (type) {
+ case BCH_LRU_read:
+ case BCH_LRU_fragmentation:
+ return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset));
+ case BCH_LRU_stripes:
+ return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset));
+ default:
+ BUG();
+ }
+}
+
+static u64 bkey_lru_type_idx(struct bch_fs *c,
+ enum bch_lru_type type,
+ struct bkey_s_c k)
+{
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a;
+
+ switch (type) {
+ case BCH_LRU_read:
+ a = bch2_alloc_to_v4(k, &a_convert);
+ return alloc_lru_idx_read(*a);
+ case BCH_LRU_fragmentation: {
+ a = bch2_alloc_to_v4(k, &a_convert);
+
+ guard(rcu)();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode);
+ return ca
+ ? alloc_lru_idx_fragmentation(*a, ca)
+ : 0;
+ }
+ case BCH_LRU_stripes:
+ return k.k->type == KEY_TYPE_stripe
+ ? stripe_lru_pos(bkey_s_c_to_stripe(k).v)
+ : 0;
+ default:
+ BUG();
+ }
+}
+
static int bch2_check_lru_key(struct btree_trans *trans,
struct btree_iter *lru_iter,
struct bkey_s_c lru_k,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
struct printbuf buf1 = PRINTBUF;
struct printbuf buf2 = PRINTBUF;
- enum bch_lru_type type = lru_type(lru_k);
- struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset);
- u64 idx;
- int ret;
-
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
- if (fsck_err_on(!ca,
- trans, lru_entry_to_invalid_bucket,
- "lru key points to nonexistent device:bucket %llu:%llu",
- alloc_pos.inode, alloc_pos.offset))
- return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
+ struct bbpos bp = lru_pos_to_bp(lru_k);
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0);
+ int ret = bkey_err(k);
if (ret)
goto err;
- a = bch2_alloc_to_v4(k, &a_convert);
-
- switch (type) {
- case BCH_LRU_read:
- idx = alloc_lru_idx_read(*a);
- break;
- case BCH_LRU_fragmentation:
- idx = alloc_lru_idx_fragmentation(*a, ca);
- break;
- }
+ enum bch_lru_type type = lru_type(lru_k);
+ u64 idx = bkey_lru_type_idx(c, type, k);
- if (lru_k.k->type != KEY_TYPE_set ||
- lru_pos_time(lru_k.k->p) != idx) {
+ if (lru_pos_time(lru_k.k->p) != idx) {
ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
if (ret)
goto err;
if (fsck_err(trans, lru_entry_bad,
"incorrect lru entry: lru %s time %llu\n"
- " %s\n"
- " for %s",
+ "%s\n"
+ "for %s",
bch2_lru_types[type],
lru_pos_time(lru_k.k->p),
(bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
@@ -176,7 +198,6 @@ static int bch2_check_lru_key(struct btree_trans *trans,
err:
fsck_err:
bch2_trans_iter_exit(trans, &iter);
- bch2_dev_put(ca);
printbuf_exit(&buf2);
printbuf_exit(&buf1);
return ret;
diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h
index f31a6cf1514c..8abd0aa2083a 100644
--- a/fs/bcachefs/lru.h
+++ b/fs/bcachefs/lru.h
@@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
{
u16 lru_id = l.k->p.inode >> 48;
- if (lru_id == BCH_LRU_FRAGMENTATION_START)
+ switch (lru_id) {
+ case BCH_LRU_BUCKET_FRAGMENTATION:
return BCH_LRU_fragmentation;
- return BCH_LRU_read;
+ case BCH_LRU_STRIPE_FRAGMENTATION:
+ return BCH_LRU_stripes;
+ default:
+ return BCH_LRU_read;
+ }
}
int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context);
@@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos);
int bch2_lru_del(struct btree_trans *, u16, u64, u64);
int bch2_lru_set(struct btree_trans *, u16, u64, u64);
-int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64);
+
+static inline int bch2_lru_change(struct btree_trans *trans,
+ u16 lru_id, u64 dev_bucket,
+ u64 old_time, u64 new_time)
+{
+ return old_time != new_time
+ ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time)
+ : 0;
+}
struct bkey_buf;
-int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *);
+int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *);
int bch2_check_lrus(struct bch_fs *);
diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h
index f372cb3b8cda..b7392ad8e41f 100644
--- a/fs/bcachefs/lru_format.h
+++ b/fs/bcachefs/lru_format.h
@@ -9,7 +9,8 @@ struct bch_lru {
#define BCH_LRU_TYPES() \
x(read) \
- x(fragmentation)
+ x(fragmentation) \
+ x(stripes)
enum bch_lru_type {
#define x(n) BCH_LRU_##n,
@@ -17,7 +18,8 @@ enum bch_lru_type {
#undef x
};
-#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1)
+#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1)
+#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2)
#define LRU_TIME_BITS 48
#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1)
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index ddc187fb693d..f296cce95338 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -4,10 +4,13 @@
*/
#include "bcachefs.h"
+#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
+#include "ec.h"
#include "errcode.h"
#include "extents.h"
#include "io_write.h"
@@ -15,11 +18,12 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
+#include "progress.h"
#include "replicas.h"
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
- unsigned dev_idx, int flags, bool metadata)
+ unsigned dev_idx, unsigned flags, bool metadata)
{
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
@@ -31,16 +35,33 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
nr_good = bch2_bkey_durability(c, k.s_c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
- return -BCH_ERR_remove_would_lose_data;
+ return bch_err_throw(c, remove_would_lose_data);
return 0;
}
+static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree *b, unsigned dev_idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_buf k;
+
+ bch2_bkey_buf_init(&k);
+ bch2_bkey_buf_copy(&k, c, &b->key);
+
+ int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
+ bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
+
+ bch_err_fn(c, ret);
+ bch2_bkey_buf_exit(&k, c);
+ return ret;
+}
+
static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
unsigned dev_idx,
- int flags)
+ unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@@ -76,7 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_btree_drop_key(struct btree_trans *trans,
+ struct bkey_s_c_backpointer bp,
+ unsigned dev_idx,
+ struct bkey_buf *last_flushed,
+ unsigned flags)
+{
+ struct btree_iter iter;
+ struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
+
+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int bch2_dev_usrdata_drop(struct bch_fs *c,
+ struct progress_indicator_state *progress,
+ unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@@ -88,8 +129,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_progress_update_iter(trans, progress, &iter, "dropping user data");
+ bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
+ }));
if (ret)
break;
}
@@ -99,7 +142,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
return ret;
}
-static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int bch2_dev_metadata_drop(struct bch_fs *c,
+ struct progress_indicator_state *progress,
+ unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@@ -111,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
- return -BCH_ERR_remove_with_metadata_missing_unimplemented;
+ return bch_err_throw(c, remove_with_metadata_missing_unimplemented);
trans = bch2_trans_get(c);
bch2_bkey_buf_init(&k);
@@ -123,29 +168,23 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
retry:
ret = 0;
while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
+ bch2_progress_update_iter(trans, progress, &iter, "dropping metadata");
+
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
- bch2_bkey_buf_copy(&k, c, &b->key);
-
- ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
- dev_idx, flags, true);
- if (ret)
- break;
-
- ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
+ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
- bch_err_msg(c, ret, "updating btree node key");
if (ret)
break;
next:
- bch2_btree_iter_next_node(&iter);
+ bch2_btree_iter_next_node(trans, &iter);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
@@ -167,8 +206,72 @@ err:
return ret;
}
-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
+ struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
+ unsigned flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent,
+ last_flushed);
+ int ret = bkey_err(k);
+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+ return 0;
+ if (ret)
+ return ret;
+
+ if (!k.k || !bch2_bkey_has_device_c(k, dev_idx))
+ goto out;
+
+ /*
+ * XXX: pass flags arg to invalidate_stripe_to_dev and handle it
+ * properly
+ */
+
+ if (bkey_is_btree_ptr(k.k))
+ ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags);
+ else if (k.k->type == KEY_TYPE_stripe)
+ ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags);
+ else
+ ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
+out:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
- return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
- bch2_dev_metadata_drop(c, dev_idx, flags);
+ struct btree_trans *trans = bch2_trans_get(c);
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
+ for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
+ POS(dev_idx, 0),
+ POS(dev_idx, U64_MAX), 0, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ if (k.k->type != KEY_TYPE_backpointer)
+ continue;
+
+ data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
+ &last_flushed, flags);
+
+ }));
+
+ bch2_bkey_buf_exit(&last_flushed, trans->c);
+ bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
+{
+ struct progress_indicator_state progress;
+ bch2_progress_init(&progress, c,
+ BIT_ULL(BTREE_ID_extents)|
+ BIT_ULL(BTREE_ID_reflink));
+
+ return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?:
+ bch2_dev_metadata_drop(c, &progress, dev_idx, flags);
}
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
index 027efaa0d575..30018140711b 100644
--- a/fs/bcachefs/migrate.h
+++ b/fs/bcachefs/migrate.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
+int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
+int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_MIGRATE_H */
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 160b4374160a..eec591e947bd 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -38,47 +38,87 @@ const char * const bch2_data_ops_strs[] = {
NULL
};
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
+struct evacuate_bucket_arg {
+ struct bpos bucket;
+ int gen;
+ struct data_update_opts data_opts;
+};
+
+static bool evacuate_bucket_pred(struct bch_fs *, void *,
+ enum btree_id, struct bkey_s_c,
+ struct bch_io_opts *,
+ struct data_update_opts *);
+
+static noinline void
+trace_io_move2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
- if (trace_move_extent_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- prt_newline(&buf);
- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
- trace_move_extent(c, buf.buf);
- printbuf_exit(&buf);
- }
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_io_move(c, buf.buf);
+ printbuf_exit(&buf);
}
-static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k)
+static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k)
{
- if (trace_move_extent_read_enabled()) {
- struct printbuf buf = PRINTBUF;
+ struct printbuf buf = PRINTBUF;
- bch2_bkey_val_to_text(&buf, c, k);
- trace_move_extent_read(c, buf.buf);
- printbuf_exit(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ trace_io_move_read(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts,
+ move_pred_fn pred, void *_arg, bool p)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "%ps: %u", pred, p);
+
+ if (pred == evacuate_bucket_pred) {
+ struct evacuate_bucket_arg *arg = _arg;
+ prt_printf(&buf, " gen=%u", arg->gen);
}
+
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
+ trace_io_move_pred(c, buf.buf);
+ printbuf_exit(&buf);
+}
+
+static noinline void
+trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen)
+{
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "bucket: ");
+ bch2_bpos_to_text(&buf, bucket);
+ prt_printf(&buf, " gen: %i\n", gen);
+
+ trace_io_move_evacuate_bucket(c, buf.buf);
+ printbuf_exit(&buf);
}
struct moving_io {
struct list_head read_list;
struct list_head io_list;
- struct move_bucket_in_flight *b;
+ struct move_bucket *b;
struct closure cl;
bool read_completed;
unsigned read_sectors;
unsigned write_sectors;
- struct bch_read_bio rbio;
-
struct data_update write;
- /* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[];
};
static void move_free(struct moving_io *io)
@@ -88,43 +128,85 @@ static void move_free(struct moving_io *io)
if (io->b)
atomic_dec(&io->b->count);
- bch2_data_update_exit(&io->write);
-
mutex_lock(&ctxt->lock);
list_del(&io->io_list);
wake_up(&ctxt->wait);
mutex_unlock(&ctxt->lock);
+ if (!io->write.data_opts.scrub) {
+ bch2_data_update_exit(&io->write);
+ } else {
+ bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio);
+ kfree(io->write.bvecs);
+ }
kfree(io);
}
static void move_write_done(struct bch_write_op *op)
{
struct moving_io *io = container_of(op, struct moving_io, write.op);
+ struct bch_fs *c = op->c;
struct moving_context *ctxt = io->write.ctxt;
- if (io->write.op.error)
+ if (op->error) {
+ if (trace_io_move_write_fail_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_write_op_to_text(&buf, op);
+ trace_io_move_write_fail(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]);
+
ctxt->write_error = true;
+ }
- atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
- atomic_dec(&io->write.ctxt->write_ios);
+ atomic_sub(io->write_sectors, &ctxt->write_sectors);
+ atomic_dec(&ctxt->write_ios);
move_free(io);
closure_put(&ctxt->cl);
}
static void move_write(struct moving_io *io)
{
- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
+ struct bch_fs *c = io->write.op.c;
+ struct moving_context *ctxt = io->write.ctxt;
+ struct bch_read_bio *rbio = &io->write.rbio;
+
+ if (ctxt->stats) {
+ if (rbio->bio.bi_status)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_uncorrected);
+ else if (rbio->saw_error)
+ atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9,
+ &ctxt->stats->sectors_error_corrected);
+ }
+
+ /*
+ * If the extent has been bitrotted, we're going to have to give it a
+ * new checksum in order to move it - but the poison bit will ensure
+ * that userspace still gets the appropriate error.
+ */
+ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err &&
+ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) {
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+ struct nonce nonce = extent_nonce(rbio->version, crc);
+
+ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
+ nonce, &rbio->bio);
+ rbio->ret = 0;
+ }
+
+ if (unlikely(rbio->ret || io->write.data_opts.scrub)) {
move_free(io);
return;
}
- if (trace_move_extent_write_enabled()) {
- struct bch_fs *c = io->write.op.c;
+ if (trace_io_move_write_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
- trace_move_extent_write(c, buf.buf);
+ trace_io_move_write(c, buf.buf);
printbuf_exit(&buf);
}
@@ -132,7 +214,7 @@ static void move_write(struct moving_io *io)
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
- bch2_data_update_read_done(&io->write, io->rbio.pick.crc);
+ bch2_data_update_read_done(&io->write);
}
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt)
@@ -145,7 +227,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx
static void move_read_endio(struct bio *bio)
{
- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
+ struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
atomic_sub(io->read_sectors, &ctxt->read_sectors);
@@ -250,7 +332,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
}
int bch2_move_extent(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
+ struct move_bucket *bucket_in_flight,
struct btree_iter *iter,
struct bkey_s_c k,
struct bch_io_opts io_opts,
@@ -258,14 +340,11 @@ int bch2_move_extent(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- struct moving_io *io;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
- trace_move_extent2(c, k, &io_opts, &data_opts);
+ if (trace_io_move_enabled())
+ trace_io_move2(c, k, &io_opts, &data_opts);
+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
@@ -273,58 +352,49 @@ int bch2_move_extent(struct moving_context *ctxt,
bch2_data_update_opts_normalize(k, &data_opts);
if (!data_opts.rewrite_ptrs &&
- !data_opts.extra_replicas) {
+ !data_opts.extra_replicas &&
+ !data_opts.scrub) {
if (data_opts.kill_ptrs)
return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
}
- /*
- * Before memory allocations & taking nocow locks in
- * bch2_data_update_init():
- */
- bch2_trans_unlock(trans);
-
- /* write path might have to decompress data: */
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
-
- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
- io = kzalloc(sizeof(struct moving_io) +
- sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ struct moving_io *io = allocate_dropping_locks(trans, ret,
+ kzalloc(sizeof(struct moving_io), _gfp));
if (!io)
goto err;
+ if (ret)
+ goto err_free;
+
INIT_LIST_HEAD(&io->io_list);
io->write.ctxt = ctxt;
io->read_sectors = k.k->size;
io->write_sectors = k.k->size;
- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- io->write.op.wbio.bio.bi_ioprio =
- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
+ if (!data_opts.scrub) {
+ ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
+ &io_opts, data_opts, iter->btree_id, k);
+ if (ret)
+ goto err_free;
- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
- GFP_KERNEL))
- goto err_free;
+ io->write.op.end_io = move_write_done;
+ } else {
+ bch2_bkey_buf_init(&io->write.k);
+ bch2_bkey_buf_reassemble(&io->write.k, c, k);
- io->rbio.c = c;
- io->rbio.opts = io_opts;
- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
- io->rbio.bio.bi_vcnt = pages;
- io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
- io->rbio.bio.bi_iter.bi_size = sectors << 9;
+ io->write.op.c = c;
+ io->write.data_opts = data_opts;
- io->rbio.bio.bi_opf = REQ_OP_READ;
- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
- io->rbio.bio.bi_end_io = move_read_endio;
+ bch2_trans_unlock(trans);
- ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp,
- io_opts, data_opts, iter->btree_id, k);
- if (ret)
- goto err_free_pages;
+ ret = bch2_data_update_bios_init(&io->write, c, &io_opts);
+ if (ret)
+ goto err_free;
+ }
- io->write.op.end_io = move_write_done;
+ io->write.rbio.bio.bi_end_io = move_read_endio;
+ io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);
if (ctxt->rate)
bch2_ratelimit_increment(ctxt->rate, k.k->size);
@@ -339,9 +409,8 @@ int bch2_move_extent(struct moving_context *ctxt,
atomic_inc(&io->b->count);
}
- this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size);
- this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size);
- trace_move_extent_read2(c, k);
+ if (trace_io_move_read_enabled())
+ trace_io_move_read2(c, k);
mutex_lock(&ctxt->lock);
atomic_add(io->read_sectors, &ctxt->read_sectors);
@@ -356,39 +425,39 @@ int bch2_move_extent(struct moving_context *ctxt,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
- bch2_read_extent(trans, &io->rbio,
- bkey_start_pos(k.k),
- iter->btree_id, k, 0,
- BCH_READ_NODECODE|
- BCH_READ_LAST_FRAGMENT);
+ __bch2_read_extent(trans, &io->write.rbio,
+ io->write.rbio.bio.bi_iter,
+ bkey_start_pos(k.k),
+ iter->btree_id, k, 0,
+ NULL,
+ BCH_READ_last_fragment,
+ data_opts.scrub ? data_opts.read_dev : -1);
return 0;
-err_free_pages:
- bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
- if (ret == -BCH_ERR_data_update_done)
- return 0;
-
if (bch2_err_matches(ret, EROFS) ||
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- count_event(c, move_extent_start_fail);
+ count_event(c, io_move_start_fail);
- if (trace_move_extent_start_fail_enabled()) {
+ if (trace_io_move_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
prt_str(&buf, ": ");
prt_str(&buf, bch2_err_str(ret));
- trace_move_extent_start_fail(c, buf.buf);
+ trace_io_move_start_fail(c, buf.buf);
printbuf_exit(&buf);
}
+
+ if (bch2_err_matches(ret, BCH_ERR_data_update_done))
+ return 0;
return ret;
}
-static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct per_snapshot_io_opts *io_opts,
struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */
struct btree_iter *extent_iter,
@@ -399,6 +468,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct bch_io_opts *opts_ret = &io_opts->fs_io_opts;
int ret = 0;
+ if (extent_iter->min_depth)
+ return opts_ret;
+
if (extent_k.k->type == KEY_TYPE_reflink_v)
goto out;
@@ -470,6 +542,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans,
bch2_inode_opts_get(io_opts, c, &inode);
}
bch2_trans_iter_exit(trans, &inode_iter);
+ /* seem to be spinning here? */
out:
return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k);
}
@@ -518,11 +591,42 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
return 0;
}
-static int bch2_move_data_btree(struct moving_context *ctxt,
- struct bpos start,
- struct bpos end,
- move_pred_fn pred, void *arg,
- enum btree_id btree_id)
+/*
+ * Move requires non extents iterators, and there's also no need for it to
+ * signal indirect_extent_missing_error:
+ */
+static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c_reflink_p p)
+{
+ if (unlikely(REFLINK_P_ERROR(p.v)))
+ return bkey_s_c_null;
+
+ struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v));
+
+ bch2_trans_iter_init(trans, iter,
+ BTREE_ID_reflink, reflink_pos,
+ BTREE_ITER_not_extents);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, iter);
+ if (!k.k || bkey_err(k)) {
+ bch2_trans_iter_exit(trans, iter);
+ return k;
+ }
+
+ if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) {
+ bch2_trans_iter_exit(trans, iter);
+ return bkey_s_c_null;
+ }
+
+ return k;
+}
+
+int bch2_move_data_btree(struct moving_context *ctxt,
+ struct bpos start,
+ struct bpos end,
+ move_pred_fn pred, void *arg,
+ enum btree_id btree_id, unsigned level)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -548,10 +652,56 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
ctxt->stats->pos = BBPOS(btree_id, start);
}
+retry_root:
bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &iter, btree_id, start,
- BTREE_ITER_prefetch|
- BTREE_ITER_all_snapshots);
+
+ if (level == bch2_btree_id_root(c, btree_id)->level + 1) {
+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
+ struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto root_err;
+
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
+
+ k = bkey_i_to_s_c(&b->key);
+
+ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts,
+ iter.pos, &iter, k);
+ ret = PTR_ERR_OR_ZERO(io_opts);
+ if (ret)
+ goto root_err;
+
+ memset(&data_opts, 0, sizeof(data_opts));
+ if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts))
+ goto out;
+
+
+ if (!data_opts.scrub)
+ ret = bch2_btree_node_rewrite_pos(trans, btree_id, level,
+ k.k->p, data_opts.target, 0);
+ else
+ ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
+
+root_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry_root;
+ }
+
+ goto out;
+ }
+
+ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_all_snapshots);
if (ctxt->rate)
bch2_ratelimit_reset(ctxt->rate);
@@ -561,7 +711,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
if (!k.k)
break;
@@ -571,7 +721,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
if (ret)
break;
- if (bkey_ge(bkey_start_pos(k.k), end))
+ if (bkey_gt(bkey_start_pos(k.k), end))
break;
if (ctxt->stats)
@@ -581,17 +731,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
k.k->type == KEY_TYPE_reflink_p &&
REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) {
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
bch2_trans_iter_exit(trans, &reflink_iter);
- k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0);
+ k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
- if (bkey_deleted(k.k))
+ if (!k.k)
goto next_nondata;
/*
@@ -612,7 +761,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
continue;
memset(&data_opts, 0, sizeof(data_opts));
- if (!pred(c, arg, k, io_opts, &data_opts))
+ if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts))
goto next;
/*
@@ -622,12 +771,19 @@ static int bch2_move_data_btree(struct moving_context *ctxt,
bch2_bkey_buf_reassemble(&sk, c, k);
k = bkey_i_to_s_c(sk.k);
- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
+ if (!level)
+ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts);
+ else if (!data_opts.scrub)
+ ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level,
+ k.k->p, data_opts.target, 0);
+ else
+ ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev);
+
if (ret2) {
if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
continue;
- if (ret2 == -ENOMEM) {
+ if (bch2_err_matches(ret2, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
continue;
@@ -640,9 +796,10 @@ next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
- bch2_btree_iter_advance(&iter);
+ if (!bch2_btree_iter_advance(trans, &iter))
+ break;
}
-
+out:
bch2_trans_iter_exit(trans, &reflink_iter);
bch2_trans_iter_exit(trans, &iter);
bch2_bkey_buf_exit(&sk, c);
@@ -672,7 +829,7 @@ int __bch2_move_data(struct moving_context *ctxt,
ret = bch2_move_data_btree(ctxt,
id == start.btree ? start.pos : POS_MIN,
id == end.btree ? end.pos : POS_MAX,
- pred, arg, id);
+ pred, arg, id, 0);
if (ret)
break;
}
@@ -689,21 +846,23 @@ int bch2_move_data(struct bch_fs *c,
bool wait_on_copygc,
move_pred_fn pred, void *arg)
{
-
struct moving_context ctxt;
- int ret;
bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_move_data(&ctxt, start, end, pred, arg);
+ int ret = __bch2_move_data(&ctxt, start, end, pred, arg);
bch2_moving_ctxt_exit(&ctxt);
return ret;
}
-int bch2_evacuate_bucket(struct moving_context *ctxt,
- struct move_bucket_in_flight *bucket_in_flight,
- struct bpos bucket, int gen,
- struct data_update_opts _data_opts)
+static int __bch2_move_data_phys(struct moving_context *ctxt,
+ struct move_bucket *bucket_in_flight,
+ unsigned dev,
+ u64 bucket_start,
+ u64 bucket_end,
+ unsigned data_types,
+ bool copygc,
+ move_pred_fn pred, void *arg)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -712,16 +871,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
struct btree_iter iter = {}, bp_iter = {};
struct bkey_buf sk;
struct bkey_s_c k;
- struct data_update_opts data_opts;
- unsigned sectors_moved = 0;
struct bkey_buf last_flushed;
+ u64 check_mismatch_done = bucket_start;
int ret = 0;
- struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ struct bch_dev *ca = bch2_dev_tryget(c, dev);
if (!ca)
return 0;
- trace_bucket_evacuate(c, &bucket);
+ bucket_end = min(bucket_end, ca->mi.nbuckets);
+
+ struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start));
+ struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end));
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
@@ -732,15 +893,11 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
*/
bch2_trans_begin(trans);
- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
- bucket_pos_to_bp_start(ca, bucket), 0);
-
- bch_err_msg(c, ret, "looking up alloc key");
- if (ret)
- goto err;
+ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0);
ret = bch2_btree_write_buffer_tryflush(trans);
- bch_err_msg(c, ret, "flushing btree write buffer");
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "flushing btree write buffer");
if (ret)
goto err;
@@ -750,122 +907,187 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
bch2_trans_begin(trans);
- k = bch2_btree_iter_peek(&bp_iter);
+ k = bch2_btree_iter_peek(trans, &bp_iter);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
goto err;
- if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket)))
+ if (!k.k || bkey_gt(k.k->p, bp_end))
break;
+ if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
+ while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) {
+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
+ copygc, &last_flushed);
+ }
+ continue;
+ }
+
if (k.k->type != KEY_TYPE_backpointer)
goto next;
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
- if (!bp.v->level) {
- k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
- ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!k.k)
- goto next;
+ if (ctxt->stats)
+ ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;
+
+ if (!(data_types & BIT(bp.v->data_type)))
+ goto next;
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
+ if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes)
+ goto next;
+
+ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed);
+ ret = bkey_err(k);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret)
+ goto err;
+ if (!k.k)
+ goto next;
+ if (!bp.v->level) {
ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k);
if (ret) {
bch2_trans_iter_exit(trans, &iter);
continue;
}
+ }
- data_opts = _data_opts;
- data_opts.target = io_opts.background_target;
- data_opts.rewrite_ptrs = 0;
-
- unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */
- unsigned i = 0;
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
- if (p.ptr.dev == bucket.inode) {
- if (p.ptr.cached) {
- bch2_trans_iter_exit(trans, &iter);
- goto next;
- }
- data_opts.rewrite_ptrs |= 1U << i;
- break;
- }
- i++;
- }
+ struct data_update_opts data_opts = {};
+ bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts);
- ret = bch2_move_extent(ctxt, bucket_in_flight,
- &iter, k, io_opts, data_opts);
- bch2_trans_iter_exit(trans, &iter);
+ if (trace_io_move_pred_enabled())
+ trace_io_move_pred2(c, k, &io_opts, &data_opts,
+ pred, arg, p);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret == -ENOMEM) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(ctxt);
- continue;
- }
- if (ret)
- goto err;
+ if (!p) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto next;
+ }
- if (ctxt->stats)
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
- sectors_moved += sectors;
- } else {
- struct btree *b;
+ if (data_opts.scrub &&
+ !bch2_dev_idx_is_online(c, data_opts.read_dev)) {
+ bch2_trans_iter_exit(trans, &iter);
+ ret = bch_err_throw(c, device_offline);
+ break;
+ }
- b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed);
- ret = PTR_ERR_OR_ZERO(b);
- if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
- goto next;
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
- if (!b)
- goto next;
+ bch2_bkey_buf_reassemble(&sk, c, k);
+ k = bkey_i_to_s_c(sk.k);
- unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ /* move_extent will drop locks */
+ unsigned sectors = bp.v->bucket_len;
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
- bch2_trans_iter_exit(trans, &iter);
+ if (!bp.v->level)
+ ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts);
+ else if (!data_opts.scrub)
+ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level,
+ k.k->p, data_opts.target, 0);
+ else
+ ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
- if (ret)
- goto err;
+ bch2_trans_iter_exit(trans, &iter);
- if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate, sectors);
- if (ctxt->stats) {
- atomic64_add(sectors, &ctxt->stats->sectors_seen);
- atomic64_add(sectors, &ctxt->stats->sectors_moved);
- }
- sectors_moved += btree_sectors(c);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
+ if (ret == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(ctxt);
+ continue;
}
+ if (ret)
+ goto err;
+
+ if (ctxt->stats)
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
next:
- bch2_btree_iter_advance(&bp_iter);
+ bch2_btree_iter_advance(trans, &bp_iter);
}
- trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret);
+ while (check_mismatch_done < bucket_end)
+ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++,
+ copygc, &last_flushed);
err:
bch2_trans_iter_exit(trans, &bp_iter);
- bch2_dev_put(ca);
bch2_bkey_buf_exit(&sk, c);
bch2_bkey_buf_exit(&last_flushed, c);
+ bch2_dev_put(ca);
return ret;
}
+int bch2_move_data_phys(struct bch_fs *c,
+ unsigned dev,
+ u64 start,
+ u64 end,
+ unsigned data_types,
+ struct bch_ratelimit *rate,
+ struct bch_move_stats *stats,
+ struct write_point_specifier wp,
+ bool wait_on_copygc,
+ move_pred_fn pred, void *arg)
+{
+ struct moving_context ctxt;
+
+ bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans));
+
+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+ if (ctxt.stats) {
+ ctxt.stats->phys = true;
+ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys;
+ }
+
+ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end,
+ data_types, false, pred, arg);
+ bch2_moving_ctxt_exit(&ctxt);
+
+ return ret;
+}
+
+static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg,
+ enum btree_id btree, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct evacuate_bucket_arg *arg = _arg;
+
+ *data_opts = arg->data_opts;
+
+ unsigned i = 0;
+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
+ if (ptr->dev == arg->bucket.inode &&
+ (arg->gen < 0 || arg->gen == ptr->gen) &&
+ !ptr->cached)
+ data_opts->rewrite_ptrs |= BIT(i);
+ i++;
+ }
+
+ return data_opts->rewrite_ptrs != 0;
+}
+
+int bch2_evacuate_bucket(struct moving_context *ctxt,
+ struct move_bucket *bucket_in_flight,
+ struct bpos bucket, int gen,
+ struct data_update_opts data_opts)
+{
+ struct bch_fs *c = ctxt->trans->c;
+ struct evacuate_bucket_arg arg = { bucket, gen, data_opts, };
+
+ count_event(c, io_move_evacuate_bucket);
+ if (trace_io_move_evacuate_bucket_enabled())
+ trace_io_move_evacuate_bucket2(c, bucket, gen);
+
+ return __bch2_move_data_phys(ctxt, bucket_in_flight,
+ bucket.inode,
+ bucket.offset,
+ bucket.offset + 1,
+ ~0,
+ true,
+ evacuate_bucket_pred, &arg);
+}
+
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct btree *, struct bch_io_opts *,
struct data_update_opts *);
@@ -906,7 +1128,7 @@ static int bch2_move_btree(struct bch_fs *c,
retry:
ret = 0;
while (bch2_trans_begin(trans),
- (b = bch2_btree_iter_peek_node(&iter)) &&
+ (b = bch2_btree_iter_peek_node(trans, &iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
break;
@@ -920,13 +1142,13 @@ retry:
if (!pred(c, arg, b, &io_opts, &data_opts))
goto next;
- ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret;
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
continue;
if (ret)
break;
next:
- bch2_btree_iter_next_node(&iter);
+ bch2_btree_iter_next_node(trans, &iter);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
@@ -945,7 +1167,7 @@ next:
}
static bool rereplicate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -954,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
? c->opts.metadata_replicas
: io_opts->data_replicas;
- rcu_read_lock();
+ guard(rcu)();
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -964,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
data_opts->kill_ptrs |= BIT(i);
i++;
}
- rcu_read_unlock();
if (!data_opts->kill_ptrs &&
(!nr_good || nr_good >= replicas))
@@ -977,7 +1198,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg,
}
static bool migrate_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -1004,15 +1225,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
-}
-
-static bool migrate_btree_pred(struct bch_fs *c, void *arg,
- struct btree *b,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+ return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
/*
@@ -1068,7 +1281,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
}
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
+ enum btree_id btree, struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
@@ -1080,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct extent_ptr_decoded p;
unsigned i = 0;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
@@ -1091,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
i++;
}
- rcu_read_unlock();
return data_opts->kill_ptrs != 0;
}
@@ -1101,7 +1313,32 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
- return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+ return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key),
+ io_opts, data_opts);
+}
+
+static bool scrub_pred(struct bch_fs *c, void *_arg,
+ enum btree_id btree, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ struct bch_ioctl_data *arg = _arg;
+
+ if (k.k->type != KEY_TYPE_btree_ptr_v2) {
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (p.ptr.dev == arg->migrate.dev) {
+ if (!p.crc.csum_type)
+ return false;
+ break;
+ }
+ }
+
+ data_opts->scrub = true;
+ data_opts->read_dev = arg->migrate.dev;
+ return true;
}
int bch2_data_job(struct bch_fs *c,
@@ -1118,6 +1355,22 @@ int bch2_data_job(struct bch_fs *c,
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
switch (op.op) {
+ case BCH_DATA_OP_scrub:
+ /*
+ * prevent tests from spuriously failing, make sure we see all
+ * btree nodes that need to be repaired
+ */
+ bch2_btree_interior_updates_flush(c);
+
+ ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX,
+ op.scrub.data_types,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ false,
+ scrub_pred, &op) ?: ret;
+ break;
+
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
@@ -1137,14 +1390,14 @@ int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
- ret = bch2_move_btree(c, start, end,
- migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_move_data(c, start, end,
- NULL,
- stats,
- writepoint_hashed((unsigned long) current),
- true,
- migrate_pred, &op) ?: ret;
+ ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX,
+ ~0,
+ NULL,
+ stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ migrate_pred, &op) ?: ret;
+ bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_rewrite_old_nodes:
@@ -1176,17 +1429,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved));
- prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced));
- prt_printf(out, "bytes seen: ");
+ prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved));
+ prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced));
+ prt_printf(out, "bytes seen:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
prt_newline(out);
- prt_printf(out, "bytes moved: ");
+ prt_printf(out, "bytes moved:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
prt_newline(out);
- prt_printf(out, "bytes raced: ");
+ prt_printf(out, "bytes raced:\t");
prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
prt_newline(out);
@@ -1195,7 +1448,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
{
- struct moving_io *io;
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 32);
bch2_move_stats_to_text(out, ctxt->stats);
printbuf_indent_add(out, 2);
@@ -1215,8 +1469,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str
printbuf_indent_add(out, 2);
mutex_lock(&ctxt->lock);
+ struct moving_io *io;
list_for_each_entry(io, &ctxt->ios, io_list)
- bch2_write_op_to_text(out, &io->write.op);
+ bch2_data_update_inflight_to_text(out, &io->write);
mutex_unlock(&ctxt->lock);
printbuf_indent_sub(out, 4);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 51e0505a8156..86b80499ac55 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -72,7 +72,7 @@ do { \
break; \
} while (1)
-typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
+typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
extern const char * const bch2_data_ops_strs[];
@@ -116,12 +116,18 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
int bch2_move_extent(struct moving_context *,
- struct move_bucket_in_flight *,
+ struct move_bucket *,
struct btree_iter *,
struct bkey_s_c,
struct bch_io_opts,
struct data_update_opts);
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+ struct per_snapshot_io_opts *, struct bpos,
+ struct btree_iter *, struct bkey_s_c);
+
+int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
+ move_pred_fn, void *, enum btree_id, unsigned);
int __bch2_move_data(struct moving_context *,
struct bbpos,
struct bbpos,
@@ -135,8 +141,13 @@ int bch2_move_data(struct bch_fs *,
bool,
move_pred_fn, void *);
+int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
+ struct bch_ratelimit *, struct bch_move_stats *,
+ struct write_point_specifier, bool,
+ move_pred_fn, void *);
+
int bch2_evacuate_bucket(struct moving_context *,
- struct move_bucket_in_flight *,
+ struct move_bucket *,
struct bpos, int,
struct data_update_opts);
int bch2_data_job(struct bch_fs *,
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
index e22841ef31e4..c5c62cd600de 100644
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
@@ -3,33 +3,43 @@
#define _BCACHEFS_MOVE_TYPES_H
#include "bbpos_types.h"
+#include "bcachefs_ioctl.h"
struct bch_move_stats {
- enum bch_data_type data_type;
- struct bbpos pos;
char name[32];
+ bool phys;
+ enum bch_ioctl_data_event_ret ret;
+
+ union {
+ struct {
+ enum bch_data_type data_type;
+ struct bbpos pos;
+ };
+ struct {
+ unsigned dev;
+ u64 offset;
+ };
+ };
atomic64_t keys_moved;
atomic64_t keys_raced;
atomic64_t sectors_seen;
atomic64_t sectors_moved;
atomic64_t sectors_raced;
+ atomic64_t sectors_error_corrected;
+ atomic64_t sectors_error_uncorrected;
};
struct move_bucket_key {
struct bpos bucket;
- u8 gen;
+ unsigned gen;
};
struct move_bucket {
+ struct move_bucket *next;
+ struct rhash_head hash;
struct move_bucket_key k;
unsigned sectors;
-};
-
-struct move_bucket_in_flight {
- struct move_bucket_in_flight *next;
- struct rhash_head hash;
- struct move_bucket bucket;
atomic_t count;
};
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 21805509ab9e..5e6de91a8763 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
@@ -27,93 +28,87 @@
#include <linux/wait.h>
struct buckets_in_flight {
- struct rhashtable table;
- struct move_bucket_in_flight *first;
- struct move_bucket_in_flight *last;
- size_t nr;
- size_t sectors;
+ struct rhashtable *table;
+ struct move_bucket *first;
+ struct move_bucket *last;
+ size_t nr;
+ size_t sectors;
+
+ DARRAY(struct move_bucket *) to_evacuate;
};
static const struct rhashtable_params bch_move_bucket_params = {
- .head_offset = offsetof(struct move_bucket_in_flight, hash),
- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
+ .head_offset = offsetof(struct move_bucket, hash),
+ .key_offset = offsetof(struct move_bucket, k),
.key_len = sizeof(struct move_bucket_key),
.automatic_shrinking = true,
};
-static struct move_bucket_in_flight *
-move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
+static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
{
- struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
- int ret;
-
- if (!new)
- return ERR_PTR(-ENOMEM);
-
- new->bucket = b;
-
- ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
- bch_move_bucket_params);
- if (ret) {
- kfree(new);
- return ERR_PTR(ret);
- }
-
if (!list->first)
- list->first = new;
+ list->first = b;
else
- list->last->next = new;
+ list->last->next = b;
- list->last = new;
+ list->last = b;
list->nr++;
- list->sectors += b.sectors;
- return new;
+ list->sectors += b->sectors;
}
static int bch2_bucket_is_movable(struct btree_trans *trans,
struct move_bucket *b, u64 time)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 _a;
- const struct bch_alloc_v4 *a;
- int ret;
- if (bch2_bucket_is_open(trans->c,
- b->k.bucket.inode,
- b->k.bucket.offset))
+ if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
return 0;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
- b->k.bucket, BTREE_ITER_cached);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+ b->k.bucket, BTREE_ITER_cached);
+ int ret = bkey_err(k);
if (ret)
return ret;
- struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
+ struct bch_dev *ca = bch2_dev_bucket_tryget(c, k.k->p);
if (!ca)
goto out;
- a = bch2_alloc_to_v4(k, &_a);
+ if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
+ goto out;
+
+ if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !bch2_dev_is_online(ca))
+ goto out;
+
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
b->sectors = bch2_bucket_sectors_dirty(*a);
u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
ret = lru_idx && lru_idx <= time;
-
- bch2_dev_put(ca);
out:
+ bch2_dev_put(ca);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
+static void move_bucket_free(struct buckets_in_flight *list,
+ struct move_bucket *b)
+{
+ int ret = rhashtable_remove_fast(list->table, &b->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret);
+ kfree(b);
+}
+
static void move_buckets_wait(struct moving_context *ctxt,
struct buckets_in_flight *list,
bool flush)
{
- struct move_bucket_in_flight *i;
- int ret;
+ struct move_bucket *i;
while ((i = list->first)) {
if (flush)
@@ -127,12 +122,9 @@ static void move_buckets_wait(struct moving_context *ctxt,
list->last = NULL;
list->nr--;
- list->sectors -= i->bucket.sectors;
+ list->sectors -= i->sectors;
- ret = rhashtable_remove_fast(&list->table, &i->hash,
- bch_move_bucket_params);
- BUG_ON(ret);
- kfree(i);
+ move_bucket_free(list, i);
}
bch2_trans_unlock_long(ctxt->trans);
@@ -141,14 +133,11 @@ static void move_buckets_wait(struct moving_context *ctxt,
static bool bucket_in_flight(struct buckets_in_flight *list,
struct move_bucket_key k)
{
- return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
+ return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
}
-typedef DARRAY(struct move_bucket) move_buckets;
-
static int bch2_copygc_get_buckets(struct moving_context *ctxt,
- struct buckets_in_flight *buckets_in_flight,
- move_buckets *buckets)
+ struct buckets_in_flight *buckets_in_flight)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@@ -165,11 +154,9 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
return ret;
- bch2_trans_begin(trans);
-
ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru,
- lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
- lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0),
+ lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX),
0, k, ({
struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
int ret2 = 0;
@@ -185,20 +172,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
- ret2 = darray_push(buckets, b);
+ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL);
+ ret2 = b_i ? 0 : -ENOMEM;
if (ret2)
goto err;
+
+ *b_i = b;
+
+ ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i);
+ if (ret2) {
+ kfree(b_i);
+ goto err;
+ }
+
+ ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash,
+ bch_move_bucket_params);
+ BUG_ON(ret2);
+
sectors += b.sectors;
}
- ret2 = buckets->nr >= nr_to_get;
+ ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get;
err:
ret2;
}));
pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
buckets_in_flight->nr, buckets_in_flight->sectors,
- saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret);
+ saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret);
return ret < 0 ? ret : 0;
}
@@ -213,40 +214,30 @@ static int bch2_copygc(struct moving_context *ctxt,
struct data_update_opts data_opts = {
.btree_insert_flags = BCH_WATERMARK_copygc,
};
- move_buckets buckets = { 0 };
- struct move_bucket_in_flight *f;
u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen);
u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
+ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight);
if (ret)
goto err;
- darray_for_each(buckets, i) {
+ darray_for_each(buckets_in_flight->to_evacuate, i) {
if (kthread_should_stop() || freezing(current))
break;
- f = move_bucket_in_flight_add(buckets_in_flight, *i);
- ret = PTR_ERR_OR_ZERO(f);
- if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
- ret = 0;
- continue;
- }
- if (ret == -ENOMEM) { /* flush IO, continue later */
- ret = 0;
- break;
- }
+ struct move_bucket *b = *i;
+ *i = NULL;
- ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
- f->bucket.k.gen, data_opts);
+ move_bucket_in_flight_add(buckets_in_flight, b);
+
+ ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts);
if (ret)
goto err;
*did_work = true;
}
err:
-
/* no entries in LRU btree found, or got to end: */
if (bch2_err_matches(ret, ENOENT))
ret = 0;
@@ -256,12 +247,34 @@ err:
sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen;
sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved;
- trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved);
+ trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved);
- darray_exit(&buckets);
+ darray_for_each(buckets_in_flight->to_evacuate, i)
+ if (*i)
+ move_bucket_free(buckets_in_flight, *i);
+ darray_exit(&buckets_in_flight->to_evacuate);
return ret;
}
+static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca)
+{
+ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca);
+ struct bch_dev_usage usage;
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ usage.buckets[i] = usage_full.d[i].buckets;
+
+ s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
+ ca->mi.bucket_size) >> 1);
+ s64 fragmented = 0;
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ if (data_type_movable(i))
+ fragmented += usage_full.d[i].fragmented;
+
+ return max(0LL, fragmented_allowed - fragmented);
+}
+
/*
* Copygc runs when the amount of fragmented data is above some arbitrary
* threshold:
@@ -276,24 +289,13 @@ err:
* often and continually reduce the amount of fragmented space as the device
* fills up. So, we increase the threshold by half the current free space.
*/
-unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
+u64 bch2_copygc_wait_amount(struct bch_fs *c)
{
- s64 wait = S64_MAX, fragmented_allowed, fragmented;
-
- for_each_rw_member(c, ca) {
- struct bch_dev_usage usage = bch2_dev_usage_read(ca);
-
- fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
- ca->mi.bucket_size) >> 1);
- fragmented = 0;
-
- for (unsigned i = 0; i < BCH_DATA_NR; i++)
- if (data_type_movable(i))
- fragmented += usage.d[i].fragmented;
-
- wait = min(wait, max(0LL, fragmented_allowed - fragmented));
- }
+ u64 wait = U64_MAX;
+ guard(rcu)();
+ for_each_rw_member_rcu(c, ca)
+ wait = min(wait, bch2_copygc_dev_wait_amount(ca));
return wait;
}
@@ -315,9 +317,28 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
c->copygc_wait_at) << 9);
prt_newline(out);
- prt_printf(out, "Currently calculated wait:\t");
- prt_human_readable_u64(out, bch2_copygc_wait_amount(c));
- prt_newline(out);
+ bch2_printbuf_make_room(out, 4096);
+
+ struct task_struct *t;
+ out->atomic++;
+ scoped_guard(rcu) {
+ prt_printf(out, "Currently calculated wait:\n");
+ for_each_rw_member_rcu(c, ca) {
+ prt_printf(out, " %s:\t", ca->name);
+ prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca));
+ prt_newline(out);
+ }
+
+ t = rcu_dereference(c->copygc_thread);
+ if (t)
+ get_task_struct(t);
+ }
+ --out->atomic;
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
}
static int bch2_copygc_thread(void *arg)
@@ -326,22 +347,26 @@ static int bch2_copygc_thread(void *arg)
struct moving_context ctxt;
struct bch_move_stats move_stats;
struct io_clock *clock = &c->io_clock[WRITE];
- struct buckets_in_flight *buckets;
+ struct buckets_in_flight buckets = {};
u64 last, wait;
- int ret = 0;
- buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
- if (!buckets)
- return -ENOMEM;
- ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL);
+ int ret = !buckets.table
+ ? -ENOMEM
+ : rhashtable_init(buckets.table, &bch_move_bucket_params);
bch_err_msg(c, ret, "allocating copygc buckets in flight");
- if (ret) {
- kfree(buckets);
- return ret;
- }
+ if (ret)
+ goto err;
set_freezable();
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ */
+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_should_stop());
+
bch2_move_stats_init(&move_stats, "copygc");
bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
writepoint_ptr(&c->copygc_write_point),
@@ -354,13 +379,13 @@ static int bch2_copygc_thread(void *arg)
cond_resched();
if (!c->opts.copygc_enabled) {
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
kthread_wait_freezable(c->opts.copygc_enabled ||
kthread_should_stop());
}
if (unlikely(freezing(current))) {
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
__refrigerator(false);
continue;
}
@@ -371,7 +396,7 @@ static int bch2_copygc_thread(void *arg)
if (wait > clock->max_slop) {
c->copygc_wait_at = last;
c->copygc_wait = last + wait;
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
trace_and_count(c, copygc_wait, c, wait, last + wait);
bch2_kthread_io_clock_wait(clock, last + wait,
MAX_SCHEDULE_TIMEOUT);
@@ -381,7 +406,7 @@ static int bch2_copygc_thread(void *arg)
c->copygc_wait = 0;
c->copygc_running = true;
- ret = bch2_copygc(&ctxt, buckets, &did_work);
+ ret = bch2_copygc(&ctxt, &buckets, &did_work);
c->copygc_running = false;
wake_up(&c->copygc_running_wq);
@@ -392,20 +417,19 @@ static int bch2_copygc_thread(void *arg)
if (min_member_capacity == U64_MAX)
min_member_capacity = 128 * 2048;
- move_buckets_wait(&ctxt, buckets, true);
+ move_buckets_wait(&ctxt, &buckets, true);
bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
MAX_SCHEDULE_TIMEOUT);
}
}
- move_buckets_wait(&ctxt, buckets, true);
-
- rhashtable_destroy(&buckets->table);
- kfree(buckets);
+ move_buckets_wait(&ctxt, &buckets, true);
+ rhashtable_destroy(buckets.table);
bch2_moving_ctxt_exit(&ctxt);
bch2_move_stats_exit(&move_stats, c);
-
- return 0;
+err:
+ kfree(buckets.table);
+ return ret;
}
void bch2_copygc_stop(struct bch_fs *c)
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
index ea181fef5bc9..f615910d6f98 100644
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
@@ -2,9 +2,17 @@
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
-unsigned long bch2_copygc_wait_amount(struct bch_fs *);
+u64 bch2_copygc_wait_amount(struct bch_fs *);
void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+static inline void bch2_copygc_wakeup(struct bch_fs *c)
+{
+ guard(rcu)();
+ struct task_struct *p = rcu_dereference(c->copygc_thread);
+ if (p)
+ wake_up_process(p);
+}
+
void bch2_copygc_stop(struct bch_fs *);
int bch2_copygc_start(struct bch_fs *);
void bch2_fs_copygc_init(struct bch_fs *);
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c
index d70d9f634cea..c3f87c59922d 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/namei.c
@@ -4,13 +4,21 @@
#include "acl.h"
#include "btree_update.h"
#include "dirent.h"
-#include "fs-common.h"
#include "inode.h"
+#include "namei.h"
#include "subvolume.h"
#include "xattr.h"
#include <linux/posix_acl.h>
+static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
+{
+ return (subvol_inum) {
+ .subvol = inode->bi_parent_subvol ?: inum.subvol,
+ .inum = inode->bi_dir,
+ };
+}
+
static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
{
return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
@@ -28,8 +36,8 @@ int bch2_create_trans(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter inode_iter = {};
subvol_inum new_inum = dir;
u64 now = bch2_current_time(c);
u64 cpu = raw_smp_processor_id();
@@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans,
if (!(flags & BCH_CREATE_SNAPSHOT)) {
/* Normal create path - allocate a new inode: */
- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+ bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
if (flags & BCH_CREATE_TMPFILE)
new_inode->bi_flags |= BCH_INODE_unlinked;
@@ -123,8 +131,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
- ret = bch2_btree_iter_traverse(&dir_iter);
+ bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dir_iter);
if (ret)
goto err;
}
@@ -152,18 +160,14 @@ int bch2_create_trans(struct btree_trans *trans,
if (is_subdir_for_nlink(new_inode))
dir_u->bi_nlink++;
dir_u->bi_mtime = dir_u->bi_ctime = now;
- dir_u->bi_size += dirent_occupied_size(name);
-
- ret = bch2_inode_write(trans, &dir_iter, dir_u);
- if (ret)
- goto err;
- ret = bch2_dirent_create(trans, dir, &dir_hash,
- dir_type,
- name,
- dir_target,
- &dir_offset,
- STR_HASH_must_create|BTREE_ITER_with_updates);
+ ret = bch2_dirent_create(trans, dir, &dir_hash,
+ dir_type,
+ name,
+ dir_target,
+ &dir_offset,
+ STR_HASH_must_create|BTREE_ITER_with_updates) ?:
+ bch2_inode_write(trans, &dir_iter, dir_u);
if (ret)
goto err;
@@ -171,14 +175,24 @@ int bch2_create_trans(struct btree_trans *trans,
new_inode->bi_dir_offset = dir_offset;
}
+ if (S_ISDIR(mode)) {
+ ret = bch2_maybe_propagate_has_case_insensitive(trans,
+ (subvol_inum) {
+ new_inode->bi_subvol ?: dir.subvol,
+ new_inode->bi_inum },
+ new_inode);
+ if (ret)
+ goto err;
+ }
+
if (S_ISDIR(mode) &&
!new_inode->bi_subvol)
new_inode->bi_depth = dir_u->bi_depth + 1;
inode_iter.flags &= ~BTREE_ITER_all_snapshots;
- bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+ bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
- ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
bch2_inode_write(trans, &inode_iter, new_inode);
err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -192,8 +206,8 @@ int bch2_link_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter inode_iter = {};
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(c);
u64 dir_offset = 0;
@@ -221,13 +235,13 @@ int bch2_link_trans(struct btree_trans *trans,
}
dir_u->bi_mtime = dir_u->bi_ctime = now;
- dir_u->bi_size += dirent_occupied_size(name);
dir_hash = bch2_hash_info_init(c, dir_u);
ret = bch2_dirent_create(trans, dir, &dir_hash,
mode_to_type(inode_u->bi_mode),
- name, inum.inum, &dir_offset,
+ name, inum.inum,
+ &dir_offset,
STR_HASH_must_create);
if (ret)
goto err;
@@ -251,9 +265,9 @@ int bch2_unlink_trans(struct btree_trans *trans,
bool deleting_subvol)
{
struct bch_fs *c = trans->c;
- struct btree_iter dir_iter = { NULL };
- struct btree_iter dirent_iter = { NULL };
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter dir_iter = {};
+ struct btree_iter dirent_iter = {};
+ struct btree_iter inode_iter = {};
struct bch_hash_info dir_hash;
subvol_inum inum;
u64 now = bch2_current_time(c);
@@ -283,7 +297,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
}
if (deleting_subvol && !inode_u->bi_subvol) {
- ret = -BCH_ERR_ENOENT_not_subvol;
+ ret = bch_err_throw(c, ENOENT_not_subvol);
goto err;
}
@@ -299,7 +313,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
if (ret)
goto err;
- k = bch2_btree_iter_peek_slot(&dirent_iter);
+ k = bch2_btree_iter_peek_slot(trans, &dirent_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -308,8 +322,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
* If we're deleting a subvolume, we need to really delete the
* dirent, not just emit a whiteout in the current snapshot:
*/
- bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
- ret = bch2_btree_iter_traverse(&dirent_iter);
+ bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot);
+ ret = bch2_btree_iter_traverse(trans, &dirent_iter);
if (ret)
goto err;
} else {
@@ -324,7 +338,6 @@ int bch2_unlink_trans(struct btree_trans *trans,
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
- dir_u->bi_size -= dirent_occupied_size(name);
ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
&dir_hash, &dirent_iter,
@@ -346,6 +359,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
bool ret = false;
for (id = 0; id < Inode_opt_nr; id++) {
+ if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold)
+ continue;
+
/* Skip attributes that were explicitly set on this inode */
if (dst_u->bi_fields_set & (1 << id))
continue;
@@ -389,10 +405,10 @@ int bch2_rename_trans(struct btree_trans *trans,
enum bch_rename_mode mode)
{
struct bch_fs *c = trans->c;
- struct btree_iter src_dir_iter = { NULL };
- struct btree_iter dst_dir_iter = { NULL };
- struct btree_iter src_inode_iter = { NULL };
- struct btree_iter dst_inode_iter = { NULL };
+ struct btree_iter src_dir_iter = {};
+ struct btree_iter dst_dir_iter = {};
+ struct btree_iter src_inode_iter = {};
+ struct btree_iter dst_inode_iter = {};
struct bch_hash_info src_hash, dst_hash;
subvol_inum src_inum, dst_inum;
u64 src_offset, dst_offset;
@@ -406,8 +422,7 @@ int bch2_rename_trans(struct btree_trans *trans,
src_hash = bch2_hash_info_init(c, src_dir_u);
- if (dst_dir.inum != src_dir.inum ||
- dst_dir.subvol != src_dir.subvol) {
+ if (!subvol_inum_eq(dst_dir, src_dir)) {
ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
BTREE_ITER_intent);
if (ret)
@@ -463,14 +478,6 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (mode == BCH_RENAME) {
- src_dir_u->bi_size -= dirent_occupied_size(src_name);
- dst_dir_u->bi_size += dirent_occupied_size(dst_name);
- }
-
- if (mode == BCH_RENAME_OVERWRITE)
- src_dir_u->bi_size -= dirent_occupied_size(src_name);
-
if (src_inode_u->bi_parent_subvol)
src_inode_u->bi_parent_subvol = dst_dir.subvol;
@@ -507,32 +514,41 @@ int bch2_rename_trans(struct btree_trans *trans,
}
}
- if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
+ if (!subvol_inum_eq(dst_dir, src_dir)) {
+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
- if (mode == BCH_RENAME_EXCHANGE &&
- bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode)) {
- ret = -EXDEV;
- goto err;
- }
+ if (mode == BCH_RENAME_EXCHANGE &&
+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
- if (is_subdir_for_nlink(src_inode_u)) {
- src_dir_u->bi_nlink--;
- dst_dir_u->bi_nlink++;
- }
+ ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?:
+ (mode == BCH_RENAME_EXCHANGE
+ ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u)
+ : 0);
+ if (ret)
+ goto err;
- if (S_ISDIR(src_inode_u->bi_mode) &&
- !src_inode_u->bi_subvol)
- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
+ if (is_subdir_for_nlink(src_inode_u)) {
+ src_dir_u->bi_nlink--;
+ dst_dir_u->bi_nlink++;
+ }
- if (mode == BCH_RENAME_EXCHANGE &&
- S_ISDIR(dst_inode_u->bi_mode) &&
- !dst_inode_u->bi_subvol)
- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
+ if (S_ISDIR(src_inode_u->bi_mode) &&
+ !src_inode_u->bi_subvol)
+ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1;
+
+ if (mode == BCH_RENAME_EXCHANGE &&
+ S_ISDIR(dst_inode_u->bi_mode) &&
+ !dst_inode_u->bi_subvol)
+ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1;
+ }
if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
dst_dir_u->bi_nlink--;
@@ -571,6 +587,8 @@ err:
return ret;
}
+/* inum_to_path */
+
static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n)
{
bch2_printbuf_make_room(out, n);
@@ -601,32 +619,54 @@ static inline void reverse_bytes(void *b, size_t n)
}
}
-/* XXX: we don't yet attempt to print paths when we don't know the subvol */
-int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path)
+static int __bch2_inum_to_path(struct btree_trans *trans,
+ u32 subvol, u64 inum, u32 snapshot,
+ struct printbuf *path)
{
unsigned orig_pos = path->pos;
int ret = 0;
+ DARRAY(subvol_inum) inums = {};
- while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL &&
- inum.inum == BCACHEFS_ROOT_INO)) {
- struct bch_inode_unpacked inode;
- ret = bch2_inode_find_by_inum_trans(trans, inum, &inode);
+ if (!snapshot) {
+ ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot);
if (ret)
goto disconnected;
+ }
- if (!inode.bi_dir && !inode.bi_dir_offset) {
- ret = -BCH_ERR_ENOENT_inode_no_backpointer;
- goto disconnected;
+ while (true) {
+ subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum };
+
+ if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) {
+ prt_str_reversed(path, "(loop)");
+ break;
}
- inum.subvol = inode.bi_parent_subvol ?: inum.subvol;
- inum.inum = inode.bi_dir;
+ ret = darray_push(&inums, n);
+ if (ret)
+ goto err;
- u32 snapshot;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0);
if (ret)
goto disconnected;
+ if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL &&
+ inode.bi_inum == BCACHEFS_ROOT_INO)
+ break;
+
+ if (!inode.bi_dir && !inode.bi_dir_offset) {
+ ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer);
+ goto disconnected;
+ }
+
+ inum = inode.bi_dir;
+ if (inode.bi_parent_subvol) {
+ subvol = inode.bi_parent_subvol;
+ ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot);
+ if (ret)
+ goto disconnected;
+ }
+
struct btree_iter d_iter;
struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter,
BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot),
@@ -636,6 +676,7 @@ int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printb
goto disconnected;
struct qstr dirent_name = bch2_dirent_get_name(d);
+
prt_bytes_reversed(path, dirent_name.name, dirent_name.len);
prt_char(path, '/');
@@ -651,8 +692,10 @@ out:
goto err;
reverse_bytes(path->buf + orig_pos, path->pos - orig_pos);
+ darray_exit(&inums);
return 0;
err:
+ darray_exit(&inums);
return ret;
disconnected:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -661,3 +704,331 @@ disconnected:
prt_str_reversed(path, "(disconnected)");
goto out;
}
+
+int bch2_inum_to_path(struct btree_trans *trans,
+ subvol_inum inum,
+ struct printbuf *path)
+{
+ return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path);
+}
+
+int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot,
+ snapshot_id_list *snapshot_overwrites,
+ struct printbuf *path)
+{
+ return __bch2_inum_to_path(trans, 0, inum, snapshot, path);
+}
+
+/* fsck */
+
+static int bch2_check_dirent_inode_dirent(struct btree_trans *trans,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = {};
+ int ret = 0;
+
+ if (inode_points_to_dirent(target, d))
+ return 0;
+
+ if (!bch2_inode_has_backpointer(target)) {
+ fsck_err_on(S_ISDIR(target->bi_mode),
+ trans, inode_dir_missing_backpointer,
+ "directory with missing backpointer\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+ trans, inode_unlinked_but_has_dirent,
+ "inode unlinked but has dirent\n%s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c),
+ prt_printf(&buf, "\n"),
+ bch2_inode_unpacked_to_text(&buf, target),
+ buf.buf));
+
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ return __bch2_fsck_write_inode(trans, target);
+ }
+
+ struct bkey_s_c_dirent bp_dirent =
+ bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents,
+ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot),
+ 0, dirent);
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ goto err;
+
+ bool backpointer_exists = !ret;
+ ret = 0;
+
+ if (!backpointer_exists) {
+ if (fsck_err(trans, inode_wrong_backpointer,
+ "inode %llu:%u has wrong backpointer:\n"
+ "got %llu:%llu\n"
+ "should be %llu:%llu",
+ target->bi_inum, target->bi_snapshot,
+ target->bi_dir,
+ target->bi_dir_offset,
+ d.k->p.inode,
+ d.k->p.offset)) {
+ target->bi_dir = d.k->p.inode;
+ target->bi_dir_offset = d.k->p.offset;
+ ret = __bch2_fsck_write_inode(trans, target);
+ }
+ } else {
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+ if (S_ISDIR(target->bi_mode) || target->bi_subvol) {
+ /*
+ * XXX: verify connectivity of the other dirent
+ * up to the root before removing this one
+ *
+ * Additionally, bch2_lookup would need to cope with the
+ * dirent it found being removed - or should we remove
+ * the other one, even though the inode points to it?
+ */
+ if (in_fsck) {
+ if (fsck_err(trans, inode_dir_multiple_links,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf))
+ ret = bch2_fsck_remove_dirent(trans, d.k->p);
+ } else {
+ bch2_fs_inconsistent(c,
+ "%s %llu:%u with multiple links\n%s",
+ S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+ target->bi_inum, target->bi_snapshot, buf.buf);
+ }
+
+ goto out;
+ } else {
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
+ if (fsck_err_on(!target->bi_nlink,
+ trans, inode_multiple_links_but_nlink_0,
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+ target->bi_nlink++;
+ target->bi_flags &= ~BCH_INODE_unlinked;
+ ret = __bch2_fsck_write_inode(trans, target);
+ if (ret)
+ goto err;
+ }
+ }
+ }
+out:
+err:
+fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+int __bch2_check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *dirent_iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck);
+ if (ret)
+ goto err;
+
+ if (fsck_err_on(d.v->d_type != inode_d_type(target),
+ trans, dirent_d_type_wrong,
+ "incorrect d_type: got %s, should be %s:\n%s",
+ bch2_d_type_str(d.v->d_type),
+ bch2_d_type_str(inode_d_type(target)),
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+ struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto err;
+
+ bkey_reassemble(&n->k_i, d.s_c);
+ n->v.d_type = inode_d_type(target);
+ if (n->v.d_type == DT_SUBVOL) {
+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+ n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+ } else {
+ n->v.d_inum = cpu_to_le64(target->bi_inum);
+ }
+
+ ret = bch2_trans_update(trans, dirent_iter, &n->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ goto err;
+ }
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+}
+
+/*
+ * BCH_INODE_has_case_insensitive:
+ * We have to track whether directories have any descendent directory that is
+ * casefolded - for overlayfs:
+ */
+
+static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum)
+{
+ struct btree_iter iter = {};
+ int ret = 0;
+
+ while (true) {
+ struct bch_inode_unpacked inode;
+ ret = bch2_inode_peek(trans, &iter, &inode, inum,
+ BTREE_ITER_intent|BTREE_ITER_with_updates);
+ if (ret)
+ break;
+
+ if (inode.bi_flags & BCH_INODE_has_case_insensitive)
+ break;
+
+ inode.bi_flags |= BCH_INODE_has_case_insensitive;
+ ret = bch2_inode_write(trans, &iter, &inode);
+ if (ret)
+ break;
+
+ bch2_trans_iter_exit(trans, &iter);
+ if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM))
+ break;
+
+ inum = parent_inum(inum, &inode);
+ }
+
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum,
+ struct bch_inode_unpacked *inode)
+{
+ if (!bch2_inode_casefold(trans->c, inode))
+ return 0;
+
+ inode->bi_flags |= BCH_INODE_has_case_insensitive;
+
+ return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode));
+}
+
+int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode,
+ snapshot_id_list *snapshot_overwrites,
+ bool *do_update)
+{
+ struct printbuf buf = PRINTBUF;
+ bool repairing_parents = false;
+ int ret = 0;
+
+ if (!S_ISDIR(inode->bi_mode)) {
+ /*
+ * Old versions set bi_casefold for non dirs, but that's
+ * unnecessary and wasteful
+ */
+ if (inode->bi_casefold) {
+ inode->bi_casefold = 0;
+ *do_update = true;
+ }
+ return 0;
+ }
+
+ if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive)
+ return 0;
+
+ if (bch2_inode_casefold(trans->c, inode) &&
+ !(inode->bi_flags & BCH_INODE_has_case_insensitive)) {
+ prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
+ inode->bi_inum, inode->bi_snapshot);
+
+ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
+ snapshot_overwrites, &buf);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
+ inode->bi_flags |= BCH_INODE_has_case_insensitive;
+ *do_update = true;
+ }
+ }
+
+ if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
+ goto out;
+
+ struct bch_inode_unpacked dir = *inode;
+ u32 snapshot = dir.bi_snapshot;
+
+ while (!(dir.bi_inum == BCACHEFS_ROOT_INO &&
+ dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+ if (dir.bi_parent_subvol) {
+ ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
+ if (ret)
+ goto err;
+
+ snapshot_overwrites = NULL;
+ }
+
+ ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
+ if (ret)
+ goto err;
+
+ if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
+ prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
+
+ ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
+ snapshot_overwrites, &buf);
+ if (ret)
+ goto err;
+
+ if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
+ dir.bi_flags |= BCH_INODE_has_case_insensitive;
+ ret = __bch2_fsck_write_inode(trans, &dir);
+ if (ret)
+ goto err;
+ }
+ }
+
+ /*
+ * We only need to check the first parent, unless we find an
+ * inconsistency
+ */
+ if (!repairing_parents)
+ break;
+ }
+out:
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ if (ret)
+ return ret;
+
+ if (repairing_parents) {
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ }
+
+ return 0;
+}
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h
index 2b59210bb5e8..ae6ebc2d0785 100644
--- a/fs/bcachefs/fs-common.h
+++ b/fs/bcachefs/namei.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_FS_COMMON_H
-#define _BCACHEFS_FS_COMMON_H
+#ifndef _BCACHEFS_NAMEI_H
+#define _BCACHEFS_NAMEI_H
#include "dirent.h"
@@ -43,5 +43,37 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
struct bch_inode_unpacked *);
int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *);
+int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32,
+ snapshot_id_list *, struct printbuf *);
-#endif /* _BCACHEFS_FS_COMMON_H */
+int __bch2_check_dirent_target(struct btree_trans *,
+ struct btree_iter *,
+ struct bkey_s_c_dirent,
+ struct bch_inode_unpacked *, bool);
+
+static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+ struct bkey_s_c_dirent d)
+{
+ return inode->bi_dir == d.k->p.inode &&
+ inode->bi_dir_offset == d.k->p.offset;
+}
+
+static inline int bch2_check_dirent_target(struct btree_trans *trans,
+ struct btree_iter *dirent_iter,
+ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *target,
+ bool in_fsck)
+{
+ if (likely(inode_points_to_dirent(target, d) &&
+ d.v->d_type == inode_d_type(target)))
+ return 0;
+
+ return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck);
+}
+
+int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum,
+ struct bch_inode_unpacked *);
+int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *,
+ snapshot_id_list *, bool *);
+
+#endif /* _BCACHEFS_NAMEI_H */
diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c
index 3c21981a4a1c..962218fa68ec 100644
--- a/fs/bcachefs/nocow_locking.c
+++ b/fs/bcachefs/nocow_locking.c
@@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c)
BUG_ON(atomic_read(&l->l[j]));
}
-int bch2_fs_nocow_locking_init(struct bch_fs *c)
+void bch2_fs_nocow_locking_init_early(struct bch_fs *c)
{
struct bucket_nocow_lock_table *t = &c->nocow_locks;
for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
spin_lock_init(&l->lock);
-
- return 0;
}
diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h
index f9d6a426a960..48b8a003c0d2 100644
--- a/fs/bcachefs/nocow_locking.h
+++ b/fs/bcachefs/nocow_locking.h
@@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
void bch2_fs_nocow_locking_exit(struct bch_fs *);
-int bch2_fs_nocow_locking_init(struct bch_fs *);
+void bch2_fs_nocow_locking_init_early(struct bch_fs *);
#endif /* _BCACHEFS_NOCOW_LOCKING_H */
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 6772faf385a5..b1cf88905b81 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,7 +7,9 @@
#include "compress.h"
#include "disk_groups.h"
#include "error.h"
+#include "movinggc.h"
#include "opts.h"
+#include "rebalance.h"
#include "recovery_passes.h"
#include "super-io.h"
#include "util.h"
@@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = {
NULL
};
+const char * const bch2_degraded_actions[] = {
+ BCH_DEGRADED_ACTIONS()
+ NULL
+};
+
const char * const bch2_fsck_fix_opts[] = {
BCH_FIX_ERRORS_OPTS()
NULL
@@ -44,7 +51,7 @@ const char * const __bch2_btree_ids[] = {
NULL
};
-static const char * const __bch2_csum_types[] = {
+const char * const __bch2_csum_types[] = {
BCH_CSUM_TYPES()
NULL
};
@@ -163,16 +170,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = {
[DT_SUBVOL] = "subvol",
};
-u64 BCH2_NO_SB_OPT(const struct bch_sb *sb)
-{
- BUG();
-}
-
-void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v)
-{
- BUG();
-}
-
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define x(_name, ...) \
@@ -223,6 +220,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
}
}
+/* dummy option, for options that aren't stored in the superblock */
+typedef u64 (*sb_opt_get_fn)(const struct bch_sb *);
+typedef void (*sb_opt_set_fn)(struct bch_sb *, u64);
+typedef u64 (*member_opt_get_fn)(const struct bch_member *);
+typedef void (*member_opt_set_fn)(struct bch_member *, u64);
+
+__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL;
+__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL;
+__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL;
+
+#define type_compatible_or_null(_p, _type) \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL)
+
const struct bch_option bch2_opt_table[] = {
#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
@@ -239,15 +251,15 @@ const struct bch_option bch2_opt_table[] = {
#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \
[Opt_##_name] = { \
- .attr = { \
- .name = #_name, \
- .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
- }, \
- .flags = _flags, \
- .hint = _hint, \
- .help = _help, \
- .get_sb = _sb_opt, \
- .set_sb = SET_##_sb_opt, \
+ .attr.name = #_name, \
+ .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \
+ .flags = _flags, \
+ .hint = _hint, \
+ .help = _help, \
+ .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \
+ .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \
+ .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \
+ .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\
_type \
},
@@ -268,20 +280,20 @@ int bch2_opt_lookup(const char *name)
return -1;
}
-struct synonym {
+struct opt_synonym {
const char *s1, *s2;
};
-static const struct synonym bch_opt_synonyms[] = {
+static const struct opt_synonym bch2_opt_synonyms[] = {
{ "quota", "usrquota" },
};
static int bch2_mount_opt_lookup(const char *name)
{
- const struct synonym *i;
+ const struct opt_synonym *i;
- for (i = bch_opt_synonyms;
- i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
+ for (i = bch2_opt_synonyms;
+ i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms);
i++)
if (!strcmp(name, i->s1))
name = i->s2;
@@ -289,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name)
return bch2_opt_lookup(name);
}
+struct opt_val_synonym {
+ const char *opt, *v1, *v2;
+};
+
+static const struct opt_val_synonym bch2_opt_val_synonyms[] = {
+ { "degraded", "true", "yes" },
+ { "degraded", "false", "no" },
+ { "degraded", "1", "yes" },
+ { "degraded", "0", "no" },
+};
+
+static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val)
+{
+ const struct opt_val_synonym *i;
+
+ for (i = bch2_opt_val_synonyms;
+ i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms);
+ i++)
+ if (!strcmp(opt, i->opt) && !strcmp(val, i->v1))
+ return i->v2;
+
+ return val;
+}
+
int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
{
if (v < opt->min) {
@@ -332,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c,
{
ssize_t ret;
+ if (err)
+ printbuf_indent_add_nextline(err, 2);
+
switch (opt->type) {
case BCH_OPT_BOOL:
- if (val) {
- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
- if (ret != -BCH_ERR_option_not_bool) {
- *res = ret;
- } else {
- if (err)
- prt_printf(err, "%s: must be bool", opt->attr.name);
- return ret;
- }
+ if (!val)
+ val = "1";
+
+ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool);
+ if (ret != -BCH_ERR_option_not_bool) {
+ *res = ret;
} else {
- *res = 1;
+ if (err)
+ prt_printf(err, "%s: must be bool", opt->attr.name);
+ return ret;
}
-
break;
case BCH_OPT_UINT:
if (!val) {
@@ -355,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c,
return -EINVAL;
}
- ret = opt->flags & OPT_HUMAN_READABLE
- ? bch2_strtou64_h(val, res)
- : kstrtou64(val, 10, res);
+ if (*val != '-') {
+ ret = opt->flags & OPT_HUMAN_READABLE
+ ? bch2_strtou64_h(val, res)
+ : kstrtou64(val, 10, res);
+ } else {
+ prt_printf(err, "%s: must be a non-negative number", opt->attr.name);
+ return -BCH_ERR_option_negative;
+ }
+
if (ret < 0) {
if (err)
prt_printf(err, "%s: must be a number",
@@ -475,11 +518,16 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
-int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
+int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v)
{
int ret = 0;
switch (id) {
+ case Opt_state:
+ if (ca)
+ return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED);
+ break;
+
case Opt_compression:
case Opt_background_compression:
ret = bch2_check_set_has_compressed_data(c, v);
@@ -488,19 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
if (v)
bch2_check_set_feature(c, BCH_FEATURE_ec);
break;
+ default:
+ break;
}
return ret;
}
-int bch2_opts_check_may_set(struct bch_fs *c)
+int bch2_opts_hooks_pre_set(struct bch_fs *c)
{
- unsigned i;
- int ret;
-
- for (i = 0; i < bch2_opts_nr; i++) {
- ret = bch2_opt_check_may_set(c, i,
- bch2_opt_get_by_id(&c->opts, i));
+ for (unsigned i = 0; i < bch2_opts_nr; i++) {
+ int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i));
if (ret)
return ret;
}
@@ -508,6 +554,61 @@ int bch2_opts_check_may_set(struct bch_fs *c)
return 0;
}
+void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
+ struct bch_opts *new_opts, enum bch_opt_id id)
+{
+ switch (id) {
+ case Opt_foreground_target:
+ if (new_opts->foreground_target &&
+ !new_opts->background_target)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_compression:
+ if (new_opts->compression &&
+ !new_opts->background_compression)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_background_target:
+ if (new_opts->background_target)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_background_compression:
+ if (new_opts->background_compression)
+ bch2_set_rebalance_needs_scan(c, inum);
+ break;
+ case Opt_rebalance_enabled:
+ bch2_rebalance_wakeup(c);
+ break;
+ case Opt_copygc_enabled:
+ bch2_copygc_wakeup(c);
+ break;
+ case Opt_discard:
+ if (!ca) {
+ mutex_lock(&c->sb_lock);
+ for_each_member_device(c, ca) {
+ struct bch_member *m =
+ bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_DISCARD(m, c->opts.discard);
+ }
+
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+ }
+ break;
+ case Opt_version_upgrade:
+ /*
+ * XXX: in the future we'll likely want to do compatible
+ * upgrades at runtime as well, but right now there's nothing
+ * that does that:
+ */
+ if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible)
+ bch2_sb_upgrade_incompat(c);
+ break;
+ default:
+ break;
+ }
+}
+
int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
struct printbuf *parse_later,
const char *name, const char *val)
@@ -530,6 +631,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
if (id < 0)
return 0;
+ /* must have a value for synonym lookup - but OPT_FN is weird */
+ if (!val && bch2_opt_table[id].type != BCH_OPT_FN)
+ val = "1";
+
+ val = bch2_opt_val_synonym_lookup(name, val);
+
if (!(bch2_opt_table[id].flags & OPT_MOUNT))
goto bad_opt;
@@ -543,14 +650,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
goto bad_opt;
ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err);
- if (ret == -BCH_ERR_option_needs_open_fs && parse_later) {
- prt_printf(parse_later, "%s=%s,", name, val);
- if (parse_later->allocation_failure) {
- ret = -ENOMEM;
- goto out;
+ if (ret == -BCH_ERR_option_needs_open_fs) {
+ ret = 0;
+
+ if (parse_later) {
+ prt_printf(parse_later, "%s=%s,", name, val);
+ if (parse_later->allocation_failure)
+ ret = -ENOMEM;
}
- ret = 0;
goto out;
}
@@ -561,28 +669,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts,
bch2_opt_set_by_id(opts, id, v);
ret = 0;
- goto out;
-
+out:
+ printbuf_exit(&err);
+ return ret;
bad_opt:
- pr_err("Bad mount option %s", name);
ret = -BCH_ERR_option_name;
goto out;
-
bad_val:
- pr_err("Invalid mount option %s", err.buf);
ret = -BCH_ERR_option_value;
-
-out:
- printbuf_exit(&err);
- return ret;
+ goto out;
}
int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
- struct printbuf *parse_later, char *options)
+ struct printbuf *parse_later, char *options,
+ bool ignore_unknown)
{
char *copied_opts, *copied_opts_start;
char *opt, *name, *val;
- int ret;
+ int ret = 0;
if (!options)
return 0;
@@ -607,24 +711,37 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
val = opt;
ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val);
- if (ret < 0)
- goto out;
+ if (ret == -BCH_ERR_option_name && ignore_unknown)
+ ret = 0;
+ if (ret) {
+ pr_err("Error parsing option %s: %s", name, bch2_err_str(ret));
+ break;
+ }
}
- ret = 0;
- goto out;
-
-out:
kfree(copied_opts_start);
return ret;
}
-u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx)
{
const struct bch_option *opt = bch2_opt_table + id;
u64 v;
- v = opt->get_sb(sb);
+ if (dev_idx < 0) {
+ v = opt->get_sb(sb);
+ } else {
+ if (WARN(!bch2_member_exists(sb, dev_idx),
+ "tried to set device option %s on nonexistent device %i",
+ opt->attr.name, dev_idx))
+ return 0;
+
+ struct bch_member m = bch2_sb_member_get(sb, dev_idx);
+ v = opt->get_member(&m);
+ }
+
+ if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
+ --v;
if (opt->flags & OPT_SB_FIELD_ILOG2)
v = 1ULL << v;
@@ -641,34 +758,20 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id)
*/
int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
{
- unsigned id;
-
- for (id = 0; id < bch2_opts_nr; id++) {
+ for (unsigned id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- if (opt->get_sb == BCH2_NO_SB_OPT)
- continue;
-
- bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id));
+ if (opt->get_sb)
+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1));
}
return 0;
}
-struct bch_dev_sb_opt_set {
- void (*set_sb)(struct bch_member *, u64);
-};
-
-static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = {
-#define x(n, set) [Opt_##n] = { .set_sb = SET_##set },
- BCH_DEV_OPT_SETTERS()
-#undef x
-};
-
-void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
+bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
const struct bch_option *opt, u64 v)
{
- enum bch_opt_id id = opt - bch2_opt_table;
+ bool changed = false;
if (opt->flags & OPT_SB_FIELD_SECTORS)
v >>= 9;
@@ -679,34 +782,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx,
if (opt->flags & OPT_SB_FIELD_ONE_BIAS)
v++;
- if (opt->flags & OPT_FS) {
- if (opt->set_sb != SET_BCH2_NO_SB_OPT)
- opt->set_sb(sb, v);
+ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) {
+ changed = v != opt->get_sb(sb);
+
+ opt->set_sb(sb, v);
}
- if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) {
+ if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) {
if (WARN(!bch2_member_exists(sb, dev_idx),
"tried to set device option %s on nonexistent device %i",
opt->attr.name, dev_idx))
- return;
+ return false;
struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx);
-
- const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id;
- if (set->set_sb)
- set->set_sb(m, v);
- else
- pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name);
+ changed = v != opt->get_member(m);
+ opt->set_member(m, v);
}
+
+ return changed;
}
-void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
+bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca,
const struct bch_option *opt, u64 v)
{
mutex_lock(&c->sb_lock);
- __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
- bch2_write_super(c);
+ bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v);
+ if (changed)
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ return changed;
}
/* io opts: */
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index a182b5d454ba..63f8e254495c 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -11,11 +11,13 @@
struct bch_fs;
extern const char * const bch2_error_actions[];
+extern const char * const bch2_degraded_actions[];
extern const char * const bch2_fsck_fix_opts[];
extern const char * const bch2_version_upgrade_opts[];
extern const char * const bch2_sb_features[];
extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
+extern const char * const __bch2_csum_types[];
extern const char * const __bch2_csum_opts[];
extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
@@ -50,10 +52,6 @@ static inline const char *bch2_d_type_str(unsigned d_type)
* apply the options from that struct that are defined.
*/
-/* dummy option, for options that aren't stored in the superblock */
-u64 BCH2_NO_SB_OPT(const struct bch_sb *);
-void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64);
-
/* When can be set: */
enum opt_flags {
OPT_FS = BIT(0), /* Filesystem option */
@@ -132,19 +130,24 @@ enum fsck_err_opts {
OPT_FS|OPT_FORMAT| \
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
OPT_UINT(512, 1U << 16), \
- BCH_SB_BLOCK_SIZE, 8, \
+ BCH_SB_BLOCK_SIZE, 4 << 10, \
"size", NULL) \
x(btree_node_size, u32, \
OPT_FS|OPT_FORMAT| \
OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \
OPT_UINT(512, 1U << 20), \
- BCH_SB_BTREE_NODE_SIZE, 512, \
+ BCH_SB_BTREE_NODE_SIZE, 256 << 10, \
"size", "Btree node size, default 256k") \
x(errors, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
+ x(write_error_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 300), \
+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
+ NULL, "Number of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
@@ -181,6 +184,11 @@ enum fsck_err_opts {
OPT_STR(__bch2_csum_opts), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
NULL, NULL) \
+ x(checksum_err_retry_nr, u8, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(0, 32), \
+ BCH_SB_CSUM_ERR_RETRY_NR, 3, \
+ NULL, NULL) \
x(compression, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_compression), \
@@ -197,7 +205,7 @@ enum fsck_err_opts {
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
- OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or label for metadata writes") \
@@ -221,6 +229,16 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH_SB_ERASURE_CODE, false, \
NULL, "Enable erasure coding (DO NOT USE YET)") \
+ x(casefold, u8, \
+ OPT_FS|OPT_INODE|OPT_FORMAT, \
+ OPT_BOOL(), \
+ BCH_SB_CASEFOLD, false, \
+ NULL, "Dirent lookups are casefolded") \
+ x(casefold_disabled, u8, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Disable casefolding filesystem wide") \
x(inodes_32bit, u8, \
OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -295,24 +313,14 @@ enum fsck_err_opts {
NULL, "Enable project quotas") \
x(degraded, u8, \
OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
+ OPT_STR(bch2_degraded_actions), \
+ BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \
NULL, "Allow mounting in degraded mode") \
- x(very_degraded, u8, \
- OPT_FS|OPT_MOUNT, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, false, \
- NULL, "Allow mounting in when data will be missing") \
x(no_splitbrain_check, u8, \
OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Don't kick drives out when splitbrain detected")\
- x(discard, u8, \
- OPT_FS|OPT_MOUNT|OPT_DEVICE, \
- OPT_BOOL(), \
- BCH2_NO_SB_OPT, true, \
- NULL, "Enable discard/TRIM support") \
x(verbose, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
@@ -376,6 +384,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Exit recovery immediately prior to journal replay")\
+ x(journal_rewind, u64, \
+ OPT_FS|OPT_MOUNT, \
+ OPT_UINT(0, U64_MAX), \
+ BCH2_NO_SB_OPT, 0, \
+ NULL, "Rewind journal") \
x(recovery_passes, u64, \
OPT_FS|OPT_MOUNT, \
OPT_BITFIELD(bch2_recovery_passes), \
@@ -447,7 +460,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, false, \
NULL, "Reconstruct alloc btree") \
x(version_upgrade, u8, \
- OPT_FS|OPT_MOUNT, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_STR(bch2_version_upgrade_opts), \
BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \
NULL, "Set superblock to latest version,\n" \
@@ -487,45 +500,56 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, true, \
NULL, "Enable rebalance: disable for debugging, or to\n"\
"quiet the system when doing performance testing\n")\
+ x(rebalance_on_ac_only, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_SB_REBALANCE_AC_ONLY, false, \
+ NULL, "Enable rebalance while on mains power only\n") \
+ x(auto_snapshot_deletion, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\
+ "quiet the system when doing performance testing\n")\
x(no_data_io, u8, \
OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Skip submit_bio() for data reads and writes, " \
"for performance testing purposes") \
- x(fs_size, u64, \
- OPT_DEVICE, \
+ x(state, u64, \
+ OPT_DEVICE|OPT_RUNTIME, \
+ OPT_STR(bch2_member_states), \
+ BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \
+ "state", "rw,ro,failed,spare") \
+ x(bucket_size, u32, \
+ OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \
OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, 0, \
- "size", "Size of filesystem on device") \
- x(bucket, u32, \
- OPT_DEVICE, \
- OPT_UINT(0, S64_MAX), \
- BCH2_NO_SB_OPT, 0, \
+ BCH_MEMBER_BUCKET_SIZE, 0, \
"size", "Specifies the bucket size; must be greater than the btree node size")\
x(durability, u8, \
- OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \
+ OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \
OPT_UINT(0, BCH_REPLICAS_MAX), \
- BCH2_NO_SB_OPT, 1, \
+ BCH_MEMBER_DURABILITY, 1, \
"n", "Data written to this device will be considered\n"\
"to have already been replicated n times") \
x(data_allowed, u8, \
OPT_DEVICE, \
OPT_BITFIELD(__bch2_data_types), \
- BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
+ BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\
"types", "Allowed data types for this device: journal, btree, and/or user")\
+ x(discard, u8, \
+ OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH_MEMBER_DISCARD, true, \
+ NULL, "Enable discard/TRIM support") \
x(btree_node_prefetch, u8, \
OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
- NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\
+ NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\
" prefetched sequentially")
-#define BCH_DEV_OPT_SETTERS() \
- x(discard, BCH_MEMBER_DISCARD) \
- x(durability, BCH_MEMBER_DURABILITY) \
- x(data_allowed, BCH_MEMBER_DATA_ALLOWED)
-
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
@@ -582,8 +606,6 @@ struct printbuf;
struct bch_option {
struct attribute attr;
- u64 (*get_sb)(const struct bch_sb *);
- void (*set_sb)(struct bch_sb *, u64);
enum opt_type type;
enum opt_flags flags;
u64 min, max;
@@ -595,6 +617,12 @@ struct bch_option {
const char *hint;
const char *help;
+ u64 (*get_sb)(const struct bch_sb *);
+ void (*set_sb)(struct bch_sb *, u64);
+
+ u64 (*get_member)(const struct bch_member *);
+ void (*set_member)(struct bch_member *, u64);
+
};
extern const struct bch_option bch2_opt_table[];
@@ -603,12 +631,12 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
-u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id);
+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int);
int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
-void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
+bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64);
struct bch_dev;
-void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
+bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64);
int bch2_opt_lookup(const char *);
int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *);
@@ -625,12 +653,15 @@ void bch2_opts_to_text(struct printbuf *,
struct bch_fs *, struct bch_sb *,
unsigned, unsigned, unsigned);
-int bch2_opt_check_may_set(struct bch_fs *, int, u64);
-int bch2_opts_check_may_set(struct bch_fs *);
+int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64);
+int bch2_opts_hooks_pre_set(struct bch_fs *);
+void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64,
+ struct bch_opts *, enum bch_opt_id);
+
int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *,
struct printbuf *, const char *, const char *);
int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *,
- char *);
+ char *, bool);
/* inode opts: */
@@ -659,18 +690,4 @@ static inline void bch2_io_opts_fixups(struct bch_io_opts *opts)
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
-/* rebalance opts: */
-
-static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts)
-{
- return (struct bch_extent_rebalance) {
- .type = BIT(BCH_EXTENT_ENTRY_rebalance),
-#define x(_name) \
- ._name = opts->_name, \
- ._name##_from_inode = opts->_name##_from_inode,
- BCH_REBALANCE_OPTS()
-#undef x
- };
-};
-
#endif /* _BCACHEFS_OPTS_H */
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index 4cf5a2af1e6f..3302bbc78a09 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -277,6 +277,25 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
}
/**
+ * bch2_printbuf_indent_add_nextline() - add to the current indent level for
+ * subsequent lines
+ *
+ * @buf: printbuf to control
+ * @spaces: number of spaces to add to the current indent level
+ *
+ * Subsequent lines - not the current line - will be indented by @spaces more
+ * spaces.
+ */
+void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces)
+{
+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent))
+ spaces = 0;
+
+ buf->indent += spaces;
+ buf->has_indent_or_tabstops = true;
+}
+
+/**
* bch2_printbuf_indent_sub() - subtract from the current indent level
*
* @buf: printbuf to control
diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h
index d0dd398baa2b..8f4e28d440ac 100644
--- a/fs/bcachefs/printbuf.h
+++ b/fs/bcachefs/printbuf.h
@@ -112,6 +112,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *);
int bch2_printbuf_tabstop_push(struct printbuf *, unsigned);
void bch2_printbuf_indent_add(struct printbuf *, unsigned);
+void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned);
void bch2_printbuf_indent_sub(struct printbuf *, unsigned);
void bch2_prt_newline(struct printbuf *);
@@ -139,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[],
.size = _size, \
})
+static inline struct printbuf bch2_printbuf_init(void)
+{
+ return PRINTBUF;
+}
+
+DEFINE_CLASS(printbuf, struct printbuf,
+ bch2_printbuf_exit(&_T), bch2_printbuf_init(), void)
+
/*
* Returns size remaining of output buffer:
*/
diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c
new file mode 100644
index 000000000000..d09898566abe
--- /dev/null
+++ b/fs/bcachefs/progress.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bbpos.h"
+#include "disk_accounting.h"
+#include "progress.h"
+
+void bch2_progress_init(struct progress_indicator_state *s,
+ struct bch_fs *c,
+ u64 btree_id_mask)
+{
+ memset(s, 0, sizeof(*s));
+
+ s->next_print = jiffies + HZ * 10;
+
+ for (unsigned i = 0; i < BTREE_ID_NR; i++) {
+ if (!(btree_id_mask & BIT_ULL(i)))
+ continue;
+
+ struct disk_accounting_pos acc;
+ disk_accounting_key_init(acc, btree, .id = i);
+
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+ s->nodes_total += div64_ul(v, btree_sectors(c));
+ }
+}
+
+static inline bool progress_update_p(struct progress_indicator_state *s)
+{
+ bool ret = time_after_eq(jiffies, s->next_print);
+
+ if (ret)
+ s->next_print = jiffies + HZ * 10;
+ return ret;
+}
+
+void bch2_progress_update_iter(struct btree_trans *trans,
+ struct progress_indicator_state *s,
+ struct btree_iter *iter,
+ const char *msg)
+{
+ struct bch_fs *c = trans->c;
+ struct btree *b = path_l(btree_iter_path(trans, iter))->b;
+
+ s->nodes_seen += b != s->last_node;
+ s->last_node = b;
+
+ if (progress_update_p(s)) {
+ struct printbuf buf = PRINTBUF;
+ unsigned percent = s->nodes_total
+ ? div64_u64(s->nodes_seen * 100, s->nodes_total)
+ : 0;
+
+ prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ",
+ msg, percent, s->nodes_seen, s->nodes_total);
+ bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos));
+
+ bch_info(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+}
diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h
new file mode 100644
index 000000000000..23fb1811f943
--- /dev/null
+++ b/fs/bcachefs/progress.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_PROGRESS_H
+#define _BCACHEFS_PROGRESS_H
+
+/*
+ * Lame progress indicators
+ *
+ * We don't like to use these because they print to the dmesg console, which is
+ * spammy - we much prefer to be wired up to a userspace programm (e.g. via
+ * thread_with_file) and have it print the progress indicator.
+ *
+ * But some code is old and doesn't support that, or runs in a context where
+ * that's not yet practical (mount).
+ */
+
+struct progress_indicator_state {
+ unsigned long next_print;
+ u64 nodes_seen;
+ u64 nodes_total;
+ struct btree *last_node;
+};
+
+void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64);
+void bch2_progress_update_iter(struct btree_trans *,
+ struct progress_indicator_state *,
+ struct btree_iter *,
+ const char *);
+
+#endif /* _BCACHEFS_PROGRESS_H */
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index 8b857fc33244..f241efb1fb50 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
KEY_TYPE_QUOTA_NOCHECK);
advance:
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos));
return 0;
}
@@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c)
struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
- return -BCH_ERR_ENOSPC_sb_quota;
+ return bch_err_throw(c, ENOSPC_sb_quota);
}
bch2_sb_quota_read(c);
@@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
+ ret = bch_err_throw(c, ENOSPC_sb_quota);
goto unlock;
}
@@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
- ret = -BCH_ERR_ENOSPC_sb_quota;
+ ret = bch_err_throw(c, ENOSPC_sb_quota);
goto unlock;
}
diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c
index bef2aa1b8bcd..b1438be9d690 100644
--- a/fs/bcachefs/rcu_pending.c
+++ b/fs/bcachefs/rcu_pending.c
@@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p)
while (nr--)
kfree(*p);
}
-
-#define local_irq_save(flags) \
-do { \
- flags = 0; \
-} while (0)
#endif
static noinline void __process_finished_items(struct rcu_pending *pending,
@@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
- local_irq_save(flags);
- p = this_cpu_ptr(pending->p);
- spin_lock(&p->lock);
+ /* We could technically be scheduled before taking the lock and end up
+ * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock
+ * anyways
+ *
+ * And we have to do it this way to avoid breaking PREEMPT_RT, which
+ * redefines how spinlocks work:
+ */
+ p = raw_cpu_ptr(pending->p);
+ spin_lock_irqsave(&p->lock, flags);
rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
restart:
if (may_sleep &&
@@ -520,9 +521,8 @@ check_expired:
goto free_node;
}
- local_irq_save(flags);
- p = this_cpu_ptr(pending->p);
- spin_lock(&p->lock);
+ p = raw_cpu_ptr(pending->p);
+ spin_lock_irqsave(&p->lock, flags);
goto restart;
}
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 4adc74cd3f70..1c345b86b1c0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -26,9 +26,8 @@
/* bch_extent_rebalance: */
-static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
@@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s
return NULL;
}
+static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+ return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k));
+}
+
static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c,
struct bch_io_opts *opts,
struct bkey_s_c k,
@@ -76,6 +80,7 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
unsigned ptr_bit = 1;
unsigned rewrite_ptrs = 0;
+ guard(rcu)();
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
rewrite_ptrs |= ptr_bit;
@@ -91,17 +96,24 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c,
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) |
bch2_bkey_ptrs_need_move(c, opts, ptrs);
}
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
{
- const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
+ const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs);
if (!opts)
return 0;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
+ return 0;
+
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
u64 sectors = 0;
@@ -121,10 +133,11 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
}
incompressible:
- if (opts->background_target &&
- bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) {
+ if (opts->background_target) {
+ guard(rcu)();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
+ if (!p.ptr.cached &&
+ !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
sectors += p.crc.compressed_size;
}
@@ -140,7 +153,7 @@ static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opt
const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k);
if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) {
- struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts);
+ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(c, opts);
return old == NULL || memcmp(old, &new, sizeof(new));
} else {
return old != NULL;
@@ -163,7 +176,7 @@ int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts,
k.k->u64s += sizeof(*old) / sizeof(u64);
}
- *old = io_opts_to_rebalance_opts(opts);
+ *old = io_opts_to_rebalance_opts(c, opts);
} else {
if (old)
extent_entry_drop(k, (union bch_extent_entry *) old);
@@ -230,7 +243,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -259,7 +272,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
int ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc,
bch2_set_rebalance_needs_scan_trans(trans, inum));
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
return ret;
}
@@ -278,7 +291,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum,
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
+ k = bch2_btree_iter_peek_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -298,7 +311,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
struct btree_iter *work_iter)
{
return !kthread_should_stop()
- ? bch2_btree_iter_peek(work_iter)
+ ? bch2_btree_iter_peek(trans, work_iter)
: bkey_s_c_null;
}
@@ -306,7 +319,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
- if (!bch2_bkey_rebalance_opts(k))
+ if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k))
return 0;
struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
@@ -332,7 +345,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
work_pos,
BTREE_ITER_all_snapshots);
- struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter);
+ struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter);
if (bkey_err(k))
return k;
@@ -343,7 +356,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
memset(data_opts, 0, sizeof(*data_opts));
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
+ data_opts->write_flags |= BCH_WRITE_only_specified_devs;
if (!data_opts->rewrite_ptrs) {
/*
@@ -430,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt,
if (bch2_err_matches(ret, ENOMEM)) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(ctxt);
- ret = -BCH_ERR_transaction_restart_nested;
+ ret = bch_err_throw(c, transaction_restart_nested);
}
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -444,22 +457,11 @@ out:
return ret;
}
-static bool rebalance_pred(struct bch_fs *c, void *arg,
- struct bkey_s_c k,
- struct bch_io_opts *io_opts,
- struct data_update_opts *data_opts)
-{
- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k);
- data_opts->target = io_opts->background_target;
- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
- return data_opts->rewrite_ptrs != 0;
-}
-
static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
{
struct btree_trans *trans = ctxt->trans;
+ struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &trans->c->rebalance;
- int ret;
bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
ctxt->stats = &r->scan_stats;
@@ -474,11 +476,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
- ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+ struct per_snapshot_io_opts snapshot_io_opts;
+ per_snapshot_io_opts_init(&snapshot_io_opts, c);
+
+ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents,
+ r->scan_start.pos, r->scan_end.pos,
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_not_extents|
+ BTREE_ITER_prefetch, k, ({
+ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
+
+ struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans,
+ &snapshot_io_opts, iter.pos, &iter, k);
+ PTR_ERR_OR_ZERO(io_opts);
+ })) ?:
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+ per_snapshot_io_opts_exit(&snapshot_io_opts);
bch2_move_stats_exit(&r->scan_stats, trans->c);
+
+ /*
+ * Ensure that the rebalance_work entries we created are seen by the
+ * next iteration of do_rebalance(), so we don't end up stuck in
+ * rebalance_wait():
+ */
+ atomic64_inc(&r->scan_stats.sectors_seen);
+ bch2_btree_write_buffer_flush_sync(trans);
+
return ret;
}
@@ -500,7 +525,14 @@ static void rebalance_wait(struct bch_fs *c)
r->state = BCH_REBALANCE_waiting;
}
- bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+ bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
+
+static bool bch2_rebalance_enabled(struct bch_fs *c)
+{
+ return c->opts.rebalance_enabled &&
+ !(c->opts.rebalance_on_ac_only &&
+ c->rebalance.on_battery);
}
static int do_rebalance(struct moving_context *ctxt)
@@ -508,8 +540,9 @@ static int do_rebalance(struct moving_context *ctxt)
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
struct bch_fs_rebalance *r = &c->rebalance;
- struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+ struct btree_iter rebalance_work_iter, extent_iter = {};
struct bkey_s_c k;
+ u32 kick = r->kick;
int ret = 0;
bch2_trans_begin(trans);
@@ -522,9 +555,9 @@ static int do_rebalance(struct moving_context *ctxt)
BTREE_ITER_all_snapshots);
while (!bch2_move_ratelimit(ctxt)) {
- if (!c->opts.rebalance_enabled) {
+ if (!bch2_rebalance_enabled(c)) {
bch2_moving_ctxt_flush_all(ctxt);
- kthread_wait_freezable(c->opts.rebalance_enabled ||
+ kthread_wait_freezable(bch2_rebalance_enabled(c) ||
kthread_should_stop());
}
@@ -549,7 +582,7 @@ static int do_rebalance(struct moving_context *ctxt)
if (ret)
break;
- bch2_btree_iter_advance(&rebalance_work_iter);
+ bch2_btree_iter_advance(trans, &rebalance_work_iter);
}
bch2_trans_iter_exit(trans, &extent_iter);
@@ -559,7 +592,8 @@ static int do_rebalance(struct moving_context *ctxt)
if (!ret &&
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
- !atomic64_read(&r->scan_stats.sectors_seen)) {
+ !atomic64_read(&r->scan_stats.sectors_seen) &&
+ kick == r->kick) {
bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
@@ -578,6 +612,13 @@ static int bch2_rebalance_thread(void *arg)
set_freezable();
+ /*
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ */
+ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_should_stop());
+
bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
writepoint_ptr(&c->rebalance_write_point),
true);
@@ -592,8 +633,20 @@ static int bch2_rebalance_thread(void *arg)
void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
{
+ printbuf_tabstop_push(out, 32);
+
struct bch_fs_rebalance *r = &c->rebalance;
+ /* print pending work */
+ struct disk_accounting_pos acc;
+ disk_accounting_key_init(acc, rebalance_work);
+ u64 v;
+ bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1);
+
+ prt_printf(out, "pending work:\t");
+ prt_human_readable_u64(out, v << 9);
+ prt_printf(out, "\n\n");
+
prt_str(out, bch2_rebalance_state_strs[r->state]);
prt_newline(out);
printbuf_indent_add(out, 2);
@@ -602,15 +655,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
case BCH_REBALANCE_waiting: {
u64 now = atomic64_read(&c->io_clock[WRITE].now);
- prt_str(out, "io wait duration: ");
+ prt_printf(out, "io wait duration:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
prt_newline(out);
- prt_str(out, "io wait remaining: ");
+ prt_printf(out, "io wait remaining:\t");
bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
prt_newline(out);
- prt_str(out, "duration waited: ");
+ prt_printf(out, "duration waited:\t");
bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
prt_newline(out);
break;
@@ -623,6 +676,19 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
break;
}
prt_newline(out);
+
+ struct task_struct *t;
+ scoped_guard(rcu) {
+ t = rcu_dereference(c->rebalance.thread);
+ if (t)
+ get_task_struct(t);
+ }
+
+ if (t) {
+ bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL);
+ put_task_struct(t);
+ }
+
printbuf_indent_sub(out, 2);
}
@@ -637,7 +703,7 @@ void bch2_rebalance_stop(struct bch_fs *c)
c->rebalance.thread = NULL;
if (p) {
- /* for sychronizing with rebalance_wakeup() */
+ /* for sychronizing with bch2_rebalance_wakeup() */
synchronize_rcu();
kthread_stop(p);
@@ -668,7 +734,156 @@ int bch2_rebalance_start(struct bch_fs *c)
return 0;
}
-void bch2_fs_rebalance_init(struct bch_fs *c)
+#ifdef CONFIG_POWER_SUPPLY
+#include <linux/power_supply.h>
+
+static int bch2_rebalance_power_notifier(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier);
+
+ c->rebalance.on_battery = !power_supply_is_system_supplied();
+ bch2_rebalance_wakeup(c);
+ return NOTIFY_OK;
+}
+#endif
+
+void bch2_fs_rebalance_exit(struct bch_fs *c)
+{
+#ifdef CONFIG_POWER_SUPPLY
+ power_supply_unreg_notifier(&c->rebalance.power_notifier);
+#endif
+}
+
+int bch2_fs_rebalance_init(struct bch_fs *c)
{
- bch2_pd_controller_init(&c->rebalance.pd);
+ struct bch_fs_rebalance *r = &c->rebalance;
+
+ bch2_pd_controller_init(&r->pd);
+
+#ifdef CONFIG_POWER_SUPPLY
+ r->power_notifier.notifier_call = bch2_rebalance_power_notifier;
+ int ret = power_supply_reg_notifier(&r->power_notifier);
+ if (ret)
+ return ret;
+
+ r->on_battery = !power_supply_is_system_supplied();
+#endif
+ return 0;
+}
+
+static int check_rebalance_work_one(struct btree_trans *trans,
+ struct btree_iter *extent_iter,
+ struct btree_iter *rebalance_iter,
+ struct bkey_buf *last_flushed)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c extent_k, rebalance_k;
+ struct printbuf buf = PRINTBUF;
+
+ int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
+ bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
+ if (ret)
+ return ret;
+
+ if (!extent_k.k &&
+ extent_iter->btree_id == BTREE_ID_reflink &&
+ (!rebalance_k.k ||
+ rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
+ bch2_trans_iter_exit(trans, extent_iter);
+ bch2_trans_iter_init(trans, extent_iter,
+ BTREE_ID_extents, POS_MIN,
+ BTREE_ITER_prefetch|
+ BTREE_ITER_all_snapshots);
+ return bch_err_throw(c, transaction_restart_nested);
+ }
+
+ if (!extent_k.k && !rebalance_k.k)
+ return 1;
+
+ int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
+ rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
+
+ struct bkey deleted;
+ bkey_init(&deleted);
+
+ if (cmp < 0) {
+ deleted.p = extent_k.k->p;
+ rebalance_k.k = &deleted;
+ } else if (cmp > 0) {
+ deleted.p = rebalance_k.k->p;
+ extent_k.k = &deleted;
+ }
+
+ bool should_have_rebalance =
+ bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
+ bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
+
+ if (should_have_rebalance != have_rebalance) {
+ ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
+ if (ret)
+ return ret;
+
+ bch2_bkey_val_to_text(&buf, c, extent_k);
+ }
+
+ if (fsck_err_on(!should_have_rebalance && have_rebalance,
+ trans, rebalance_work_incorrectly_set,
+ "rebalance work incorrectly set\n%s", buf.buf)) {
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ extent_k.k->p, false);
+ if (ret)
+ goto err;
+ }
+
+ if (fsck_err_on(should_have_rebalance && !have_rebalance,
+ trans, rebalance_work_incorrectly_unset,
+ "rebalance work incorrectly unset\n%s", buf.buf)) {
+ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+ extent_k.k->p, true);
+ if (ret)
+ goto err;
+ }
+
+ if (cmp <= 0)
+ bch2_btree_iter_advance(trans, extent_iter);
+ if (cmp >= 0)
+ bch2_btree_iter_advance(trans, rebalance_iter);
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_check_rebalance_work(struct bch_fs *c)
+{
+ struct btree_trans *trans = bch2_trans_get(c);
+ struct btree_iter rebalance_iter, extent_iter;
+ int ret = 0;
+
+ bch2_trans_iter_init(trans, &extent_iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_prefetch);
+ bch2_trans_iter_init(trans, &rebalance_iter,
+ BTREE_ID_rebalance_work, POS_MIN,
+ BTREE_ITER_prefetch);
+
+ struct bkey_buf last_flushed;
+ bch2_bkey_buf_init(&last_flushed);
+ bkey_init(&last_flushed.k->k);
+
+ while (!ret) {
+ bch2_trans_begin(trans);
+
+ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
+
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ ret = 0;
+ }
+
+ bch2_bkey_buf_exit(&last_flushed, c);
+ bch2_trans_iter_exit(trans, &extent_iter);
+ bch2_trans_iter_exit(trans, &rebalance_iter);
+ bch2_trans_put(trans);
+ return ret < 0 ? ret : 0;
}
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
index 0a0821ab895d..7a565ea7dbfc 100644
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
@@ -4,8 +4,28 @@
#include "compress.h"
#include "disk_groups.h"
+#include "opts.h"
#include "rebalance_types.h"
+static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
+ struct bch_io_opts *opts)
+{
+ struct bch_extent_rebalance r = {
+ .type = BIT(BCH_EXTENT_ENTRY_rebalance),
+#define x(_name) \
+ ._name = opts->_name, \
+ ._name##_from_inode = opts->_name##_from_inode,
+ BCH_REBALANCE_OPTS()
+#undef x
+ };
+
+ if (r.background_target &&
+ !bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
+ r.background_target = 0;
+
+ return r;
+};
+
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *);
int bch2_get_update_rebalance_opts(struct btree_trans *,
@@ -17,21 +37,23 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
-static inline void rebalance_wakeup(struct bch_fs *c)
+static inline void bch2_rebalance_wakeup(struct bch_fs *c)
{
- struct task_struct *p;
-
- rcu_read_lock();
- p = rcu_dereference(c->rebalance.thread);
+ c->rebalance.kick++;
+ guard(rcu)();
+ struct task_struct *p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
- rcu_read_unlock();
}
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
-void bch2_fs_rebalance_init(struct bch_fs *);
+
+void bch2_fs_rebalance_exit(struct bch_fs *);
+int bch2_fs_rebalance_init(struct bch_fs *);
+
+int bch2_check_rebalance_work(struct bch_fs *);
#endif /* _BCACHEFS_REBALANCE_H */
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
index fe5098c17dfc..c659da149fa3 100644
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
@@ -18,6 +18,7 @@ enum bch_rebalance_states {
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
+ u32 kick;
struct bch_pd_controller pd;
enum bch_rebalance_states state;
@@ -30,6 +31,11 @@ struct bch_fs_rebalance {
struct bbpos scan_start;
struct bbpos scan_end;
struct bch_move_stats scan_stats;
+
+ bool on_battery;
+#ifdef CONFIG_POWER_SUPPLY
+ struct notifier_block power_notifier;
+#endif
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 98825437381c..c94debb12d2f 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -13,12 +13,13 @@
#include "disk_accounting.h"
#include "errcode.h"
#include "error.h"
-#include "fs-common.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "logged_ops.h"
#include "move.h"
+#include "movinggc.h"
+#include "namei.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
@@ -32,9 +33,9 @@
#include <linux/sort.h>
#include <linux/stat.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
-int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
+int bch2_btree_lost_data(struct bch_fs *c,
+ struct printbuf *msg,
+ enum btree_id btree)
{
u64 b = BIT_ULL(btree);
int ret = 0;
@@ -43,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!(c->sb.btrees_lost_data & b)) {
- struct printbuf buf = PRINTBUF;
- bch2_btree_id_to_text(&buf, btree);
- bch_err(c, "flagging btree %s lost data", buf.buf);
- printbuf_exit(&buf);
+ prt_printf(msg, "flagging btree ");
+ bch2_btree_id_to_text(msg, btree);
+ prt_printf(msg, " lost data\n");
+
ext->btrees_lost_data |= cpu_to_le64(b);
}
/* Once we have runtime self healing for topology errors we won't need this: */
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
/* Btree node accounting will be off: */
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
#ifdef CONFIG_BCACHEFS_DEBUG
/*
* These are much more minor, and don't need to be corrected right away,
* but in debug mode we want the next fsck run to be clean:
*/
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret;
#endif
switch (btree) {
case BTREE_ID_alloc:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
@@ -78,26 +79,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
goto out;
case BTREE_ID_backpointers:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret;
goto out;
case BTREE_ID_need_discard:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_freespace:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_bucket_gens:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_lru:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret;
goto out;
case BTREE_ID_accounting:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret;
+ goto out;
+ case BTREE_ID_snapshots:
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
goto out;
default:
- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret;
+ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret;
goto out;
}
out:
@@ -114,11 +121,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree)
}
/* for -o reconstruct_alloc: */
-static void bch2_reconstruct_alloc(struct bch_fs *c)
+void bch2_reconstruct_alloc(struct bch_fs *c)
{
- bch2_journal_log_msg(c, "dropping alloc info");
- bch_info(c, "dropping and reconstructing all alloc info");
-
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
@@ -160,6 +164,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info));
+
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@@ -199,7 +205,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
BTREE_ITER_intent);
- int ret = bch2_btree_iter_traverse(&iter);
+ int ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
@@ -262,16 +268,38 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level,
iter_flags);
- ret = bch2_btree_iter_traverse(&iter);
+ ret = bch2_btree_iter_traverse(trans, &iter);
if (ret)
goto out;
struct btree_path *path = btree_iter_path(trans, &iter);
if (unlikely(!btree_path_node(path, k->level))) {
+ struct bch_fs *c = trans->c;
+
+ CLASS(printbuf, buf)();
+ prt_str(&buf, "btree=");
+ bch2_btree_id_to_text(&buf, k->btree_id);
+ prt_printf(&buf, " level=%u ", k->level);
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k));
+
+ if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)|
+ BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) {
+ bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s",
+ buf.buf);
+ ret = -EINVAL;
+ }
+
+ if (!k->allocated) {
+ bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s",
+ buf.buf);
+ k->overwritten = true;
+ goto out;
+ }
+
bch2_trans_iter_exit(trans, &iter);
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, 0, iter_flags);
- ret = bch2_btree_iter_traverse(&iter) ?:
+ ret = bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_increase_depth(trans, iter.path, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto out;
@@ -282,7 +310,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
goto out;
if (k->k->k.type == KEY_TYPE_accounting) {
- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k);
+ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s);
+ ret = PTR_ERR_OR_ZERO(n);
+ if (ret)
+ goto out;
+
+ bkey_copy(n, k->k);
goto out;
}
@@ -390,9 +423,9 @@ int bch2_journal_replay(struct bch_fs *c)
* Now, replay any remaining keys in the order in which they appear in
* the journal, unpinning those journal entries as we go:
*/
- sort(keys_sorted.data, keys_sorted.nr,
- sizeof(keys_sorted.data[0]),
- journal_sort_seq_cmp, NULL);
+ sort_nonatomic(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
darray_for_each(keys_sorted, kp) {
cond_resched();
@@ -430,7 +463,7 @@ int bch2_journal_replay(struct bch_fs *c)
trans = NULL;
if (!c->opts.retain_recovery_info &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
@@ -585,9 +618,7 @@ static int read_btree_roots(struct bch_fs *c)
buf.buf, bch2_err_str(ret))) {
if (btree_id_is_alloc(i))
r->error = 0;
-
- ret = bch2_btree_lost_data(c, i);
- BUG_ON(ret);
+ ret = 0;
}
}
@@ -667,13 +698,13 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
- bch_info(c, "%s", buf.buf);
+ bch_notice(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = true;
}
- if (new_version > c->sb.version_incompat &&
+ if (new_version > c->sb.version_incompat_allowed &&
c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) {
struct printbuf buf = PRINTBUF;
@@ -683,7 +714,7 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_newline(&buf);
- bch_info(c, "%s", buf.buf);
+ bch_notice(c, "%s", buf.buf);
printbuf_exit(&buf);
ret = true;
@@ -733,7 +764,24 @@ int bch2_fs_recovery(struct bch_fs *c)
? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read)
: BCH_RECOVERY_PASS_snapshots_read;
c->opts.nochanges = true;
+ }
+
+ if (c->opts.nochanges)
c->opts.read_only = true;
+
+ if (c->opts.journal_rewind) {
+ bch_info(c, "rewinding journal, fsck required");
+ c->opts.fsck = true;
+ }
+
+ if (go_rw_in_recovery(c)) {
+ /*
+ * start workqueues/kworkers early - kthread creation checks for
+ * pending signals, which is _very_ annoying
+ */
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
}
mutex_lock(&c->sb_lock);
@@ -790,11 +838,11 @@ int bch2_fs_recovery(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- if (c->opts.fsck)
- set_bit(BCH_FS_fsck_running, &c->flags);
if (c->sb.clean)
set_bit(BCH_FS_clean_recovery, &c->flags);
- set_bit(BCH_FS_recovery_running, &c->flags);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_in_fsck, &c->flags);
+ set_bit(BCH_FS_in_recovery, &c->flags);
ret = bch2_blacklist_table_initialize(c);
if (ret) {
@@ -873,7 +921,7 @@ int bch2_fs_recovery(struct bch_fs *c)
use_clean:
if (!clean) {
bch_err(c, "no superblock clean section found");
- ret = -BCH_ERR_fsck_repair_impossible;
+ ret = bch_err_throw(c, fsck_repair_impossible);
goto err;
}
@@ -889,8 +937,37 @@ use_clean:
if (ret)
goto err;
- if (c->opts.reconstruct_alloc)
+ ret = bch2_fs_resize_on_mount(c);
+ if (ret) {
+ up_write(&c->state_lock);
+ goto err;
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_info(c, "filesystem is an unresized image file, mounting ro");
+ c->opts.read_only = true;
+ }
+
+ if (!c->opts.read_only &&
+ (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
+
bch2_reconstruct_alloc(c);
+ } else if (c->opts.reconstruct_alloc) {
+ bch2_journal_log_msg(c, "dropping alloc info");
+ bch_info(c, "dropping and reconstructing all alloc info");
+
+ bch2_reconstruct_alloc(c);
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
+ /* We can't go RW to fix errors without alloc info */
+ if (c->opts.fix_errors == FSCK_FIX_yes ||
+ c->opts.fix_errors == FSCK_FIX_ask)
+ c->opts.fix_errors = FSCK_FIX_no;
+ if (c->opts.errors == BCH_ON_ERROR_fix_safe)
+ c->opts.errors = BCH_ON_ERROR_continue;
+ }
/*
* After an unclean shutdown, skip then next few journal sequence
@@ -900,7 +977,7 @@ use_clean:
* journal sequence numbers:
*/
if (!c->sb.clean)
- journal_seq += 8;
+ journal_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) {
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
@@ -915,7 +992,7 @@ use_clean:
ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
journal_seq, last_seq, blacklist_seq - 1) ?:
- bch2_fs_journal_start(&c->journal, journal_seq);
+ bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
if (ret)
goto err;
@@ -933,8 +1010,10 @@ use_clean:
set_bit(BCH_FS_btree_running, &c->flags);
ret = bch2_sb_set_upgrade_extra(c);
+ if (ret)
+ goto err;
- ret = bch2_run_recovery_passes(c);
+ ret = bch2_run_recovery_passes(c, 0);
if (ret)
goto err;
@@ -945,8 +1024,7 @@ use_clean:
* multithreaded use:
*/
set_bit(BCH_FS_may_go_rw, &c->flags);
- clear_bit(BCH_FS_fsck_running, &c->flags);
- clear_bit(BCH_FS_recovery_running, &c->flags);
+ clear_bit(BCH_FS_in_fsck, &c->flags);
/* in case we don't run journal replay, i.e. norecovery mode */
set_bit(BCH_FS_accounting_replay_done, &c->flags);
@@ -969,9 +1047,8 @@ use_clean:
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
clear_bit(BCH_FS_errors_fixed, &c->flags);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
-
- ret = bch2_run_recovery_passes(c);
+ ret = bch2_run_recovery_passes(c,
+ BCH_RECOVERY_PASS_check_alloc_info);
if (ret)
goto err;
@@ -1015,7 +1092,7 @@ use_clean:
if (c->opts.fsck &&
!test_bit(BCH_FS_error, &c->flags) &&
- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+ c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 &&
ext->btrees_lost_data) {
ext->btrees_lost_data = 0;
write_sb = true;
@@ -1058,13 +1135,6 @@ use_clean:
out:
bch2_flush_fsck_errs(c);
- if (!c->opts.retain_recovery_info) {
- bch2_journal_keys_put_initial(c);
- bch2_find_btree_nodes_exit(&c->found_btree_nodes);
- }
- if (!IS_ERR(clean))
- kfree(clean);
-
if (!ret &&
test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
!c->opts.nochanges) {
@@ -1073,11 +1143,23 @@ out:
}
bch_err_fn(c, ret);
+final_out:
+ if (!IS_ERR(clean))
+ kfree(clean);
return ret;
err:
fsck_err:
- bch2_fs_emergency_read_only(c);
- goto out;
+ {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret));
+ bch2_fs_emergency_read_only2(c, &buf);
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+ goto final_out;
}
int bch2_fs_initialize(struct bch_fs *c)
@@ -1126,14 +1208,17 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
- bch2_fs_journal_start(&c->journal, 1);
- set_bit(BCH_FS_accounting_replay_done, &c->flags);
- bch2_journal_set_replay_done(&c->journal);
+ ret = bch2_fs_journal_start(&c->journal, 1, 1);
+ if (ret)
+ goto err;
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
+ set_bit(BCH_FS_accounting_replay_done, &c->flags);
+ bch2_journal_set_replay_done(&c->journal);
+
for_each_member_device(c, ca) {
ret = bch2_dev_usage_init(ca, false);
if (ret) {
@@ -1190,7 +1275,10 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
+ c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1;
+
+ bch2_copygc_wakeup(c);
+ bch2_rebalance_wakeup(c);
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -1210,7 +1298,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
+ c->recovery.curr_pass = BCH_RECOVERY_PASS_NR;
return 0;
err:
bch_err_fn(c, ret);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index b0d55754b21b..c023f52fc2d6 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,7 +2,8 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
-int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
+int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
+void bch2_reconstruct_alloc(struct bch_fs *);
int bch2_journal_replay(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 0b3c951c32da..6a039e011064 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -12,6 +12,7 @@
#include "journal.h"
#include "lru.h"
#include "logged_ops.h"
+#include "movinggc.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
@@ -27,6 +28,176 @@ const char * const bch2_recovery_passes[] = {
NULL
};
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_from_stable_map[] = {
+#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+ return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_to_stable_map[i]);
+ return ret;
+}
+
+static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
+{
+ return pass < ARRAY_SIZE(passes_from_stable_map)
+ ? passes_from_stable_map[pass]
+ : 0;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+ u64 ret = 0;
+ for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
+ if (v & BIT_ULL(i))
+ ret |= BIT_ULL(passes_from_stable_map[i]);
+ return ret;
+}
+
+static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
+ enum bch_validate_flags flags, struct printbuf *err)
+{
+ return 0;
+}
+
+static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
+ struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ struct bch_sb_field_recovery_passes *r =
+ field_to_type(f, recovery_passes);
+ unsigned nr = recovery_passes_nr_entries(r);
+
+ if (out->nr_tabstops < 1)
+ printbuf_tabstop_push(out, 32);
+ if (out->nr_tabstops < 2)
+ printbuf_tabstop_push(out, 16);
+
+ prt_printf(out, "Pass\tLast run\tLast runtime\n");
+
+ for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
+ if (!i->last_run)
+ continue;
+
+ unsigned idx = i - r->start;
+
+ prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);
+
+ bch2_prt_datetime(out, le64_to_cpu(i->last_run));
+ prt_tab(out);
+
+ bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);
+
+ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+ prt_str(out, " (no ratelimit)");
+
+ prt_newline(out);
+ }
+}
+
+static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
+
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_recovery_passes *r =
+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
+
+ if (stable >= recovery_passes_nr_entries(r)) {
+ unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);
+
+ r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
+ if (!r) {
+ bch_err(c, "error creating recovery_passes sb section");
+ return NULL;
+ }
+ }
+
+ return r->start + stable;
+}
+
+static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
+ enum bch_recovery_pass pass,
+ s64 start_time)
+{
+ guard(mutex)(&c->sb_lock);
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __clear_bit_le64(bch2_recovery_pass_to_stable(pass),
+ ext->recovery_passes_required);
+
+ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+ if (e) {
+ s64 end_time = ktime_get_real_seconds();
+ e->last_run = cpu_to_le64(end_time);
+ e->last_runtime = cpu_to_le32(max(0, end_time - start_time));
+ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+ }
+
+ bch2_write_super(c);
+}
+
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c,
+ enum bch_recovery_pass pass)
+{
+ guard(mutex)(&c->sb_lock);
+
+ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
+ if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
+ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
+ bch2_write_super(c);
+ }
+}
+
+static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);
+ bool ret = false;
+
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_recovery_passes *r =
+ bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
+
+ if (stable < recovery_passes_nr_entries(r)) {
+ struct recovery_pass_entry *i = r->start + stable;
+
+ /*
+ * Ratelimit if the last runtime was more than 1% of the time
+ * since we last ran
+ */
+ ret = (u64) le32_to_cpu(i->last_runtime) * 100 >
+ ktime_get_real_seconds() - le64_to_cpu(i->last_run);
+
+ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
+ ret = false;
+ }
+
+ return ret;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
+ .validate = bch2_sb_recovery_passes_validate,
+ .to_text = bch2_sb_recovery_passes_to_text
+};
+
/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
static int bch2_recovery_pass_empty(struct bch_fs *c)
{
@@ -46,11 +217,32 @@ static int bch2_set_may_go_rw(struct bch_fs *c)
set_bit(BCH_FS_may_go_rw, &c->flags);
- if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes)
+ if (go_rw_in_recovery(c)) {
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) {
+ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");
+ bch2_reconstruct_alloc(c);
+ }
+
return bch2_fs_read_write_early(c);
+ }
return 0;
}
+/*
+ * Make sure root inode is readable while we're still in recovery and can rewind
+ * for repair:
+ */
+static int bch2_lookup_root_inode(struct bch_fs *c)
+{
+ subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
+ struct bch_inode_unpacked inode_u;
+ struct bch_subvolume subvol;
+
+ return bch2_trans_do(c,
+ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
+ bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+}
+
struct recovery_pass_fn {
int (*fn)(struct bch_fs *);
unsigned when;
@@ -62,255 +254,393 @@ static struct recovery_pass_fn recovery_pass_fns[] = {
#undef x
};
-static const u8 passes_to_stable_map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
- BCH_RECOVERY_PASSES()
-#undef x
-};
+static u64 bch2_recovery_passes_match(unsigned flags)
+{
+ u64 ret = 0;
-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+ if (recovery_pass_fns[i].when & flags)
+ ret |= BIT_ULL(i);
+ return ret;
+}
+
+u64 bch2_fsck_recovery_passes(void)
{
- return passes_to_stable_map[pass];
+ return bch2_recovery_passes_match(PASS_FSCK);
}
-u64 bch2_recovery_passes_to_stable(u64 v)
+static void bch2_run_async_recovery_passes(struct bch_fs *c)
{
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(passes_to_stable_map[i]);
- return ret;
+ if (!down_trylock(&c->recovery.run_lock))
+ return;
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
+ goto unlock;
+
+ if (queue_work(system_long_wq, &c->recovery.work))
+ return;
+
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
+unlock:
+ up(&c->recovery.run_lock);
}
-u64 bch2_recovery_passes_from_stable(u64 v)
+static bool recovery_pass_needs_set(struct bch_fs *c,
+ enum bch_recovery_pass pass,
+ enum bch_run_recovery_pass_flags *flags)
{
- static const u8 map[] = {
-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- };
+ struct bch_fs_recovery *r = &c->recovery;
- u64 ret = 0;
- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
- if (v & BIT_ULL(i))
- ret |= BIT_ULL(map[i]);
- return ret;
+ /*
+ * Never run scan_for_btree_nodes persistently: check_topology will run
+ * it if required
+ */
+ if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
+ *flags |= RUN_RECOVERY_PASS_nopersistent;
+
+ if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
+ !bch2_recovery_pass_want_ratelimit(c, pass))
+ *flags &= ~RUN_RECOVERY_PASS_ratelimit;
+
+ /*
+ * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
+ * anything if the pass has already run: these mean we need a prior pass
+ * to run before we continue to repair, we don't expect that pass to fix
+ * the damage we encountered.
+ *
+ * Otherwise, we run run_explicit_recovery_pass when we find damage, so
+ * it should run again even if it's already run:
+ */
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
+ bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
+ bool rewind = in_recovery &&
+ r->curr_pass > pass &&
+ !(r->passes_complete & BIT_ULL(pass));
+
+ if (persistent
+ ? !(c->sb.recovery_passes_required & BIT_ULL(pass))
+ : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass)))
+ return true;
+
+ if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
+ (r->passes_ratelimiting & BIT_ULL(pass)))
+ return true;
+
+ if (rewind)
+ return true;
+
+ return false;
}
/*
* For when we need to rewind recovery passes and run a pass we skipped:
*/
-static int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
+int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
+ struct printbuf *out,
+ enum bch_recovery_pass pass,
+ enum bch_run_recovery_pass_flags flags)
{
- if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns))
- return -BCH_ERR_not_in_recovery;
+ struct bch_fs_recovery *r = &c->recovery;
+ int ret = 0;
- if (c->recovery_passes_complete & BIT_ULL(pass))
- return 0;
+ lockdep_assert_held(&c->sb_lock);
+
+ bch2_printbuf_make_room(out, 1024);
+ out->atomic++;
+
+ unsigned long lockflags;
+ spin_lock_irqsave(&r->lock, lockflags);
- bool print = !(c->opts.recovery_passes & BIT_ULL(pass));
+ if (!recovery_pass_needs_set(c, pass, &flags))
+ goto out;
+
+ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
+ bool rewind = in_recovery &&
+ r->curr_pass > pass &&
+ !(r->passes_complete & BIT_ULL(pass));
+ bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;
+
+ if (!(flags & RUN_RECOVERY_PASS_nopersistent)) {
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+ }
if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
- c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
- if (print)
- bch_info(c, "need recovery pass %s (%u), but already rw",
- bch2_recovery_passes[pass], pass);
- return -BCH_ERR_cannot_rewind_recovery;
+ (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) {
+ prt_printf(out, "need recovery pass %s (%u), but already rw\n",
+ bch2_recovery_passes[pass], pass);
+ ret = bch_err_throw(c, cannot_rewind_recovery);
+ goto out;
}
- if (print)
- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
- bch2_recovery_passes[pass], pass,
- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+ if (ratelimit)
+ r->passes_ratelimiting |= BIT_ULL(pass);
+ else
+ r->passes_ratelimiting &= ~BIT_ULL(pass);
+
+ if (in_recovery && !ratelimit) {
+ prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
+ bch2_recovery_passes[pass], pass,
+ bch2_recovery_passes[r->curr_pass], r->curr_pass,
+ rewind ? " - rewinding" : "");
- c->opts.recovery_passes |= BIT_ULL(pass);
+ r->passes_to_run |= BIT_ULL(pass);
- if (c->curr_recovery_pass > pass) {
- c->next_recovery_pass = pass;
- c->recovery_passes_complete &= (1ULL << pass) >> 1;
- return -BCH_ERR_restart_recovery;
+ if (rewind) {
+ r->next_pass = pass;
+ r->passes_complete &= (1ULL << pass) >> 1;
+ ret = bch_err_throw(c, restart_recovery);
+ }
} else {
- return 0;
+ prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
+ bch2_recovery_passes[pass], pass,
+ ratelimit ? " - ratelimiting" : "");
+
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ if (p->when & PASS_ONLINE)
+ bch2_run_async_recovery_passes(c);
}
+out:
+ spin_unlock_irqrestore(&r->lock, lockflags);
+ --out->atomic;
+ return ret;
}
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
- enum bch_recovery_pass pass)
+ struct printbuf *out,
+ enum bch_recovery_pass pass,
+ enum bch_run_recovery_pass_flags flags)
{
- unsigned long flags;
- spin_lock_irqsave(&c->recovery_pass_lock, flags);
- int ret = __bch2_run_explicit_recovery_pass(c, pass);
- spin_unlock_irqrestore(&c->recovery_pass_lock, flags);
+ int ret = 0;
+
+ if (recovery_pass_needs_set(c, pass, &flags)) {
+ guard(mutex)(&c->sb_lock);
+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
+ bch2_write_super(c);
+ }
+
return ret;
}
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
- enum bch_recovery_pass pass)
+/*
+ * Returns 0 if @pass has run recently, otherwise one of
+ * -BCH_ERR_restart_recovery
+ * -BCH_ERR_recovery_pass_will_run
+ */
+int bch2_require_recovery_pass(struct bch_fs *c,
+ struct printbuf *out,
+ enum bch_recovery_pass pass)
{
- lockdep_assert_held(&c->sb_lock);
-
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
+ if (test_bit(BCH_FS_in_recovery, &c->flags) &&
+ c->recovery.passes_complete & BIT_ULL(pass))
+ return 0;
- return bch2_run_explicit_recovery_pass(c, pass);
-}
+ guard(mutex)(&c->sb_lock);
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
- enum bch_recovery_pass pass)
-{
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+ if (bch2_recovery_pass_want_ratelimit(c, pass))
+ return 0;
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ enum bch_run_recovery_pass_flags flags = 0;
+ int ret = 0;
- if (!test_bit_le64(s, ext->recovery_passes_required)) {
- __set_bit_le64(s, ext->recovery_passes_required);
+ if (recovery_pass_needs_set(c, pass, &flags)) {
+ ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags);
bch2_write_super(c);
}
- mutex_unlock(&c->sb_lock);
- return bch2_run_explicit_recovery_pass(c, pass);
+ return ret ?: bch_err_throw(c, recovery_pass_will_run);
}
-static void bch2_clear_recovery_pass_required(struct bch_fs *c,
- enum bch_recovery_pass pass)
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+ enum bch_run_recovery_pass_flags flags = 0;
- mutex_lock(&c->sb_lock);
- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ if (!recovery_pass_needs_set(c, pass, &flags))
+ return 0;
- if (test_bit_le64(s, ext->recovery_passes_required)) {
- __clear_bit_le64(s, ext->recovery_passes_required);
- bch2_write_super(c);
- }
- mutex_unlock(&c->sb_lock);
-}
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
-u64 bch2_fsck_recovery_passes(void)
-{
- u64 ret = 0;
+ mutex_lock(&c->sb_lock);
+ int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass,
+ RUN_RECOVERY_PASS_nopersistent);
+ mutex_unlock(&c->sb_lock);
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
- if (recovery_pass_fns[i].when & PASS_FSCK)
- ret |= BIT_ULL(i);
+ bch2_print_str(c, KERN_NOTICE, buf.buf);
+ printbuf_exit(&buf);
return ret;
}
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
- if (c->opts.recovery_passes_exclude & BIT_ULL(pass))
- return false;
- if (c->opts.recovery_passes & BIT_ULL(pass))
- return true;
- if ((p->when & PASS_FSCK) && c->opts.fsck)
- return true;
- if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
- return true;
- if (p->when & PASS_ALWAYS)
- return true;
- return false;
-}
-
static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
+ struct bch_fs_recovery *r = &c->recovery;
struct recovery_pass_fn *p = recovery_pass_fns + pass;
- int ret;
if (!(p->when & PASS_SILENT))
bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
+
+ s64 start_time = ktime_get_real_seconds();
+ int ret = p->fn(c);
+
+ r->passes_to_run &= ~BIT_ULL(pass);
+
+ if (ret) {
+ r->passes_failing |= BIT_ULL(pass);
return ret;
+ }
+
+ r->passes_failing = 0;
+
+ if (!test_bit(BCH_FS_error, &c->flags))
+ bch2_sb_recovery_pass_complete(c, pass, start_time);
+
if (!(p->when & PASS_SILENT))
bch2_print(c, KERN_CONT " done\n");
return 0;
}
-int bch2_run_online_recovery_passes(struct bch_fs *c)
+static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run,
+ bool online)
{
+ struct bch_fs_recovery *r = &c->recovery;
int ret = 0;
- down_read(&c->state_lock);
+ spin_lock_irq(&r->lock);
- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
- struct recovery_pass_fn *p = recovery_pass_fns + i;
+ if (online)
+ orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE);
- if (!(p->when & PASS_ONLINE))
- continue;
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
+ orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);
- ret = bch2_run_recovery_pass(c, i);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
- i = c->curr_recovery_pass;
- continue;
+ /*
+ * A failed recovery pass will be retried after another pass succeeds -
+ * but not this iteration.
+ *
+ * This is because some passes depend on repair done by other passes: we
+ * may want to retry, but we don't want to loop on failing passes.
+ */
+
+ orig_passes_to_run &= ~r->passes_failing;
+
+ r->passes_to_run = orig_passes_to_run;
+
+ while (r->passes_to_run) {
+ unsigned prev_done = r->pass_done;
+ unsigned pass = __ffs64(r->passes_to_run);
+ r->curr_pass = pass;
+ r->next_pass = r->curr_pass + 1;
+ r->passes_to_run &= ~BIT_ULL(pass);
+
+ spin_unlock_irq(&r->lock);
+
+ int ret2 = bch2_run_recovery_pass(c, pass) ?:
+ bch2_journal_flush(&c->journal);
+
+ spin_lock_irq(&r->lock);
+
+ if (r->next_pass < r->curr_pass) {
+ /* Rewind: */
+ r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass);
+ } else if (!ret2) {
+ r->pass_done = max(r->pass_done, pass);
+ r->passes_complete |= BIT_ULL(pass);
+ } else {
+ ret = ret2;
}
- if (ret)
+
+ if (ret && !online)
break;
+
+ if (prev_done <= BCH_RECOVERY_PASS_check_snapshots &&
+ r->pass_done > BCH_RECOVERY_PASS_check_snapshots) {
+ bch2_copygc_wakeup(c);
+ bch2_rebalance_wakeup(c);
+ }
}
- up_read(&c->state_lock);
+ clear_bit(BCH_FS_in_recovery, &c->flags);
+ spin_unlock_irq(&r->lock);
return ret;
}
-int bch2_run_recovery_passes(struct bch_fs *c)
+static void bch2_async_recovery_passes_work(struct work_struct *work)
{
- int ret = 0;
+ struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
+ struct bch_fs_recovery *r = &c->recovery;
+
+ __bch2_run_recovery_passes(c,
+ c->sb.recovery_passes_required & ~r->passes_ratelimiting,
+ true);
+
+ up(&r->run_lock);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes)
+{
+ return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true);
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from)
+{
+ u64 passes =
+ bch2_recovery_passes_match(PASS_ALWAYS) |
+ (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
+ (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
+ c->opts.recovery_passes |
+ c->sb.recovery_passes_required;
+
+ if (c->opts.recovery_pass_last)
+ passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;
/*
* We can't allow set_may_go_rw to be excluded; that would cause us to
* use the journal replay keys for updates where it's not expected.
*/
c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
+ passes &= ~c->opts.recovery_passes_exclude;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) {
- c->next_recovery_pass = c->curr_recovery_pass + 1;
+ passes &= ~(BIT_ULL(from) - 1);
- spin_lock_irq(&c->recovery_pass_lock);
- unsigned pass = c->curr_recovery_pass;
+ down(&c->recovery.run_lock);
+ int ret = __bch2_run_recovery_passes(c, passes, false);
+ up(&c->recovery.run_lock);
- if (c->opts.recovery_pass_last &&
- c->curr_recovery_pass > c->opts.recovery_pass_last) {
- spin_unlock_irq(&c->recovery_pass_lock);
- break;
- }
-
- if (!should_run_recovery_pass(c, pass)) {
- c->curr_recovery_pass++;
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
- spin_unlock_irq(&c->recovery_pass_lock);
- continue;
- }
- spin_unlock_irq(&c->recovery_pass_lock);
+ return ret;
+}
- ret = bch2_run_recovery_pass(c, pass) ?:
- bch2_journal_flush(&c->journal);
+static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
+{
+ prt_printf(out, "%s:\t", msg);
+ prt_bitflags(out, bch2_recovery_passes, passes);
+ prt_newline(out);
+}
- if (!ret && !test_bit(BCH_FS_error, &c->flags))
- bch2_clear_recovery_pass_required(c, pass);
-
- spin_lock_irq(&c->recovery_pass_lock);
- if (c->next_recovery_pass < c->curr_recovery_pass) {
- /*
- * bch2_run_explicit_recovery_pass() was called: we
- * can't always catch -BCH_ERR_restart_recovery because
- * it may have been called from another thread (btree
- * node read completion)
- */
- ret = 0;
- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass);
- } else {
- c->recovery_passes_complete |= BIT_ULL(pass);
- c->recovery_pass_done = max(c->recovery_pass_done, pass);
- }
- c->curr_recovery_pass = c->next_recovery_pass;
- spin_unlock_irq(&c->recovery_pass_lock);
+void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct bch_fs_recovery *r = &c->recovery;
+
+ printbuf_tabstop_push(out, 32);
+ prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required);
+ prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required &
+ bch2_recovery_passes_match(PASS_ONLINE));
+ prt_passes(out, "Complete passes", r->passes_complete);
+ prt_passes(out, "Failing passes", r->passes_failing);
+
+ if (r->curr_pass) {
+ prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]);
+ prt_passes(out, "Current passes", r->passes_to_run);
}
+}
- return ret;
+void bch2_fs_recovery_passes_init(struct bch_fs *c)
+{
+ spin_lock_init(&c->recovery.lock);
+ sema_init(&c->recovery.run_lock, 1);
+
+ INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
index 7d7339c8fa29..2117f0ce1922 100644
--- a/fs/bcachefs/recovery_passes.h
+++ b/fs/bcachefs/recovery_passes.h
@@ -3,16 +3,46 @@
extern const char * const bch2_recovery_passes[];
+extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes;
+
u64 bch2_recovery_passes_to_stable(u64 v);
u64 bch2_recovery_passes_from_stable(u64 v);
u64 bch2_fsck_recovery_passes(void);
-int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass);
+
+enum bch_run_recovery_pass_flags {
+ RUN_RECOVERY_PASS_nopersistent = BIT(0),
+ RUN_RECOVERY_PASS_ratelimit = BIT(1),
+};
+
+static inline bool go_rw_in_recovery(struct bch_fs *c)
+{
+ return (c->journal_keys.nr ||
+ !c->opts.read_only ||
+ !c->sb.clean ||
+ c->opts.recovery_passes ||
+ (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))));
+}
+
+int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+
+int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass,
+ enum bch_run_recovery_pass_flags);
+int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass,
+ enum bch_run_recovery_pass_flags);
+
+int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *,
+ enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *, u64);
+int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass);
+
+void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *);
-int bch2_run_online_recovery_passes(struct bch_fs *);
-int bch2_run_recovery_passes(struct bch_fs *);
+void bch2_fs_recovery_passes_init(struct bch_fs *);
#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h
new file mode 100644
index 000000000000..b63c20558d3d
--- /dev/null
+++ b/fs/bcachefs/recovery_passes_format.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H
+#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H
+
+#define PASS_SILENT BIT(0)
+#define PASS_FSCK BIT(1)
+#define PASS_UNCLEAN BIT(2)
+#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
+#define PASS_ALLOC BIT(5)
+#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC)
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+#define PASS_FSCK_DEBUG BIT(1)
+#else
+#define PASS_FSCK_DEBUG 0
+#endif
+
+/*
+ * Passes may be reordered, but the second field is a persistent identifier and
+ * must never change:
+ */
+#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
+ x(scan_for_btree_nodes, 37, 0) \
+ x(check_topology, 4, 0) \
+ x(accounting_read, 39, PASS_ALWAYS) \
+ x(alloc_read, 0, PASS_ALWAYS) \
+ x(stripes_read, 1, 0) \
+ x(initialize_subvolumes, 2, 0) \
+ x(snapshots_read, 3, PASS_ALWAYS) \
+ x(check_allocations, 5, PASS_FSCK_ALLOC) \
+ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
+ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \
+ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
+ x(journal_replay, 9, PASS_ALWAYS) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \
+ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
+ x(bucket_gens_init, 17, 0) \
+ x(reconstruct_snapshots, 38, 0) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
+ x(fs_upgrade_for_subvolumes, 22, 0) \
+ x(check_inodes, 24, PASS_FSCK) \
+ x(check_extents, 25, PASS_FSCK) \
+ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
+ x(check_dirents, 27, PASS_FSCK) \
+ x(check_xattrs, 28, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_FSCK) \
+ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
+ x(check_nlinks, 31, PASS_FSCK) \
+ x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \
+ x(resume_logged_ops, 23, PASS_ALWAYS) \
+ x(delete_dead_inodes, 32, PASS_ALWAYS) \
+ x(fix_reflink_p, 33, 0) \
+ x(set_fs_needs_rebalance, 34, 0) \
+ x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT)
+
+/* We normally enumerate recovery passes in the order we run them: */
+enum bch_recovery_pass {
+#define x(n, id, when) BCH_RECOVERY_PASS_##n,
+ BCH_RECOVERY_PASSES()
+#undef x
+ BCH_RECOVERY_PASS_NR
+};
+
+/* But we also need stable identifiers that can be used in the superblock */
+enum bch_recovery_pass_stable {
+#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
+ BCH_RECOVERY_PASSES()
+#undef x
+};
+
+struct recovery_pass_entry {
+ __le64 last_run;
+ __le32 last_runtime;
+ __le32 flags;
+};
+
+LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1)
+
+struct bch_sb_field_recovery_passes {
+ struct bch_sb_field field;
+ struct recovery_pass_entry start[];
+};
+
+static inline unsigned
+recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r)
+{
+ return r
+ ? ((vstruct_end(&r->field) - (void *) &r->start[0]) /
+ sizeof(struct recovery_pass_entry))
+ : 0;
+}
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 418557960ed6..aa9526938cc3 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -2,79 +2,26 @@
#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
-#define PASS_SILENT BIT(0)
-#define PASS_FSCK BIT(1)
-#define PASS_UNCLEAN BIT(2)
-#define PASS_ALWAYS BIT(3)
-#define PASS_ONLINE BIT(4)
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-#define PASS_FSCK_DEBUG BIT(1)
-#else
-#define PASS_FSCK_DEBUG 0
-#endif
-
-/*
- * Passes may be reordered, but the second field is a persistent identifier and
- * must never change:
- */
-#define BCH_RECOVERY_PASSES() \
- x(recovery_pass_empty, 41, PASS_SILENT) \
- x(scan_for_btree_nodes, 37, 0) \
- x(check_topology, 4, 0) \
- x(accounting_read, 39, PASS_ALWAYS) \
- x(alloc_read, 0, PASS_ALWAYS) \
- x(stripes_read, 1, PASS_ALWAYS) \
- x(initialize_subvolumes, 2, 0) \
- x(snapshots_read, 3, PASS_ALWAYS) \
- x(check_allocations, 5, PASS_FSCK) \
- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \
- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
- x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \
- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
- x(bucket_gens_init, 17, 0) \
- x(reconstruct_snapshots, 38, 0) \
- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
- x(fs_upgrade_for_subvolumes, 22, 0) \
- x(check_inodes, 24, PASS_FSCK) \
- x(check_extents, 25, PASS_FSCK) \
- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \
- x(check_dirents, 27, PASS_FSCK) \
- x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
- x(check_unreachable_inodes, 40, PASS_FSCK) \
- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
- x(check_nlinks, 31, PASS_FSCK) \
- x(resume_logged_ops, 23, PASS_ALWAYS) \
- x(delete_dead_inodes, 32, PASS_ALWAYS) \
- x(fix_reflink_p, 33, 0) \
- x(set_fs_needs_rebalance, 34, 0)
-
-/* We normally enumerate recovery passes in the order we run them: */
-enum bch_recovery_pass {
-#define x(n, id, when) BCH_RECOVERY_PASS_##n,
- BCH_RECOVERY_PASSES()
-#undef x
- BCH_RECOVERY_PASS_NR
-};
-
-/* But we also need stable identifiers that can be used in the superblock */
-enum bch_recovery_pass_stable {
-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id,
- BCH_RECOVERY_PASSES()
-#undef x
+struct bch_fs_recovery {
+ /*
+ * Two different uses:
+ * "Has this fsck pass?" - i.e. should this type of error be an
+ * emergency read-only
+ * And, in certain situations fsck will rewind to an earlier pass: used
+ * for signaling to the toplevel code which pass we want to run now.
+ */
+ enum bch_recovery_pass curr_pass;
+ enum bch_recovery_pass next_pass;
+ /* never rewinds version of curr_pass */
+ enum bch_recovery_pass pass_done;
+ u64 passes_to_run;
+ /* bitmask of recovery passes that we actually ran */
+ u64 passes_complete;
+ u64 passes_failing;
+ u64 passes_ratelimiting;
+ spinlock_t lock;
+ struct semaphore run_lock;
+ struct work_struct work;
};
#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 93ba4f4e47ca..92b90cfe622b 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -3,6 +3,7 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "enumerated_ref.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
@@ -63,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
REFLINK_P_IDX(p.v),
le32_to_cpu(p.v->front_pad),
le32_to_cpu(p.v->back_pad));
+
+ if (REFLINK_P_ERROR(p.v))
+ prt_str(out, " error");
}
bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
@@ -172,7 +176,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
bool should_commit)
{
if (REFLINK_P_ERROR(p.v))
- return -BCH_ERR_missing_indirect_extent;
+ return 0;
struct bch_fs *c = trans->c;
u64 live_start = REFLINK_P_IDX(p.v);
@@ -185,12 +189,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans,
BUG_ON(missing_start < refd_start);
BUG_ON(missing_end > refd_end);
- if (fsck_err(trans, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- missing_start, missing_end)) {
+ struct bpos missing_pos = bkey_start_pos(p.k);
+ missing_pos.offset += missing_start - live_start;
+
+ prt_printf(&buf, "pointer to missing indirect extent in ");
+ ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos);
+ if (ret)
+ goto err;
+
+ prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9);
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+
+ prt_printf(&buf, "\nmissing reflink btree range %llu-%llu",
+ missing_start, missing_end);
+
+ if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) {
struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
@@ -259,23 +272,22 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans,
return k;
if (unlikely(!bkey_extent_is_reflink_data(k.k))) {
- bch2_trans_iter_exit(trans, iter);
-
- unsigned size = min((u64) k.k->size,
- REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) -
- reflink_offset);
- bch2_key_resize(&iter->k, size);
+ u64 missing_end = min(k.k->p.offset,
+ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad));
+ BUG_ON(reflink_offset == missing_end);
int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset,
- k.k->p.offset, should_commit);
- if (ret)
+ missing_end, should_commit);
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
+ }
} else if (unlikely(REFLINK_P_ERROR(p.v))) {
- bch2_trans_iter_exit(trans, iter);
-
int ret = bch2_indirect_extent_not_missing(trans, p, should_commit);
- if (ret)
+ if (ret) {
+ bch2_trans_iter_exit(trans, iter);
return bkey_s_c_err(ret);
+ }
}
*offset_into_extent = reflink_offset - bkey_start_offset(k.k);
@@ -300,9 +312,9 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
if (ret)
return ret;
- if (bkey_deleted(k.k)) {
+ if (!bkey_refcount_c(k)) {
if (!(flags & BTREE_TRIGGER_overwrite))
- ret = -BCH_ERR_missing_indirect_extent;
+ ret = bch_err_throw(c, missing_indirect_extent);
goto next;
}
@@ -314,10 +326,10 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
__le64 *refcount = bkey_refcount(bkey_i_to_s(new));
if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) {
bch2_bkey_val_to_text(&buf, c, p.s_c);
- prt_printf(&buf, "\n ");
+ prt_newline(&buf);
bch2_bkey_val_to_text(&buf, c, k);
log_fsck_err(trans, reflink_refcount_underflow,
- "indirect extent refcount underflow while marking\n %s",
+ "indirect extent refcount underflow while marking\n%s",
buf.buf);
goto next;
}
@@ -486,7 +498,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bool reflink_p_may_update_opts_field)
{
struct bch_fs *c = trans->c;
- struct btree_iter reflink_iter = { NULL };
+ struct btree_iter reflink_iter = {};
struct bkey_s_c k;
struct bkey_i *r_v;
struct bkey_i_reflink_p *r_p;
@@ -498,7 +510,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_prev(&reflink_iter);
+ k = bch2_btree_iter_peek_prev(trans, &reflink_iter);
ret = bkey_err(k);
if (ret)
goto err;
@@ -560,12 +572,13 @@ err:
return ret;
}
-static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
+static struct bkey_s_c get_next_src(struct btree_trans *trans,
+ struct btree_iter *iter, struct bpos end)
{
struct bkey_s_c k;
int ret;
- for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) {
+ for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) {
if (bkey_extent_is_unwritten(k))
continue;
@@ -574,7 +587,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
}
if (bkey_ge(iter->pos, end))
- bch2_btree_iter_set_pos(iter, end);
+ bch2_btree_iter_set_pos(trans, iter, end);
return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
}
@@ -597,11 +610,11 @@ s64 bch2_remap_range(struct bch_fs *c,
u64 dst_done = 0;
u32 dst_snapshot, src_snapshot;
bool reflink_p_may_update_opts_field =
- bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
+ !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts);
int ret = 0, ret2 = 0;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
- return -BCH_ERR_erofs_no_writes;
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink))
+ return bch_err_throw(c, erofs_no_writes);
bch2_check_set_feature(c, BCH_FEATURE_reflink);
@@ -638,27 +651,27 @@ s64 bch2_remap_range(struct bch_fs *c,
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+ bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot);
ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
&dst_snapshot);
if (ret)
continue;
- bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot);
if (dst_inum.inum < src_inum.inum) {
/* Avoid some lock cycle transaction restarts */
- ret = bch2_btree_iter_traverse(&dst_iter);
+ ret = bch2_btree_iter_traverse(trans, &dst_iter);
if (ret)
continue;
}
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
- bch2_btree_iter_set_pos(&src_iter, src_want);
+ bch2_btree_iter_set_pos(trans, &src_iter, src_want);
- src_k = get_next_src(&src_iter, src_end);
+ src_k = get_next_src(trans, &src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
continue;
@@ -700,7 +713,8 @@ s64 bch2_remap_range(struct bch_fs *c,
SET_REFLINK_P_IDX(&dst_p->v, offset);
if (reflink_p_may_update_opts_field &&
- may_change_src_io_path_opts)
+ may_change_src_io_path_opts &&
+ REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v))
SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true);
} else {
BUG();
@@ -729,7 +743,7 @@ s64 bch2_remap_range(struct bch_fs *c,
do {
struct bch_inode_unpacked inode_u;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
bch2_trans_begin(trans);
@@ -751,7 +765,7 @@ err:
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
- bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink);
return dst_done ?: ret ?: ret2;
}
@@ -786,8 +800,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
if (fsck_err_on(r->refcount != le64_to_cpu(*refcount),
trans, reflink_v_refcount_wrong,
"reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
+ "%s\n"
+ "should be %u",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
r->refcount)) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
@@ -836,7 +850,7 @@ int bch2_gc_reflink_start(struct bch_fs *c)
struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
c->reflink_gc_nr++, GFP_KERNEL);
if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ ret = bch_err_throw(c, ENOMEM_gc_reflink_start);
break;
}
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 477ef0997949..8383bd7fdb3f 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
return 0;
bad:
bch2_replicas_entry_to_text(err, r);
- return -BCH_ERR_invalid_replicas_entry;
+ return bch_err_throw(c, invalid_replicas_entry);
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
!__replicas_has_entry(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto err;
}
}
@@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
if (!__replicas_has_entry(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries) {
- ret = -BCH_ERR_ENOMEM_cpu_replicas;
+ ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto err;
}
@@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
if (!c->replicas_gc.entries) {
mutex_unlock(&c->sb_lock);
bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
+ return bch_err_throw(c, ENOMEM_replicas_gc);
}
for_each_cpu_replicas_entry(&c->replicas, e)
@@ -458,7 +458,7 @@ retry:
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
if (!new.entries) {
bch_err(c, "error allocating c->replicas_gc");
- return -BCH_ERR_ENOMEM_replicas_gc;
+ return bch_err_throw(c, ENOMEM_replicas_gc);
}
mutex_lock(&c->sb_lock);
@@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
+ return bch_err_throw(c, ENOSPC_sb_replicas);
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
@@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
DIV_ROUND_UP(bytes, sizeof(u64)));
if (!sb_r)
- return -BCH_ERR_ENOSPC_sb_replicas;
+ return bch_err_throw(c, ENOSPC_sb_replicas);
bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
@@ -819,19 +819,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
if (e->data_type == BCH_DATA_cached)
continue;
- rcu_read_lock();
- for (unsigned i = 0; i < e->nr_devs; i++) {
- if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
- nr_failed++;
- continue;
- }
+ scoped_guard(rcu)
+ for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
- nr_online += test_bit(e->devs[i], devs.d);
+ nr_online += test_bit(e->devs[i], devs.d);
- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
- }
- rcu_read_unlock();
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
+ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed;
+ }
if (nr_online + nr_failed == e->nr_devs)
continue;
diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c
index 6992e7469112..2b4b8445d418 100644
--- a/fs/bcachefs/sb-counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -5,7 +5,13 @@
/* BCH_SB_FIELD_counters */
-static const char * const bch2_counter_names[] = {
+static const u8 counters_to_stable_map[] = {
+#define x(n, id, ...) [BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+};
+
+const char * const bch2_counter_names[] = {
#define x(t, n, ...) (#t),
BCH_PERSISTENT_COUNTERS()
#undef x
@@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs)
return 0;
return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0];
-};
+}
static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f,
enum bch_validate_flags flags, struct printbuf *err)
{
return 0;
-};
+}
static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field *f)
@@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
struct bch_sb_field_counters *ctrs = field_to_type(f, counters);
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- for (unsigned i = 0; i < nr; i++)
- prt_printf(out, "%s \t%llu\n",
- i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)",
- le64_to_cpu(ctrs->d[i]));
-};
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr)
+ prt_printf(out, "%s \t%llu\n",
+ bch2_counter_names[i],
+ le64_to_cpu(ctrs->d[stable]));
+ }
+}
int bch2_sb_counters_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
- u64 val = 0;
- for (i = 0; i < BCH_COUNTER_NR; i++)
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++)
c->counters_on_mount[i] = 0;
- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) {
- val = le64_to_cpu(ctrs->d[i]);
- percpu_u64_set(&c->counters[i], val);
- c->counters_on_mount[i] = val;
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr) {
+ u64 v = le64_to_cpu(ctrs->d[stable]);
+ percpu_u64_set(&c->counters[i], v);
+ c->counters_on_mount[i] = v;
+ }
}
+
return 0;
-};
+}
int bch2_sb_counters_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
struct bch_sb_field_counters *ret;
- unsigned int i;
unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
if (nr < BCH_COUNTER_NR) {
ret = bch2_sb_field_resize(&c->disk_sb, counters,
- sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
-
+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
if (ret) {
ctrs = ret;
nr = bch2_sb_counter_nr_entries(ctrs);
}
}
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+ if (stable < nr)
+ ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
+ }
- for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++)
- ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i]));
return 0;
}
@@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = {
.validate = bch2_sb_counters_validate,
.to_text = bch2_sb_counters_to_text,
};
+
+#ifndef NO_BCACHEFS_CHARDEV
+long bch2_ioctl_query_counters(struct bch_fs *c,
+ struct bch_ioctl_query_counters __user *user_arg)
+{
+ struct bch_ioctl_query_counters arg;
+ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg));
+ if (ret)
+ return ret;
+
+ if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) ||
+ arg.pad)
+ return -EINVAL;
+
+ arg.nr = min(arg.nr, BCH_COUNTER_NR);
+ ret = put_user(arg.nr, &user_arg->nr);
+ if (ret)
+ return ret;
+
+ for (unsigned i = 0; i < BCH_COUNTER_NR; i++) {
+ unsigned stable = counters_to_stable_map[i];
+
+ if (stable < arg.nr) {
+ u64 v = !(arg.flags & BCH_IOCTL_QUERY_COUNTERS_MOUNT)
+ ? percpu_u64_get(&c->counters[i])
+ : c->counters_on_mount[i];
+
+ ret = put_user(v, &user_arg->d[stable]);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+#endif
diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h
index 81f8aec9fcb1..a4329ad8dd1b 100644
--- a/fs/bcachefs/sb-counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *);
void bch2_fs_counters_exit(struct bch_fs *);
int bch2_fs_counters_init(struct bch_fs *);
+extern const char * const bch2_counter_names[];
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
+long bch2_ioctl_query_counters(struct bch_fs *,
+ struct bch_ioctl_query_counters __user *);
+
#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
index fdcf598f08b1..b868702a431a 100644
--- a/fs/bcachefs/sb-counters_format.h
+++ b/fs/bcachefs/sb-counters_format.h
@@ -9,10 +9,27 @@ enum counters_flags {
#define BCH_PERSISTENT_COUNTERS() \
x(io_read, 0, TYPE_SECTORS) \
+ x(io_read_inline, 80, TYPE_SECTORS) \
+ x(io_read_hole, 81, TYPE_SECTORS) \
+ x(io_read_promote, 30, TYPE_COUNTER) \
+ x(io_read_bounce, 31, TYPE_COUNTER) \
+ x(io_read_split, 33, TYPE_COUNTER) \
+ x(io_read_reuse_race, 34, TYPE_COUNTER) \
+ x(io_read_retry, 32, TYPE_COUNTER) \
+ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \
x(io_write, 1, TYPE_SECTORS) \
x(io_move, 2, TYPE_SECTORS) \
+ x(io_move_read, 35, TYPE_SECTORS) \
+ x(io_move_write, 36, TYPE_SECTORS) \
+ x(io_move_finish, 37, TYPE_SECTORS) \
+ x(io_move_fail, 38, TYPE_COUNTER) \
+ x(io_move_write_fail, 82, TYPE_COUNTER) \
+ x(io_move_start_fail, 39, TYPE_COUNTER) \
+ x(io_move_created_rebalance, 83, TYPE_COUNTER) \
+ x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \
x(bucket_invalidate, 3, TYPE_COUNTER) \
x(bucket_discard, 4, TYPE_COUNTER) \
+ x(bucket_discard_fast, 79, TYPE_COUNTER) \
x(bucket_alloc, 5, TYPE_COUNTER) \
x(bucket_alloc_fail, 6, TYPE_COUNTER) \
x(btree_cache_scan, 7, TYPE_COUNTER) \
@@ -38,16 +55,6 @@ enum counters_flags {
x(journal_reclaim_finish, 27, TYPE_COUNTER) \
x(journal_reclaim_start, 28, TYPE_COUNTER) \
x(journal_write, 29, TYPE_COUNTER) \
- x(read_promote, 30, TYPE_COUNTER) \
- x(read_bounce, 31, TYPE_COUNTER) \
- x(read_split, 33, TYPE_COUNTER) \
- x(read_retry, 32, TYPE_COUNTER) \
- x(read_reuse_race, 34, TYPE_COUNTER) \
- x(move_extent_read, 35, TYPE_SECTORS) \
- x(move_extent_write, 36, TYPE_SECTORS) \
- x(move_extent_finish, 37, TYPE_SECTORS) \
- x(move_extent_fail, 38, TYPE_COUNTER) \
- x(move_extent_start_fail, 39, TYPE_COUNTER) \
x(copygc, 40, TYPE_COUNTER) \
x(copygc_wait, 41, TYPE_COUNTER) \
x(gc_gens_end, 42, TYPE_COUNTER) \
@@ -95,6 +102,13 @@ enum bch_persistent_counters {
BCH_COUNTER_NR
};
+enum bch_persistent_counters_stable {
+#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_STABLE_NR
+};
+
struct bch_sb_field_counters {
struct bch_sb_field field;
__le64 d[];
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 14f6b6a5fb38..1506d05e0665 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -20,6 +20,10 @@
* x(version, recovery_passes, errors...)
*/
#define UPGRADE_TABLE() \
+ x(snapshot_2, \
+ RECOVERY_PASS_ALL_FSCK, \
+ BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \
+ BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \
x(backpointers, \
RECOVERY_PASS_ALL_FSCK) \
x(inode_v3, \
@@ -91,9 +95,16 @@
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \
BCH_FSCK_ERR_accounting_key_junk_at_end) \
- x(directory_size, \
+ x(cached_backpointers, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(stripe_backpointers, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\
+ BCH_FSCK_ERR_ptr_to_missing_backpointer) \
+ x(inode_has_case_insensitive, \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
- BCH_FSCK_ERR_directory_size_mismatch) \
+ BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \
+ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
@@ -242,6 +253,7 @@ DOWNGRADE_TABLE()
static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
{
+ unsigned dst_offset = table->nr;
struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table);
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors);
int ret = 0;
@@ -257,6 +269,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
if (ret)
return ret;
+ dst = (void *) &table->data[dst_offset];
+ dst->nr_errors = cpu_to_le16(nr_errors + 1);
+
/* open coded __set_bit_le64, as dst is packed and
* dst->recovery_passes is misaligned */
unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations;
@@ -267,7 +282,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table)
break;
}
- dst->nr_errors = cpu_to_le16(nr_errors);
return ret;
}
@@ -367,6 +381,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
continue;
+ if (src->version < c->sb.version_incompat)
+ continue;
+
struct bch_sb_field_downgrade_entry *dst;
unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors;
@@ -403,7 +420,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s);
if (!d) {
- ret = -BCH_ERR_ENOSPC_sb_downgrade;
+ ret = bch_err_throw(c, ENOSPC_sb_downgrade);
goto out;
}
diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c
index 013a96883b4e..48853efdc105 100644
--- a/fs/bcachefs/sb-errors.c
+++ b/fs/bcachefs/sb-errors.c
@@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = {
.to_text = bch2_sb_errors_to_text,
};
+void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ if (out->nr_tabstops < 1)
+ printbuf_tabstop_push(out, 48);
+ if (out->nr_tabstops < 2)
+ printbuf_tabstop_push(out, 8);
+ if (out->nr_tabstops < 3)
+ printbuf_tabstop_push(out, 16);
+
+ guard(mutex)(&c->fsck_error_counts_lock);
+
+ bch_sb_errors_cpu *e = &c->fsck_error_counts;
+ darray_for_each(*e, i) {
+ bch2_sb_error_id_to_text(out, i->id);
+ prt_tab(out);
+ prt_u64(out, i->nr);
+ prt_tab(out);
+ bch2_prt_datetime(out, i->last_error_time);
+ prt_newline(out);
+ }
+}
+
void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
{
bch_sb_errors_cpu *e = &c->fsck_error_counts;
diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h
index b2357b8e6107..e86267264692 100644
--- a/fs/bcachefs/sb-errors.h
+++ b/fs/bcachefs/sb-errors.h
@@ -7,6 +7,7 @@
extern const char * const bch2_sb_error_strs[];
void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id);
+void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index ea0a18364751..d154b7651d28 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -3,10 +3,10 @@
#define _BCACHEFS_SB_ERRORS_FORMAT_H
enum bch_fsck_flags {
- FSCK_CAN_FIX = 1 << 0,
- FSCK_CAN_IGNORE = 1 << 1,
- FSCK_NO_RATELIMIT = 1 << 2,
- FSCK_AUTOFIX = 1 << 3,
+ FSCK_CAN_FIX = BIT(0),
+ FSCK_CAN_IGNORE = BIT(1),
+ FSCK_AUTOFIX = BIT(2),
+ FSCK_ERR_NO_LOG = BIT(3),
};
#define BCH_SB_ERRS() \
@@ -47,7 +47,7 @@ enum bch_fsck_flags {
x(btree_node_unsupported_version, 34, 0) \
x(btree_node_bset_older_than_sb_min, 35, 0) \
x(btree_node_bset_newer_than_sb, 36, 0) \
- x(btree_node_data_missing, 37, 0) \
+ x(btree_node_data_missing, 37, FSCK_AUTOFIX) \
x(btree_node_bset_after_end, 38, 0) \
x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
x(btree_node_replicas_data_mismatch, 40, 0) \
@@ -135,7 +135,7 @@ enum bch_fsck_flags {
x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \
x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
- x(need_discard_freespace_key_bad, 124, 0) \
+ x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \
x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
x(backpointer_bucket_offset_wrong, 125, 0) \
x(backpointer_level_bad, 294, 0) \
@@ -166,7 +166,7 @@ enum bch_fsck_flags {
x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \
x(ptr_to_missing_stripe, 150, 0) \
x(ptr_to_incorrect_stripe, 151, 0) \
- x(ptr_gen_newer_than_bucket_gen, 152, 0) \
+ x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \
x(ptr_too_stale, 153, 0) \
x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \
x(ptr_bucket_data_type_mismatch, 155, 0) \
@@ -179,10 +179,11 @@ enum bch_fsck_flags {
x(ptr_crc_redundant, 160, 0) \
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
+ x(extent_flags_not_at_start, 306, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
- x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
- x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
x(reflink_refcount_underflow, 293, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \
@@ -205,10 +206,11 @@ enum bch_fsck_flags {
x(snapshot_bad_depth, 184, 0) \
x(snapshot_bad_skiplist, 185, 0) \
x(subvol_pos_bad, 186, 0) \
- x(subvol_not_master_and_not_snapshot, 187, 0) \
+ x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \
x(subvol_to_missing_root, 188, 0) \
- x(subvol_root_wrong_bi_subvol, 189, 0) \
+ x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
x(bkey_in_missing_snapshot, 190, 0) \
+ x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \
x(inode_alloc_cursor_inode_bad, 301, 0) \
@@ -216,6 +218,7 @@ enum bch_fsck_flags {
x(inode_str_hash_invalid, 194, 0) \
x(inode_v3_fields_start_bad, 195, 0) \
x(inode_snapshot_mismatch, 196, 0) \
+ x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \
x(inode_unlinked_but_clean, 197, 0) \
x(inode_unlinked_but_nlink_nonzero, 198, 0) \
x(inode_unlinked_and_not_open, 281, 0) \
@@ -230,35 +233,43 @@ enum bch_fsck_flags {
x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \
x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \
x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \
+ x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
- x(inode_has_child_snapshots_wrong, 287, 0) \
+ x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \
+ x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \
+ x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \
+ x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \
+ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \
+ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \
+ x(vfs_bad_inode_rm, 320, 0) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
- x(key_in_missing_inode, 216, 0) \
+ x(key_in_missing_inode, 216, FSCK_AUTOFIX) \
x(key_in_wrong_inode_type, 217, 0) \
- x(extent_past_end_of_inode, 218, 0) \
+ x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \
x(dirent_empty_name, 219, 0) \
x(dirent_val_too_big, 220, 0) \
x(dirent_name_too_long, 221, 0) \
x(dirent_name_embedded_nul, 222, 0) \
x(dirent_name_dot_or_dotdot, 223, 0) \
x(dirent_name_has_slash, 224, 0) \
- x(dirent_d_type_wrong, 225, 0) \
+ x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \
x(inode_bi_parent_wrong, 226, 0) \
x(dirent_in_missing_dir_inode, 227, 0) \
x(dirent_in_non_dir_inode, 228, 0) \
- x(dirent_to_missing_inode, 229, 0) \
+ x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \
x(dirent_to_overwritten_inode, 302, 0) \
x(dirent_to_missing_subvol, 230, 0) \
x(dirent_to_itself, 231, 0) \
+ x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \
x(quota_type_invalid, 232, 0) \
x(xattr_val_size_too_small, 233, 0) \
x(xattr_val_size_too_big, 234, 0) \
@@ -269,8 +280,8 @@ enum bch_fsck_flags {
x(root_dir_missing, 239, 0) \
x(root_inode_not_dir, 240, 0) \
x(dir_loop, 241, 0) \
- x(hash_table_key_duplicate, 242, 0) \
- x(hash_table_key_wrong_offset, 243, 0) \
+ x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \
+ x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \
x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \
x(reflink_p_front_pad_bad, 245, 0) \
x(journal_entry_dup_same_device, 246, 0) \
@@ -291,18 +302,19 @@ enum bch_fsck_flags {
x(btree_node_topology_empty_interior_node, 261, 0) \
x(btree_ptr_v2_min_key_bad, 262, 0) \
x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \
- x(snapshot_node_missing, 264, 0) \
+ x(snapshot_node_missing, 264, FSCK_AUTOFIX) \
x(dup_backpointer_to_bad_csum_extent, 265, 0) \
x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \
x(sb_clean_entry_overrun, 267, 0) \
x(btree_ptr_v2_written_0, 268, 0) \
x(subvol_snapshot_bad, 269, 0) \
x(subvol_inode_bad, 270, 0) \
+ x(subvol_missing, 308, FSCK_AUTOFIX) \
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
x(accounting_to_invalid_device, 289, 0) \
- x(invalid_btree_id, 274, 0) \
+ x(invalid_btree_id, 274, FSCK_AUTOFIX) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \
@@ -310,11 +322,16 @@ enum bch_fsck_flags {
x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
+ x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \
x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
- x(MAX, 304, 0)
+ x(dirent_cf_name_too_big, 304, 0) \
+ x(dirent_stray_data_after_cf_name, 305, 0) \
+ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
+ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
+ x(MAX, 321, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 116131f95815..6245e342a8a8 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -5,19 +5,41 @@
#include "disk_groups.h"
#include "error.h"
#include "opts.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-members.h"
#include "super-io.h"
-void bch2_dev_missing(struct bch_fs *c, unsigned dev)
+int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
+{
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev);
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
+
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_allocations, 0);
+
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
{
if (dev != BCH_SB_MEMBER_INVALID)
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
}
-void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
+void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
{
- bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
+ bch2_fs_inconsistent(ca->fs,
+ "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
+ bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
}
#define x(t, n, ...) [n] = #t,
@@ -79,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c)
mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
if (!mi)
- return -BCH_ERR_ENOSPC_sb_members_v2;
+ return bch_err_throw(c, ENOSPC_sb_members_v2);
for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
@@ -117,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
struct bch_sb_field_members_v1 *mi1;
struct bch_sb_field_members_v2 *mi2;
+ if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) {
+ bch2_sb_field_resize(disk_sb, members_v1, 0);
+ return 0;
+ }
+
mi1 = bch2_sb_field_resize(disk_sb, members_v1,
DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
disk_sb->sb->nr_devices, sizeof(u64)));
@@ -168,6 +195,12 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
+ if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) &&
+ sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) {
+ prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
return 0;
}
@@ -189,17 +222,11 @@ static void member_to_text(struct printbuf *out,
printbuf_indent_add(out, 2);
prt_printf(out, "Label:\t");
- if (BCH_MEMBER_GROUP(&m)) {
- unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
-
- if (idx < disk_groups_nr(gi))
- prt_printf(out, "%s (%u)",
- gi->entries[idx].label, idx);
- else
- prt_printf(out, "(bad disk labels section)");
- } else {
+ if (BCH_MEMBER_GROUP(&m))
+ bch2_disk_path_to_text_sb(out, sb,
+ BCH_MEMBER_GROUP(&m) - 1);
+ else
prt_printf(out, "(none)");
- }
prt_newline(out);
prt_printf(out, "UUID:\t");
@@ -266,6 +293,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m));
prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+ prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m));
printbuf_indent_sub(out, 2);
}
@@ -297,9 +325,17 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
- for (i = 0; i < sb->nr_devices; i++)
+ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
+ prt_printf(out, "field ends before start of entries");
+ return;
+ }
+
+ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]);
+ if (nr != sb->nr_devices)
+ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
+
+ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
member_to_text(out, members_v1_get(mi, i), gi, sb, i);
}
@@ -313,9 +349,27 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
{
struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
- unsigned i;
- for (i = 0; i < sb->nr_devices; i++)
+ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) {
+ prt_printf(out, "field ends before start of entries");
+ return;
+ }
+
+ if (!le16_to_cpu(mi->member_bytes)) {
+ prt_printf(out, "member_bytes 0");
+ return;
+ }
+
+ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes);
+ if (nr != sb->nr_devices)
+ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices);
+
+ /*
+ * We call to_text() on superblock sections that haven't passed
+ * validate, so we can't trust sb->nr_devices.
+ */
+
+ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++)
member_to_text(out, members_v2_get(mi, i), gi, sb, i);
}
@@ -350,14 +404,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- rcu_read_lock();
+ guard(rcu)();
for_each_member_device_rcu(c, ca, NULL) {
struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
- rcu_read_unlock();
}
void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
@@ -415,20 +468,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
{
- bool ret = true;
- rcu_read_lock();
+ guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
- if (!ca)
- continue;
-
- if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) {
- ret = false;
- break;
- }
+ if (ca &&
+ !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c)))
+ return false;
}
- rcu_read_unlock();
- return ret;
+ return true;
}
static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
@@ -491,6 +538,7 @@ int bch2_sb_member_alloc(struct bch_fs *c)
unsigned u64s;
int best = -1;
u64 best_last_mount = 0;
+ unsigned nr_deleted = 0;
if (dev_idx < BCH_SB_MEMBERS_MAX)
goto have_slot;
@@ -501,7 +549,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
continue;
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
- if (bch2_member_alive(&m))
+
+ nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
+
+ if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
continue;
u64 last_mount = le64_to_cpu(m.last_mount);
@@ -515,6 +566,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
goto have_slot;
}
+ if (nr_deleted)
+ bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
+ nr_deleted);
+
return -BCH_ERR_ENOSPC_sb_members;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
@@ -530,3 +585,22 @@ have_slot:
c->disk_sb.sb->nr_devices = nr_devices;
return dev_idx;
}
+
+void bch2_sb_members_clean_deleted(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ bool write_sb = false;
+
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
+
+ if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
+ memset(&m->uuid, 0, sizeof(m->uuid));
+ write_sb = true;
+ }
+ }
+
+ if (write_sb)
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 762083b564ee..8d8a8a857648 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -4,6 +4,7 @@
#include "darray.h"
#include "bkey_types.h"
+#include "enumerated_ref.h"
extern char * const bch2_member_error_strs[];
@@ -20,10 +21,19 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
- return !percpu_ref_is_zero(&ca->io_ref);
+ return !enumerated_ref_is_zero(&ca->io_ref[READ]);
}
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned);
+
+static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev)
+{
+ guard(rcu)();
+ struct bch_dev *ca = bch2_dev_rcu(c, dev);
+ return ca && bch2_dev_is_online(ca);
+}
+
+static inline bool bch2_dev_is_healthy(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_failed;
@@ -92,6 +102,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *
for (struct bch_dev *_ca = NULL; \
(_ca = __bch2_next_dev((_c), _ca, (_mask)));)
+#define for_each_online_member_rcu(_c, _ca) \
+ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs)
+
+#define for_each_rw_member_rcu(_c, _ca) \
+ for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free])
+
static inline void bch2_dev_get(struct bch_dev *ca)
{
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -123,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca)
static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
- rcu_read_lock();
+ guard(rcu)();
bch2_dev_put(ca);
if ((ca = __bch2_next_dev(c, ca, NULL)))
bch2_dev_get(ca);
- rcu_read_unlock();
-
return ca;
}
@@ -144,33 +158,33 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
struct bch_dev *ca,
- unsigned state_mask)
+ unsigned state_mask,
+ int rw, unsigned ref_idx)
{
- rcu_read_lock();
+ guard(rcu)();
if (ca)
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
- !percpu_ref_tryget(&ca->io_ref)))
+ !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)))
;
- rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(_c, _ca, state_mask) \
+#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \
for (struct bch_dev *_ca = NULL; \
- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));)
-#define for_each_online_member(c, ca) \
- __for_each_online_member(c, ca, ~0)
+#define for_each_online_member(c, ca, ref_idx) \
+ __for_each_online_member(c, ca, ~0, READ, ref_idx)
-#define for_each_rw_member(c, ca) \
- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
+#define for_each_rw_member(c, ca, ref_idx) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx)
-#define for_each_readable_member(c, ca) \
- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
+#define for_each_readable_member(c, ca, ref_idx) \
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx)
static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev)
{
@@ -205,23 +219,24 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de
: NULL;
}
-void bch2_dev_missing(struct bch_fs *, unsigned);
+int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned);
+
+void bch2_dev_missing_atomic(struct bch_fs *, unsigned);
static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (unlikely(!ca))
- bch2_dev_missing(c, dev);
+ bch2_dev_missing_atomic(c, dev);
return ca;
}
static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev)
{
- rcu_read_lock();
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
if (ca)
bch2_dev_get(ca);
- rcu_read_unlock();
return ca;
}
@@ -229,27 +244,30 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev);
if (unlikely(!ca))
- bch2_dev_missing(c, dev);
+ bch2_dev_missing_atomic(c, dev);
return ca;
}
static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
- if (ca && !bucket_valid(ca, bucket.offset)) {
+ if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
bch2_dev_put(ca);
ca = NULL;
}
return ca;
}
-void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
+void bch2_dev_bucket_missing(struct bch_dev *, u64);
static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
{
- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
- if (!ca)
- bch2_dev_bucket_missing(c, bucket);
+ struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
+ if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
+ bch2_dev_bucket_missing(ca, bucket.offset);
+ bch2_dev_put(ca);
+ ca = NULL;
+ }
return ca;
}
@@ -269,41 +287,31 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev
return bch2_dev_tryget(c, dev_idx);
}
-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw)
+static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev,
+ int rw, unsigned ref_idx)
{
- rcu_read_lock();
+ might_sleep();
+
+ guard(rcu)();
struct bch_dev *ca = bch2_dev_rcu(c, dev);
- if (ca && !percpu_ref_tryget(&ca->io_ref))
- ca = NULL;
- rcu_read_unlock();
+ if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))
+ return NULL;
- if (ca &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)))
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
return ca;
- if (ca)
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[rw], ref_idx);
return NULL;
}
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
- struct bch_devs_mask devs;
-
- memset(&devs, 0, sizeof(devs));
- for_each_online_member(c, ca)
- __set_bit(ca->dev_idx, devs.d);
- return devs;
-}
-
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
static inline bool bch2_member_alive(struct bch_member *m)
{
- return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
+ !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
}
static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
@@ -333,6 +341,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
? BCH_MEMBER_DURABILITY(mi) - 1
: 1,
.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+ .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi),
.valid = bch2_member_alive(mi),
.btree_bitmap_shift = mi->btree_bitmap_shift,
.btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
@@ -363,5 +372,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
int bch2_sb_member_alloc(struct bch_fs *);
+void bch2_sb_members_clean_deleted(struct bch_fs *);
#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index 2adf1221a440..fb72ad730518 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -13,6 +13,10 @@
*/
#define BCH_SB_MEMBER_INVALID 255
+#define BCH_SB_MEMBER_DELETED_UUID \
+ UUID_INIT(0xffffffff, 0xffff, 0xffff, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
+
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
@@ -79,6 +83,7 @@ struct bch_member {
#define BCH_MEMBER_V1_BYTES 56
+LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16)
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
@@ -87,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
struct bch_member, flags, 30, 31)
+LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT,
+ struct bch_member, flags, 31, 32)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h
index c0eda888fe39..d6443e186872 100644
--- a/fs/bcachefs/sb-members_types.h
+++ b/fs/bcachefs/sb-members_types.h
@@ -13,6 +13,7 @@ struct bch_member_cpu {
u8 data_allowed;
u8 durability;
u8 freespace_initialized;
+ u8 resize_on_mount;
u8 valid;
u8 btree_bitmap_shift;
u64 btree_allocated_bitmap;
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 7e7c66a1e1a6..538c324f4765 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock)
* acquiring the lock and setting the owner field. If we're an RT task
* that will live-lock because we won't let the owner complete.
*/
- rcu_read_lock();
+ guard(rcu)();
struct task_struct *owner = READ_ONCE(lock->owner);
- bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
- rcu_read_unlock();
-
- return ret;
+ return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
}
static inline bool six_optimistic_spin(struct six_lock *lock,
@@ -850,7 +847,8 @@ void six_lock_exit(struct six_lock *lock)
EXPORT_SYMBOL_GPL(six_lock_exit);
void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags)
+ struct lock_class_key *key, enum six_lock_init_flags flags,
+ gfp_t gfp)
{
atomic_set(&lock->state, 0);
raw_spin_lock_init(&lock->wait_lock);
@@ -873,7 +871,7 @@ void __six_lock_init(struct six_lock *lock, const char *name,
* failure if they wish by checking lock->readers, but generally
* will not want to treat it as an error.
*/
- lock->readers = alloc_percpu(unsigned);
+ lock->readers = alloc_percpu_gfp(unsigned, gfp);
}
#endif
}
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
index c142e06b7a3a..59b851cf8bac 100644
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
@@ -164,18 +164,19 @@ enum six_lock_init_flags {
};
void __six_lock_init(struct six_lock *lock, const char *name,
- struct lock_class_key *key, enum six_lock_init_flags flags);
+ struct lock_class_key *key, enum six_lock_init_flags flags,
+ gfp_t gfp);
/**
* six_lock_init - initialize a six lock
* @lock: lock to initialize
* @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU
*/
-#define six_lock_init(lock, flags) \
+#define six_lock_init(lock, flags, gfp) \
do { \
static struct lock_class_key __key; \
\
- __six_lock_init((lock), #lock, &__key, flags); \
+ __six_lock_init((lock), #lock, &__key, flags, gfp); \
} while (0)
/**
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index c54091a28909..4c43d2a2c1f5 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -1,11 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "bbpos.h"
#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "buckets.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
@@ -52,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
BTREE_ITER_with_updates, snapshot_tree, s);
if (bch2_err_matches(ret, ENOENT))
- ret = -BCH_ERR_ENOENT_snapshot_tree;
+ ret = bch_err_throw(trans->c, ENOENT_snapshot_tree);
return ret;
}
@@ -65,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans)
struct bkey_i_snapshot_tree *s_t;
if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_snapshot_tree;
+ ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree);
if (ret)
return ERR_PTR(ret);
@@ -103,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id,
static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
{
- rcu_read_lock();
- bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
- rcu_read_unlock();
-
- return ret;
+ guard(rcu)();
+ return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
}
static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
@@ -136,27 +135,25 @@ static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
- bool ret;
+#ifdef CONFIG_BCACHEFS_DEBUG
+ u32 orig_id = id;
+#endif
- rcu_read_lock();
+ guard(rcu)();
struct snapshot_table *t = rcu_dereference(c->snapshots);
- if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
- ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
- goto out;
- }
+ if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots))
+ return __bch2_snapshot_is_ancestor_early(t, id, ancestor);
- while (id && id < ancestor - IS_ANCESTOR_BITMAP)
- id = get_ancestor_below(t, id, ancestor);
+ if (likely(ancestor >= IS_ANCESTOR_BITMAP))
+ while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+ id = get_ancestor_below(t, id, ancestor);
- ret = id && id < ancestor
+ bool ret = id && id < ancestor
? test_ancestor_bitmap(t, id, ancestor)
: id == ancestor;
- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
-out:
- rcu_read_unlock();
-
+ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor));
return ret;
}
@@ -208,9 +205,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
{
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
- BCH_SNAPSHOT_SUBVOL(s.v),
- BCH_SNAPSHOT_DELETED(s.v),
+ if (BCH_SNAPSHOT_SUBVOL(s.v))
+ prt_str(out, "subvol ");
+ if (BCH_SNAPSHOT_WILL_DELETE(s.v))
+ prt_str(out, "will_delete ");
+ if (BCH_SNAPSHOT_DELETED(s.v))
+ prt_str(out, "deleted ");
+
+ prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u",
le32_to_cpu(s.v->parent),
le32_to_cpu(s.v->children[0]),
le32_to_cpu(s.v->children[1]),
@@ -280,6 +282,16 @@ fsck_err:
return ret;
}
+static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id)
+{
+ mutex_lock(&c->snapshot_table_lock);
+ int ret = snapshot_t_mut(c, id)
+ ? 0
+ : bch_err_throw(c, ENOMEM_mark_snapshot);
+ mutex_unlock(&c->snapshot_table_lock);
+ return ret;
+}
+
static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
@@ -294,14 +306,16 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
t = snapshot_t_mut(c, id);
if (!t) {
- ret = -BCH_ERR_ENOMEM_mark_snapshot;
+ ret = bch_err_throw(c, ENOMEM_mark_snapshot);
goto err;
}
if (new.k->type == KEY_TYPE_snapshot) {
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
- t->live = true;
+ t->state = !BCH_SNAPSHOT_DELETED(s.v)
+ ? SNAPSHOT_ID_live
+ : SNAPSHOT_ID_deleted;
t->parent = le32_to_cpu(s.v->parent);
t->children[0] = le32_to_cpu(s.v->children[0]);
t->children[1] = le32_to_cpu(s.v->children[1]);
@@ -326,9 +340,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans,
parent - id - 1 < IS_ANCESTOR_BITMAP)
__set_bit(parent - id - 1, t->is_ancestor);
- if (BCH_SNAPSHOT_DELETED(s.v)) {
+ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) {
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+ if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
}
} else {
@@ -389,21 +403,29 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
return 0;
}
-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root,
+ snapshot_id_list *skip)
{
- u32 id = snapshot_root;
- u32 subvol = 0, s;
-
- rcu_read_lock();
- while (id) {
- s = snapshot_t(c, id)->subvol;
-
- if (s && (!subvol || s < subvol))
- subvol = s;
-
+ guard(rcu)();
+ u32 id, subvol = 0, s;
+retry:
+ id = snapshot_root;
+ while (id && bch2_snapshot_exists(c, id)) {
+ if (!(skip && snapshot_list_has_id(skip, id))) {
+ s = snapshot_t(c, id)->subvol;
+
+ if (s && (!subvol || s < subvol))
+ subvol = s;
+ }
id = bch2_snapshot_tree_next(c, id);
+ if (id == snapshot_root)
+ break;
+ }
+
+ if (!subvol && skip) {
+ skip = NULL;
+ goto retry;
}
- rcu_read_unlock();
return subvol;
}
@@ -436,7 +458,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
if (!ret && !found) {
struct bkey_i_subvolume *u;
- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+ *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL);
u = bch2_bkey_get_mut_typed(trans, &iter,
BTREE_ID_subvolumes, POS(0, *subvol_id),
@@ -484,7 +506,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
root_id != bch2_snapshot_root(c, root_id) ||
st.k->p.offset != le32_to_cpu(s.tree),
trans, snapshot_tree_to_missing_snapshot,
- "snapshot tree points to missing/incorrect snapshot:\n %s",
+ "snapshot tree points to missing/incorrect snapshot:\n%s",
(bch2_bkey_val_to_text(&buf, c, st.s_c),
prt_newline(&buf),
ret
@@ -504,19 +526,19 @@ static int check_snapshot_tree(struct btree_trans *trans,
if (fsck_err_on(ret,
trans, snapshot_tree_to_missing_subvol,
- "snapshot tree points to missing subvolume:\n %s",
+ "snapshot tree points to missing subvolume:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(!bch2_snapshot_is_ancestor(c,
le32_to_cpu(subvol.snapshot),
root_id),
trans, snapshot_tree_to_wrong_subvol,
- "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s",
+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
trans, snapshot_tree_to_snapshot_subvol,
- "snapshot tree points to snapshot subvolume:\n %s",
+ "snapshot tree points to snapshot subvolume:\n%s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
struct bkey_i_snapshot_tree *u;
@@ -588,18 +610,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans,
u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
{
- const struct snapshot_t *s;
-
if (!id)
return 0;
- rcu_read_lock();
- s = snapshot_t(c, id);
- if (s->parent)
- id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ const struct snapshot_t *s = snapshot_t(c, id);
+ return s->parent
+ ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth))
+ : id;
}
static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
@@ -653,7 +671,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans,
u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u) ?:
bch2_snapshot_tree_create(trans, root_id,
- bch2_snapshot_tree_oldest_subvol(c, root_id),
+ bch2_snapshot_oldest_subvol(c, root_id, NULL),
&tree_id);
if (ret)
goto err;
@@ -698,6 +716,9 @@ static int check_snapshot(struct btree_trans *trans,
memset(&s, 0, sizeof(s));
memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
+ if (BCH_SNAPSHOT_DELETED(&s))
+ return 0;
+
id = le32_to_cpu(s.parent);
if (id) {
ret = bch2_snapshot_lookup(trans, id, &v);
@@ -735,7 +756,7 @@ static int check_snapshot(struct btree_trans *trans,
}
bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
- !BCH_SNAPSHOT_DELETED(&s);
+ !BCH_SNAPSHOT_WILL_DELETE(&s);
if (should_have_subvol) {
id = le32_to_cpu(s.subvol);
@@ -755,7 +776,7 @@ static int check_snapshot(struct btree_trans *trans,
} else {
if (fsck_err_on(s.subvol,
trans, snapshot_should_not_have_subvol,
- "snapshot should not point to subvol:\n %s",
+ "snapshot should not point to subvol:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@@ -773,7 +794,7 @@ static int check_snapshot(struct btree_trans *trans,
if (fsck_err_on(!ret,
trans, snapshot_to_bad_snapshot_tree,
- "snapshot points to missing/incorrect tree:\n %s",
+ "snapshot points to missing/incorrect tree:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
if (ret)
@@ -785,7 +806,7 @@ static int check_snapshot(struct btree_trans *trans,
if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
trans, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n %s",
+ "snapshot with incorrect depth field, should be %u:\n%s",
real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@@ -802,7 +823,7 @@ static int check_snapshot(struct btree_trans *trans,
if (fsck_err_on(!ret,
trans, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n %s",
+ "snapshot with bad skiplist field:\n%s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
@@ -842,9 +863,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- if (bch2_snapshot_exists(c, id))
- return 0;
-
/* Do we need to reconstruct the snapshot_tree entry as well? */
struct btree_iter iter;
struct bkey_s_c k;
@@ -853,7 +871,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
0, k, ret) {
- if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+ if (k.k->type == KEY_TYPE_snapshot_tree &&
+ le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
tree_id = k.k->p.offset;
break;
}
@@ -881,7 +900,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
0, k, ret) {
- if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+ if (k.k->type == KEY_TYPE_subvolume &&
+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
break;
@@ -889,9 +909,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
}
bch2_trans_iter_exit(trans, &iter);
- return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0);
+ return bch2_snapshot_table_make_room(c, id) ?:
+ bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0);
}
/* Figure out which snapshot nodes belong in the same tree: */
@@ -919,10 +938,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo
static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
{
- darray_for_each(*l, i)
- if (snapshot_list_has_id(r, *i))
- return true;
- return false;
+ return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL;
}
static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
@@ -989,12 +1005,12 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
snapshot_id_list_to_text(&buf, t);
darray_for_each(*t, id) {
- if (fsck_err_on(!bch2_snapshot_exists(c, *id),
+ if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty,
trans, snapshot_node_missing,
"snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) {
if (t->nr > 1) {
bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
goto err;
}
@@ -1014,27 +1030,92 @@ err:
return ret;
}
-int bch2_check_key_has_snapshot(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bkey_s_c k)
+int __bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
+ enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot);
- if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot),
- trans, bkey_in_missing_snapshot,
- "key in missing snapshot %s, delete?",
+ /* Snapshot was definitively deleted, this error is marked autofix */
+ if (fsck_err_on(state == SNAPSHOT_ID_deleted,
+ trans, bkey_in_deleted_snapshot,
+ "key in deleted snapshot %s, delete?",
(bch2_btree_id_to_text(&buf, iter->btree_id),
prt_char(&buf, ' '),
bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
- BTREE_UPDATE_internal_snapshot_node) ?: 1;
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+
+ if (state == SNAPSHOT_ID_empty) {
+ /*
+ * Snapshot missing: we should have caught this with btree_lost_data and
+ * kicked off reconstruct_snapshots, so if we end up here we have no
+ * idea what happened.
+ *
+ * Do not delete unless we know that subvolumes and snapshots
+ * are consistent:
+ *
+ * XXX:
+ *
+ * We could be smarter here, and instead of using the generic
+ * recovery pass ratelimiting, track if there have been any
+ * changes to the snapshots or inodes btrees since those passes
+ * last ran.
+ */
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret;
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret;
+
+ if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))
+ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
+
+ unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0);
+
+ if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot,
+ "key in missing snapshot %s, delete?",
+ (bch2_btree_id_to_text(&buf, iter->btree_id),
+ prt_char(&buf, ' '),
+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+ ret = bch2_btree_delete_at(trans, iter,
+ BTREE_UPDATE_internal_snapshot_node) ?: 1;
+ }
+ }
fsck_err:
printbuf_exit(&buf);
return ret;
}
+int __bch2_get_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (!bkey_eq(k.k->p, pos))
+ break;
+
+ if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) ||
+ snapshot_list_has_ancestor(c, s, k.k->p.snapshot))
+ continue;
+
+ ret = snapshot_list_add(c, s, k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ darray_exit(s);
+
+ return ret;
+}
+
/*
* Mark a snapshot as deleted, for future cleanup:
*/
@@ -1053,10 +1134,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
}
/* already deleted? */
- if (BCH_SNAPSHOT_DELETED(&s->v))
+ if (BCH_SNAPSHOT_WILL_DELETE(&s->v))
goto err;
- SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true);
SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
s->v.subvol = 0;
err:
@@ -1073,27 +1154,28 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
- struct btree_iter c_iter = (struct btree_iter) { NULL };
- struct btree_iter tree_iter = (struct btree_iter) { NULL };
- struct bkey_s_c_snapshot s;
+ struct btree_iter iter, p_iter = {};
+ struct btree_iter c_iter = {};
+ struct btree_iter tree_iter = {};
u32 parent_id, child_id;
unsigned i;
int ret = 0;
- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
- BTREE_ITER_intent, snapshot);
- ret = bkey_err(s);
+ struct bkey_i_snapshot *s =
+ bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+ BTREE_ITER_intent, snapshot);
+ ret = PTR_ERR_OR_ZERO(s);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
"missing snapshot %u", id);
if (ret)
goto err;
- BUG_ON(s.v->children[1]);
+ BUG_ON(BCH_SNAPSHOT_DELETED(&s->v));
+ BUG_ON(s->v.children[1]);
- parent_id = le32_to_cpu(s.v->parent);
- child_id = le32_to_cpu(s.v->children[0]);
+ parent_id = le32_to_cpu(s->v.parent);
+ child_id = le32_to_cpu(s->v.children[0]);
if (parent_id) {
struct bkey_i_snapshot *parent;
@@ -1151,24 +1233,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
*/
struct bkey_i_snapshot_tree *s_t;
- BUG_ON(s.v->children[1]);
+ BUG_ON(s->v.children[1]);
s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)),
0, snapshot_tree);
ret = PTR_ERR_OR_ZERO(s_t);
if (ret)
goto err;
- if (s.v->children[0]) {
- s_t->v.root_snapshot = s.v->children[0];
+ if (s->v.children[0]) {
+ s_t->v.root_snapshot = s->v.children[0];
} else {
s_t->k.type = KEY_TYPE_deleted;
set_bkey_val_u64s(&s_t->k, 0);
}
}
- ret = bch2_btree_delete_at(trans, &iter, 0);
+ if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) {
+ SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+ s->v.parent = 0;
+ s->v.children[0] = 0;
+ s->v.children[1] = 0;
+ s->v.subvol = 0;
+ s->v.tree = 0;
+ s->v.depth = 0;
+ s->v.skip[0] = 0;
+ s->v.skip[1] = 0;
+ s->v.skip[2] = 0;
+ } else {
+ s->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&s->k, 0);
+ }
err:
bch2_trans_iter_exit(trans, &tree_iter);
bch2_trans_iter_exit(trans, &p_iter);
@@ -1192,19 +1288,19 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
POS_MIN, BTREE_ITER_intent);
- k = bch2_btree_iter_peek(&iter);
+ k = bch2_btree_iter_peek(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
for (i = 0; i < nr_snapids; i++) {
- k = bch2_btree_iter_prev_slot(&iter);
+ k = bch2_btree_iter_prev_slot(trans, &iter);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k || !k.k->p.offset) {
- ret = -BCH_ERR_ENOSPC_snapshot_create;
+ ret = bch_err_throw(c, ENOSPC_snapshot_create);
goto err;
}
@@ -1338,18 +1434,10 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
-struct snapshot_interior_delete {
- u32 id;
- u32 live_child;
-};
-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
-
static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id)
{
- darray_for_each(*l, i)
- if (i->id == id)
- return i->live_child;
- return 0;
+ struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id);
+ return i ? i->live_child : 0;
}
static unsigned __live_child(struct snapshot_table *t, u32 id,
@@ -1377,28 +1465,32 @@ static unsigned __live_child(struct snapshot_table *t, u32 id,
return 0;
}
-static unsigned live_child(struct bch_fs *c, u32 id,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
+static unsigned live_child(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- u32 ret = __live_child(rcu_dereference(c->snapshots), id,
- delete_leaves, delete_interior);
- rcu_read_unlock();
- return ret;
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ guard(rcu)();
+ return __live_child(rcu_dereference(c->snapshots), id,
+ &d->delete_leaves, &d->delete_interior);
+}
+
+static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
+{
+ return snapshot_list_has_id(&d->delete_leaves, id) ||
+ interior_delete_has_id(&d->delete_interior, id) != 0;
}
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
- struct bkey_s_c k,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
+ struct bkey_s_c k)
{
- if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot))
+ struct snapshot_delete *d = &trans->c->snapshot_delete;
+
+ if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot))
return bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
- u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot);
+ u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot);
if (live_child) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
int ret = PTR_ERR_OR_ZERO(new);
@@ -1429,55 +1521,214 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans,
return 0;
}
+static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ u64 inum = iter->btree_id != BTREE_ID_inodes
+ ? iter->pos.inode
+ : iter->pos.offset;
+
+ if (*prev_inum == inum)
+ return false;
+
+ *prev_inum = inum;
+
+ bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
+ bch2_snapshot_tree(c, iter->pos.snapshot));
+ if (unlikely(ret)) {
+ struct bpos pos = iter->pos;
+ pos.snapshot = 0;
+ if (iter->btree_id != BTREE_ID_inodes)
+ pos.offset = U64_MAX;
+ bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos));
+ }
+
+ return ret;
+}
+
+static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
+ struct disk_reservation res = { 0 };
+ u64 prev_inum = 0;
+
+ d->pos.pos = POS_MIN;
+
+ if (!btree_type_has_snapshots(d->pos.btree))
+ continue;
+
+ int ret = for_each_btree_key_commit(trans, iter,
+ d->pos.btree, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ d->pos.pos = iter.pos;
+
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
+ continue;
+
+ delete_dead_snapshots_process_key(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
+ struct bpos start, struct bpos end)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+ struct disk_reservation res = { 0 };
+
+ d->pos.btree = btree;
+ d->pos.pos = POS_MIN;
+
+ int ret = for_each_btree_key_max_commit(trans, iter,
+ btree, start, end,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ d->pos.pos = iter.pos;
+ delete_dead_snapshots_process_key(trans, &iter, k);
+ }));
+
+ bch2_disk_reservation_put(c, &res);
+ return ret;
+}
+
+static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
+ struct disk_reservation res = { 0 };
+ u64 prev_inum = 0;
+ int ret = 0;
+
+ struct btree_iter iter;
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+
+ while (1) {
+ struct bkey_s_c k;
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
+ if (ret)
+ break;
+
+ if (!k.k)
+ break;
+
+ d->pos.btree = iter.btree_id;
+ d->pos.pos = iter.pos;
+
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
+ continue;
+
+ if (snapshot_id_dying(d, k.k->p.snapshot)) {
+ struct bpos start = POS(k.k->p.offset, 0);
+ struct bpos end = POS(k.k->p.offset, U64_MAX);
+
+ ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
+ delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
+ delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
+ if (ret)
+ break;
+
+ bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
+ } else {
+ bch2_btree_iter_advance(trans, &iter);
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (ret)
+ goto err;
+
+ prev_inum = 0;
+ ret = for_each_btree_key_commit(trans, iter,
+ BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ d->pos.btree = iter.btree_id;
+ d->pos.pos = iter.pos;
+
+ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
+ continue;
+
+ delete_dead_snapshots_process_key(trans, &iter, k);
+ }));
+err:
+ bch2_disk_reservation_put(c, &res);
+ return ret;
+}
+
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
* as deleted.
*/
-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k,
- snapshot_id_list *delete_leaves,
- interior_delete_list *delete_interior)
+static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k)
{
if (k.k->type != KEY_TYPE_snapshot)
return 0;
struct bch_fs *c = trans->c;
+ struct snapshot_delete *d = &c->snapshot_delete;
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
unsigned live_children = 0;
+ int ret = 0;
if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
+ if (BCH_SNAPSHOT_DELETED(s.v))
+ return 0;
+
+ mutex_lock(&d->progress_lock);
for (unsigned i = 0; i < 2; i++) {
u32 child = le32_to_cpu(s.v->children[i]);
live_children += child &&
- !snapshot_list_has_id(delete_leaves, child);
+ !snapshot_list_has_id(&d->delete_leaves, child);
}
+ u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
+
if (live_children == 0) {
- return snapshot_list_add(c, delete_leaves, s.k->p.offset);
+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
+ snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
} else if (live_children == 1) {
- struct snapshot_interior_delete d = {
+ struct snapshot_interior_delete n = {
.id = s.k->p.offset,
- .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior),
+ .live_child = live_child(c, s.k->p.offset),
};
- if (!d.live_child) {
- bch_err(c, "error finding live child of snapshot %u", d.id);
- return -EINVAL;
+ if (!n.live_child) {
+ bch_err(c, "error finding live child of snapshot %u", n.id);
+ ret = -EINVAL;
+ } else {
+ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
+ darray_push(&d->delete_interior, n);
}
-
- return darray_push(delete_interior, d);
- } else {
- return 0;
}
+ mutex_unlock(&d->progress_lock);
+
+ return ret;
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
interior_delete_list *skip)
{
- rcu_read_lock();
+ guard(rcu)();
while (interior_delete_has_id(skip, id))
id = __bch2_snapshot_parent(c, id);
@@ -1486,7 +1737,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
id = __bch2_snapshot_parent(c, id);
} while (interior_delete_has_id(skip, id));
}
- rcu_read_unlock();
return id;
}
@@ -1500,6 +1750,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct bkey_i_snapshot *s;
int ret;
+ if (!bch2_snapshot_exists(c, k.k->p.offset))
+ return 0;
+
if (k.k->type != KEY_TYPE_snapshot)
return 0;
@@ -1547,39 +1800,56 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
return bch2_trans_update(trans, iter, &s->k_i, 0);
}
-int bch2_delete_dead_snapshots(struct bch_fs *c)
+static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d)
{
- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
+ prt_printf(out, "deleting from trees");
+ darray_for_each(d->deleting_from_trees, i)
+ prt_printf(out, " %u", *i);
+
+ prt_printf(out, "deleting leaves");
+ darray_for_each(d->delete_leaves, i)
+ prt_printf(out, " %u", *i);
+ prt_newline(out);
+
+ prt_printf(out, "interior");
+ darray_for_each(d->delete_interior, i)
+ prt_printf(out, " %u->%u", i->id, i->live_child);
+ prt_newline(out);
+}
+
+int __bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ struct snapshot_delete *d = &c->snapshot_delete;
+ int ret = 0;
+
+ if (!mutex_trylock(&d->lock))
return 0;
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
+ goto out_unlock;
+
struct btree_trans *trans = bch2_trans_get(c);
- snapshot_id_list delete_leaves = {};
- interior_delete_list delete_interior = {};
- int ret = 0;
/*
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
+ d->running = true;
+ d->pos = BBPOS_MIN;
+
ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
- check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior));
+ check_should_delete_snapshot(trans, k));
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err;
- if (!delete_leaves.nr && !delete_interior.nr)
+ if (!d->delete_leaves.nr && !d->delete_interior.nr)
goto err;
{
struct printbuf buf = PRINTBUF;
- prt_printf(&buf, "deleting leaves");
- darray_for_each(delete_leaves, i)
- prt_printf(&buf, " %u", *i);
-
- prt_printf(&buf, " interior");
- darray_for_each(delete_interior, i)
- prt_printf(&buf, " %u->%u", i->id, i->live_child);
+ bch2_snapshot_delete_nodes_to_text(&buf, d);
ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf));
printbuf_exit(&buf);
@@ -1587,29 +1857,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
- struct disk_reservation res = { 0 };
-
- if (!btree_type_has_snapshots(btree))
- continue;
-
- ret = for_each_btree_key_commit(trans, iter,
- btree, POS_MIN,
- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
- &res, NULL, BCH_TRANS_COMMIT_no_enospc,
- delete_dead_snapshots_process_key(trans, &iter, k,
- &delete_leaves,
- &delete_interior));
-
- bch2_disk_reservation_put(c, &res);
-
- if (!bch2_err_matches(ret, EROFS))
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
- if (ret)
- goto err;
- }
+ ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
+ ? delete_dead_snapshot_keys_v2(trans)
+ : delete_dead_snapshot_keys_v1(trans);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (ret)
+ goto err;
- darray_for_each(delete_leaves, i) {
+ darray_for_each(d->delete_leaves, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
if (!bch2_err_matches(ret, EROFS))
@@ -1626,11 +1882,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior));
+ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior));
if (ret)
goto err;
- darray_for_each(delete_interior, i) {
+ darray_for_each(d->delete_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, i->id));
if (!bch2_err_matches(ret, EROFS))
@@ -1639,33 +1895,68 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
err:
- darray_exit(&delete_interior);
- darray_exit(&delete_leaves);
+ mutex_lock(&d->progress_lock);
+ darray_exit(&d->deleting_from_trees);
+ darray_exit(&d->delete_interior);
+ darray_exit(&d->delete_leaves);
+ d->running = false;
+ mutex_unlock(&d->progress_lock);
bch2_trans_put(trans);
+
+ bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots);
+out_unlock:
+ mutex_unlock(&d->lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_fn(c, ret);
return ret;
}
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+ if (!c->opts.auto_snapshot_deletion)
+ return 0;
+
+ return __bch2_delete_dead_snapshots(c);
+}
+
void bch2_delete_dead_snapshots_work(struct work_struct *work)
{
- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work);
set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name);
bch2_delete_dead_snapshots(c);
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
}
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots))
+ if (!c->opts.auto_snapshot_deletion)
+ return;
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots))
return;
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
- if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+ if (!queue_work(system_long_wq, &c->snapshot_delete.work))
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+ struct snapshot_delete *d = &c->snapshot_delete;
+
+ if (!d->running) {
+ prt_str(out, "(not running)");
+ return;
+ }
+
+ mutex_lock(&d->progress_lock);
+ bch2_snapshot_delete_nodes_to_text(out, d);
+
+ bch2_bbpos_to_text(out, d->pos);
+ mutex_unlock(&d->progress_lock);
}
int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
@@ -1706,7 +1997,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
return 0;
struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v) ||
+ if (BCH_SNAPSHOT_WILL_DELETE(snap.v) ||
interior_snapshot_needs_delete(snap))
set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags);
@@ -1735,10 +2026,6 @@ int bch2_snapshots_read(struct bch_fs *c)
BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
test_bit(BCH_FS_may_go_rw, &c->flags));
- if (bch2_err_matches(ret, EIO) ||
- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
- ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
-
return ret;
}
@@ -1746,3 +2033,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c)
{
kvfree(rcu_dereference_protected(c->snapshots, true));
}
+
+void bch2_fs_snapshots_init_early(struct bch_fs *c)
+{
+ INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work);
+ mutex_init(&c->snapshot_delete.lock);
+ mutex_init(&c->snapshot_delete.progress_lock);
+ mutex_init(&c->snapshots_unlinked_lock);
+}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 00373cf32e7b..6766bf673ed9 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *s = snapshot_t(c, id);
- id = s ? s->tree : 0;
- rcu_read_unlock();
-
- return id;
+ return s ? s->tree : 0;
}
static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- id = __bch2_snapshot_parent_early(c, id);
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ return __bch2_snapshot_parent_early(c, id);
}
static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -88,60 +82,53 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
- return id;
+ guard(rcu)();
+ return __bch2_snapshot_parent(c, id);
}
static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
{
- rcu_read_lock();
+ guard(rcu)();
while (n--)
id = __bch2_snapshot_parent(c, id);
- rcu_read_unlock();
-
return id;
}
+u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *);
u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
{
- u32 parent;
+ guard(rcu)();
- rcu_read_lock();
+ u32 parent;
while ((parent = __bch2_snapshot_parent(c, id)))
id = parent;
- rcu_read_unlock();
-
return id;
}
-static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id)
+static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
const struct snapshot_t *s = snapshot_t(c, id);
- return s ? s->live : 0;
+ return s ? s->state : SNAPSHOT_ID_empty;
}
-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
+static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
- bool ret = __bch2_snapshot_exists(c, id);
- rcu_read_unlock();
+ guard(rcu)();
+ return __bch2_snapshot_id_state(c, id);
+}
- return ret;
+static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id)
+{
+ return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live;
}
static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *s = snapshot_t(c, id);
- int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
- rcu_read_unlock();
-
- return ret;
+ return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
}
static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
@@ -154,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
{
- u32 depth;
-
- rcu_read_lock();
- depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
- rcu_read_unlock();
-
- return depth;
+ guard(rcu)();
+ return parent ? snapshot_t(c, parent)->depth + 1 : 0;
}
bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
@@ -174,20 +156,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{
- rcu_read_lock();
+ guard(rcu)();
const struct snapshot_t *t = snapshot_t(c, id);
- bool ret = t && (t->children[0]|t->children[1]) != 0;
- rcu_read_unlock();
-
- return ret;
+ return t && (t->children[0]|t->children[1]) != 0;
}
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- darray_for_each(*s, i)
- if (*i == id)
- return true;
- return false;
+ return darray_find(*s, id) != NULL;
}
static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
@@ -240,10 +216,38 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_reconstruct_snapshots(struct bch_fs *);
-int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+
+int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
+
+static inline int bch2_check_key_has_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot))
+ ? 0
+ : __bch2_check_key_has_snapshot(trans, iter, k);
+}
+
+int __bch2_get_snapshot_overwrites(struct btree_trans *,
+ enum btree_id, struct bpos,
+ snapshot_id_list *);
+
+/*
+ * Get a list of snapshot IDs that have overwritten a given key:
+ */
+static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos,
+ snapshot_id_list *s)
+{
+ darray_init(s);
+
+ return bch2_snapshot_has_children(trans->c, pos.snapshot)
+ ? __bch2_get_snapshot_overwrites(trans, btree, pos, s)
+ : 0;
+
+}
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
-void bch2_delete_dead_snapshots_work(struct work_struct *);
int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
@@ -258,7 +262,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
}
+int __bch2_delete_dead_snapshots(struct bch_fs *);
+int bch2_delete_dead_snapshots(struct bch_fs *);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+void bch2_delete_dead_snapshots_async(struct bch_fs *);
+void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *);
+
int bch2_snapshots_read(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
+void bch2_fs_snapshots_init_early(struct bch_fs *);
#endif /* _BCACHEFS_SNAPSHOT_H */
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
index aabcd3a74cd9..9bccae1f3590 100644
--- a/fs/bcachefs/snapshot_format.h
+++ b/fs/bcachefs/snapshot_format.h
@@ -15,10 +15,10 @@ struct bch_snapshot {
bch_le128 btime;
};
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
+LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1)
/* True if a subvolume points to this snapshot node: */
LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3)
/*
* Snapshot trees:
diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h
new file mode 100644
index 000000000000..0ab698f13e5c
--- /dev/null
+++ b/fs/bcachefs/snapshot_types.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_TYPES_H
+#define _BCACHEFS_SNAPSHOT_TYPES_H
+
+#include "bbpos_types.h"
+#include "darray.h"
+#include "subvolume_types.h"
+
+typedef DARRAY(u32) snapshot_id_list;
+
+#define IS_ANCESTOR_BITMAP 128
+
+struct snapshot_t {
+ enum snapshot_id_state {
+ SNAPSHOT_ID_empty,
+ SNAPSHOT_ID_live,
+ SNAPSHOT_ID_deleted,
+ } state;
+ u32 parent;
+ u32 skip[3];
+ u32 depth;
+ u32 children[2];
+ u32 subvol; /* Nonzero only if a subvolume points to this node: */
+ u32 tree;
+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
+};
+
+struct snapshot_table {
+ struct rcu_head rcu;
+ size_t nr;
+#ifndef RUST_BINDGEN
+ DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+#else
+ struct snapshot_t s[0];
+#endif
+};
+
+struct snapshot_interior_delete {
+ u32 id;
+ u32 live_child;
+};
+typedef DARRAY(struct snapshot_interior_delete) interior_delete_list;
+
+struct snapshot_delete {
+ struct mutex lock;
+ struct work_struct work;
+
+ struct mutex progress_lock;
+ snapshot_id_list deleting_from_trees;
+ snapshot_id_list delete_leaves;
+ interior_delete_list delete_interior;
+
+ bool running;
+ struct bbpos pos;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */
diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c
index d78451c2a0c6..3e9f59226bdf 100644
--- a/fs/bcachefs/str_hash.c
+++ b/fs/bcachefs/str_hash.c
@@ -31,14 +31,16 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir
}
}
-static noinline int fsck_rename_dirent(struct btree_trans *trans,
- struct snapshots_seen *s,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct bkey_s_c_dirent old)
+static int bch2_fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old,
+ bool *updated_before_k_pos)
{
+ struct bch_fs *c = trans->c;
struct qstr old_name = bch2_dirent_get_name(old);
- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64));
int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
@@ -47,28 +49,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans,
dirent_copy_target(new, old);
new->k.p = old.k->p;
+ char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20);
+ ret = PTR_ERR_OR_ZERO(renamed_buf);
+ if (ret)
+ return ret;
+
for (unsigned i = 0; i < 1000; i++) {
- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
- old_name.len, old_name.name, i);
- unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+ new->k.u64s = BKEY_U64s_MAX;
- if (u64s > U8_MAX)
- return -EINVAL;
+ struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf,
+ sprintf(renamed_buf, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i));
- new->k.u64s = u64s;
+ ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL);
+ if (ret)
+ return ret;
ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
(subvol_inum) { 0, old.k->p.inode },
old.k->p.snapshot, &new->k_i,
- BTREE_UPDATE_internal_snapshot_node);
- if (!bch2_err_matches(ret, EEXIST))
+ BTREE_UPDATE_internal_snapshot_node|
+ STR_HASH_must_create);
+ if (ret && !bch2_err_matches(ret, EEXIST))
+ break;
+ if (!ret) {
+ if (bpos_lt(new->k.p, old.k->p))
+ *updated_before_k_pos = true;
break;
+ }
}
- if (ret)
- return ret;
-
- return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+ ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
+ bch_err_fn(c, ret);
+ return ret;
}
static noinline int hash_pick_winner(struct btree_trans *trans,
@@ -101,17 +114,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans,
}
}
-static int repair_inode_hash_info(struct btree_trans *trans,
- struct bch_inode_unpacked *snapshot_root)
+/*
+ * str_hash lookups across snapshots break in wild ways if hash_info in
+ * different snapshot versions doesn't match - so if we find one mismatch, check
+ * them all
+ */
+int bch2_repair_inode_hash_info(struct btree_trans *trans,
+ struct bch_inode_unpacked *snapshot_root)
{
+ struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ bool need_commit = false;
int ret = 0;
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
- SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != snapshot_root->bi_inum)
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes,
+ POS(0, snapshot_root->bi_inum),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot)))
break;
if (!bkey_is_inode(k.k))
continue;
@@ -121,19 +142,72 @@ static int repair_inode_hash_info(struct btree_trans *trans,
if (ret)
break;
- if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed ||
- INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root),
- trans, inode_snapshot_mismatch,
- "inode hash info in different snapshots don't match")) {
+ if (inode.bi_hash_seed == snapshot_root->bi_hash_seed &&
+ INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) {
+#ifdef CONFIG_BCACHEFS_DEBUG
+ struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root);
+ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
+
+ BUG_ON(hash1.type != hash2.type ||
+ memcmp(&hash1.siphash_key,
+ &hash2.siphash_key,
+ sizeof(hash1.siphash_key)));
+#endif
+ continue;
+ }
+
+ printbuf_reset(&buf);
+ prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n",
+ snapshot_root->bi_inum,
+ inode.bi_snapshot,
+ snapshot_root->bi_snapshot);
+
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode));
+ prt_printf(&buf, " %llx\n", inode.bi_hash_seed);
+
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
+ prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed);
+
+ if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) {
inode.bi_hash_seed = snapshot_root->bi_hash_seed;
SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root));
- ret = __bch2_fsck_write_inode(trans, &inode) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- break;
+
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ break;
+ need_commit = true;
}
}
+
+ if (ret)
+ goto err;
+
+ if (!need_commit) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n",
+ snapshot_root->bi_inum);
+
+ prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot);
+ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root));
+ prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed);
+#if 0
+ prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot);
+ bch2_prt_str_hash_type(&buf, hash_info->type);
+ prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1);
+#endif
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ ret = bch_err_throw(c, fsck_repair_unimplemented);
+ goto err;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+err:
fsck_err:
+ printbuf_exit(&buf);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -145,46 +219,121 @@ fsck_err:
static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum,
struct bch_hash_info *hash_info)
{
+ struct bch_inode_unpacked snapshot_root;
+ int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root);
+ if (ret)
+ return ret;
+
+ struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root);
+ if (hash_info->type != hash_root.type ||
+ memcmp(&hash_info->siphash_key,
+ &hash_root.siphash_key,
+ sizeof(hash_root.siphash_key)))
+ ret = bch2_repair_inode_hash_info(trans, &snapshot_root);
+
+ return ret;
+}
+
+/* Put a str_hash key in its proper location, checking for duplicates */
+int bch2_str_hash_repair_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc *desc,
+ struct bch_hash_info *hash_info,
+ struct btree_iter *k_iter, struct bkey_s_c k,
+ struct btree_iter *dup_iter, struct bkey_s_c dup_k,
+ bool *updated_before_k_pos)
+{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct printbuf buf = PRINTBUF;
+ bool free_snapshots_seen = false;
int ret = 0;
- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX),
- BTREE_ITER_all_snapshots, k, ret) {
- if (k.k->p.offset != inum)
- break;
- if (bkey_is_inode(k.k))
- goto found;
+ if (!s) {
+ s = bch2_trans_kmalloc(trans, sizeof(*s));
+ ret = PTR_ERR_OR_ZERO(s);
+ if (ret)
+ goto out;
+
+ s->pos = k_iter->pos;
+ darray_init(&s->ids);
+
+ ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
+ if (ret)
+ goto out;
+
+ free_snapshots_seen = true;
}
- bch_err(c, "%s(): inum %llu not found", __func__, inum);
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err;
-found:;
- struct bch_inode_unpacked inode;
- ret = bch2_inode_unpack(k, &inode);
- if (ret)
- goto err;
- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode);
- if (hash_info->type != hash2.type ||
- memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) {
- ret = repair_inode_hash_info(trans, &inode);
- if (!ret) {
- bch_err(c, "inode hash info mismatch with root, but mismatch not found\n"
- "%u %llx %llx\n"
- "%u %llx %llx",
- hash_info->type,
- hash_info->siphash_key.k0,
- hash_info->siphash_key.k1,
- hash2.type,
- hash2.siphash_key.k0,
- hash2.siphash_key.k1);
- ret = -BCH_ERR_fsck_repair_unimplemented;
+ if (!dup_k.k) {
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ goto out;
+
+ dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
+ (subvol_inum) { 0, new->k.p.inode },
+ new->k.p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(dup_k);
+ if (ret)
+ goto out;
+ if (dup_k.k)
+ goto duplicate_entries;
+
+ if (bpos_lt(new->k.p, k.k->p))
+ *updated_before_k_pos = true;
+
+ ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
+ k_iter->pos, new->k.p) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_commit;
+ } else {
+duplicate_entries:
+ ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, dup_k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
+ break;
+ case 2:
+ ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
+ bkey_s_c_to_dirent(k),
+ updated_before_k_pos) ?:
+ bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
+ BTREE_ITER_with_updates);
+ goto out;
}
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_commit;
}
-err:
- bch2_trans_iter_exit(trans, &iter);
+out:
+fsck_err:
+ bch2_trans_iter_exit(trans, dup_iter);
+ printbuf_exit(&buf);
+ if (free_snapshots_seen)
+ darray_exit(&s->ids);
return ret;
}
@@ -192,10 +341,11 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+ struct btree_iter *k_iter, struct bkey_s_c hash_k,
+ bool *updated_before_k_pos)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter = { NULL };
+ struct btree_iter iter = {};
struct printbuf buf = PRINTBUF;
struct bkey_s_c k;
int ret = 0;
@@ -206,24 +356,31 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
for_each_btree_key_norestart(trans, iter, desc->btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
- BTREE_ITER_slots, k, ret) {
+ BTREE_ITER_slots|
+ BTREE_ITER_with_updates, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
if (k.k->type == desc->key_type &&
- !desc->cmp_bkey(k, hash_k))
- goto duplicate_entries;
+ !desc->cmp_bkey(k, hash_k)) {
+ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
+ hash_info) ?:
+ bch2_str_hash_repair_key(trans, s, desc, hash_info,
+ k_iter, hash_k,
+ &iter, k, updated_before_k_pos);
+ break;
+ }
- if (bkey_deleted(k.k)) {
- bch2_trans_iter_exit(trans, &iter);
+ if (bkey_deleted(k.k))
goto bad_hash;
- }
}
-out:
bch2_trans_iter_exit(trans, &iter);
+out:
+fsck_err:
printbuf_exit(&buf);
return ret;
bad_hash:
+ bch2_trans_iter_exit(trans, &iter);
/*
* Before doing any repair, check hash_info itself:
*/
@@ -232,64 +389,12 @@ bad_hash:
goto out;
if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
- bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info,
- (subvol_inum) { 0, hash_k.k->p.inode },
- hash_k.k->p.snapshot, new,
- STR_HASH_must_create|
- BTREE_ITER_with_updates|
- BTREE_UPDATE_internal_snapshot_node);
- ret = bkey_err(k);
- if (ret)
- goto out;
- if (k.k)
- goto duplicate_entries;
-
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
- -BCH_ERR_transaction_restart_nested;
- goto out;
- }
-fsck_err:
- goto out;
-duplicate_entries:
- ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k);
- if (ret < 0)
- goto out;
-
- if (!fsck_err(trans, hash_table_key_duplicate,
- "duplicate hash table keys%s:\n%s",
- ret != 2 ? "" : ", both point to valid inodes",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- prt_newline(&buf),
- bch2_bkey_val_to_text(&buf, c, k),
- buf.buf)))
- goto out;
-
- switch (ret) {
- case 0:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- break;
- case 1:
- ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0);
- break;
- case 2:
- ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
- bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
- goto out;
- }
-
- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
- -BCH_ERR_transaction_restart_nested;
+ "hash table key at wrong offset: should be at %llu\n%s",
+ hash,
+ (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
+ ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
+ k_iter, hash_k,
+ &iter, bkey_s_c_null,
+ updated_before_k_pos);
goto out;
}
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 55a4ac7bf220..8979ac2d7a3b 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -12,7 +12,6 @@
#include "super.h"
#include <linux/crc32c.h>
-#include <crypto/hash.h>
#include <crypto/sha2.h>
static inline enum bch_str_hash_type
@@ -33,7 +32,9 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
}
struct bch_hash_info {
+ u32 inum_snapshot;
u8 type;
+ struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@@ -44,20 +45,18 @@ struct bch_hash_info {
static inline struct bch_hash_info
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
- /* XXX ick */
struct bch_hash_info info = {
- .type = INODE_STR_HASH(bi),
- .siphash_key = { .k0 = bi->bi_hash_seed }
+ .inum_snapshot = bi->bi_snapshot,
+ .type = INODE_STR_HASH(bi),
+ .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
+ .siphash_key = { .k0 = bi->bi_hash_seed }
};
if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
- SHASH_DESC_ON_STACK(desc, c->sha256);
u8 digest[SHA256_DIGEST_SIZE];
- desc->tfm = c->sha256;
-
- crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
- sizeof(bi->bi_hash_seed), digest);
+ sha256((const u8 *)&bi->bi_hash_seed,
+ sizeof(bi->bi_hash_seed), digest);
memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
}
@@ -231,11 +230,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_trans_copy_iter(&iter, start);
+ bch2_trans_copy_iter(trans, &iter, start);
- bch2_btree_iter_advance(&iter);
+ bch2_btree_iter_advance(trans, &iter);
- for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) {
+ for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) {
if (k.k->type != desc.key_type &&
k.k->type != KEY_TYPE_hash_whiteout)
break;
@@ -260,6 +259,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
struct bkey_i *insert,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
@@ -280,14 +280,14 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
}
if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, iter);
+ bch2_trans_copy_iter(trans, &slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
}
if (!ret)
- ret = -BCH_ERR_ENOSPC_str_hash_create;
+ ret = bch_err_throw(c, ENOSPC_str_hash_create);
out:
bch2_trans_iter_exit(trans, &slot);
bch2_trans_iter_exit(trans, iter);
@@ -299,7 +299,7 @@ not_found:
bch2_trans_iter_exit(trans, &slot);
return k;
} else if (!found && (flags & STR_HASH_must_replace)) {
- ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
+ ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace);
} else {
if (!found && slot.path)
swap(*iter, slot);
@@ -327,7 +327,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
return ret;
if (k.k) {
bch2_trans_iter_exit(trans, &iter);
- return -BCH_ERR_EEXIST_str_hash_set;
+ return bch_err_throw(trans->c, EEXIST_str_hash_set);
}
return 0;
@@ -393,18 +393,30 @@ int bch2_hash_delete(struct btree_trans *trans,
return ret;
}
+int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *);
+
struct snapshots_seen;
+int bch2_str_hash_repair_key(struct btree_trans *,
+ struct snapshots_seen *,
+ const struct bch_hash_desc *,
+ struct bch_hash_info *,
+ struct btree_iter *, struct bkey_s_c,
+ struct btree_iter *, struct bkey_s_c,
+ bool *);
+
int __bch2_str_hash_check_key(struct btree_trans *,
struct snapshots_seen *,
const struct bch_hash_desc *,
struct bch_hash_info *,
- struct btree_iter *, struct bkey_s_c);
+ struct btree_iter *, struct bkey_s_c,
+ bool *);
static inline int bch2_str_hash_check_key(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c hash_k)
+ struct btree_iter *k_iter, struct bkey_s_c hash_k,
+ bool *updated_before_k_pos)
{
if (hash_k.k->type != desc->key_type)
return 0;
@@ -412,7 +424,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans,
if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset))
return 0;
- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k);
+ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k,
+ updated_before_k_pos);
}
#endif /* _BCACHEFS_STR_HASH_H */
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index e3d0475232e5..020587449123 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -3,9 +3,11 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
+#include "recovery_passes.h"
#include "snapshot.h"
#include "subvolume.h"
@@ -13,6 +15,22 @@
static int bch2_subvolume_delete(struct btree_trans *, u32);
+static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
+{
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "missing subvolume %u", subvolid);
+ bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
+
+ int ret = bch2_run_explicit_recovery_pass(c, &buf,
+ BCH_RECOVERY_PASS_check_inodes, 0);
+ if (print)
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static struct bpos subvolume_children_pos(struct bkey_s_c k)
{
if (k.k->type != KEY_TYPE_subvolume)
@@ -44,8 +62,8 @@ static int check_subvol(struct btree_trans *trans,
ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
if (bch2_err_matches(ret, ENOENT))
- bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
- k.k->p.offset, snapid);
+ return bch2_run_print_explicit_recovery_pass(c,
+ BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
if (ret)
return ret;
@@ -112,10 +130,20 @@ static int check_subvol(struct btree_trans *trans,
"subvolume %llu points to missing subvolume root %llu:%u",
k.k->p.offset, le64_to_cpu(subvol.v->inode),
le32_to_cpu(subvol.v->snapshot))) {
- ret = bch2_subvolume_delete(trans, iter->pos.offset);
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
- ret = ret ?: -BCH_ERR_transaction_restart_nested;
- goto err;
+ /*
+ * Recreate - any contents that are still disconnected
+ * will then get reattached under lost+found
+ */
+ bch2_inode_init_early(c, &inode);
+ bch2_inode_init_late(c, &inode, bch2_current_time(c),
+ 0, 0, S_IFDIR|0700, 0, NULL);
+ inode.bi_inum = le64_to_cpu(subvol.v->inode);
+ inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+ inode.bi_subvol = k.k->p.offset;
+ inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent);
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto err;
}
} else {
goto err;
@@ -123,13 +151,9 @@ static int check_subvol(struct btree_trans *trans,
if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
- u32 snapshot_tree;
- struct bch_snapshot_tree st;
-
- rcu_read_lock();
- snapshot_tree = snapshot_t(c, snapshot_root)->tree;
- rcu_read_unlock();
+ u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root);
+ struct bch_snapshot_tree st;
ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
@@ -241,6 +265,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
}
+
+ if (BCH_SUBVOLUME_RO(s.v))
+ prt_printf(out, " ro");
+ if (BCH_SUBVOLUME_SNAP(s.v))
+ prt_printf(out, " snapshot");
+ if (BCH_SUBVOLUME_UNLINKED(s.v))
+ prt_printf(out, " unlinked");
}
static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
@@ -275,7 +306,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
- struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter);
bch2_trans_iter_exit(trans, &iter);
return bkey_err(k) ?: k.k && k.k->p.inode == subvol
@@ -291,9 +322,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol,
int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol),
BTREE_ITER_cached|
BTREE_ITER_with_updates, subvolume, s);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) &&
- inconsistent_if_not_found,
- trans->c, "missing subvolume %u", subvol);
+ if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found)
+ ret = bch2_subvolume_missing(trans->c, subvol) ?: ret;
return ret;
}
@@ -343,8 +373,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
subvolume);
ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
if (likely(!ret))
*snapid = le32_to_cpu(subvol.v->snapshot);
@@ -417,8 +447,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
BTREE_ITER_cached|BTREE_ITER_intent,
subvolume);
int ret = bkey_err(subvol);
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
if (ret)
goto err;
@@ -428,7 +458,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
bch2_bkey_get_iter_typed(trans, &snapshot_iter,
BTREE_ID_snapshots, POS(0, snapid),
0, snapshot);
- ret = bkey_err(subvol);
+ ret = bkey_err(snapshot);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
"missing snapshot %u", snapid);
if (ret)
@@ -440,6 +470,11 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter,
BTREE_ID_snapshot_trees, POS(0, treeid),
0, snapshot_tree);
+ ret = bkey_err(snapshot_tree);
+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+ "missing snapshot tree %u", treeid);
+ if (ret)
+ goto err;
if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) {
struct bkey_i_snapshot_tree *snapshot_tree_mut =
@@ -464,22 +499,23 @@ err:
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
- return bch2_subvolumes_reparent(trans, subvolid) ?:
+ int ret = bch2_subvolumes_reparent(trans, subvolid) ?:
commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
+
+ bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols);
+ return ret;
}
static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs,
snapshot_wait_for_pagecache_and_delete_work);
- snapshot_id_list s;
- u32 *id;
int ret = 0;
while (!ret) {
mutex_lock(&c->snapshots_unlinked_lock);
- s = c->snapshots_unlinked;
+ snapshot_id_list s = c->snapshots_unlinked;
darray_init(&c->snapshots_unlinked);
mutex_unlock(&c->snapshots_unlinked_lock);
@@ -488,7 +524,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
bch2_evict_subvolume_inodes(c, &s);
- for (id = s.data; id < s.data + s.nr; id++) {
+ darray_for_each(s, id) {
ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
bch_err_msg(c, ret, "deleting subvolume %u", *id);
if (ret)
@@ -498,7 +534,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
darray_exit(&s);
}
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
}
struct subvolume_unlink_hook {
@@ -521,11 +557,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans
if (ret)
return ret;
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache))
return -EROFS;
if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache);
return 0;
}
@@ -549,13 +585,13 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
BTREE_ID_subvolumes, POS(0, subvolid),
BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(n);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
- "missing subvolume %u", subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret;
+ if (unlikely(ret))
return ret;
- }
SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+ n->v.fs_path_parent = 0;
bch2_trans_iter_exit(trans, &iter);
return ret;
}
@@ -568,7 +604,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
bool ro)
{
struct bch_fs *c = trans->c;
- struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+ struct btree_iter dst_iter, src_iter = {};
struct bkey_i_subvolume *new_subvol = NULL;
struct bkey_i_subvolume *src_subvol = NULL;
u32 parent = 0, new_nodes[2], snapshot_subvols[2];
@@ -577,7 +613,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
ret = bch2_bkey_get_empty_slot(trans, &dst_iter,
BTREE_ID_subvolumes, POS(0, U32_MAX));
if (ret == -BCH_ERR_ENOSPC_btree_slot)
- ret = -BCH_ERR_ENOSPC_subvolume_create;
+ ret = bch_err_throw(c, ENOSPC_subvolume_create);
if (ret)
return ret;
@@ -591,11 +627,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
BTREE_ID_subvolumes, POS(0, src_subvolid),
BTREE_ITER_cached, subvolume);
ret = PTR_ERR_OR_ZERO(src_subvol);
- if (unlikely(ret)) {
- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
- "subvolume %u not found", src_subvolid);
+ if (bch2_err_matches(ret, ENOENT))
+ ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret;
+ if (unlikely(ret))
goto err;
- }
parent = le32_to_cpu(src_subvol->v.snapshot);
}
@@ -684,8 +719,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
return ret;
if (!bkey_is_inode(k.k)) {
- bch_err(trans->c, "root inode not found");
- ret = -BCH_ERR_ENOENT_inode;
+ struct bch_fs *c = trans->c;
+ bch_err(c, "root inode not found");
+ ret = bch_err_throw(c, ENOENT_inode);
goto err;
}
@@ -709,11 +745,8 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
return ret;
}
-int bch2_fs_subvolumes_init(struct bch_fs *c)
+void bch2_fs_subvolumes_init_early(struct bch_fs *c)
{
- INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
bch2_subvolume_wait_for_pagecache_and_delete);
- mutex_init(&c->snapshots_unlinked_lock);
- return 0;
}
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index 910f6196700e..075f55e25c70 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -33,16 +33,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
int bch2_subvol_is_ro(struct bch_fs *, u32);
static inline struct bkey_s_c
-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end,
- u32 subvolid, unsigned flags)
+bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos end, u32 subvolid, unsigned flags)
{
u32 snapshot;
- int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot);
+ int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot);
if (ret)
return bkey_s_c_err(ret);
- bch2_btree_iter_set_snapshot(iter, snapshot);
- return bch2_btree_iter_peek_max_type(iter, end, flags);
+ bch2_btree_iter_set_snapshot(trans, iter, snapshot);
+ return bch2_btree_iter_peek_max_type(trans, iter, end, flags);
}
#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \
@@ -53,14 +53,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos
\
do { \
_ret3 = lockrestart_do(_trans, ({ \
- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \
+ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\
_end, _subvolid, (_flags)); \
if (!(_k).k) \
break; \
\
bkey_err(_k) ?: (_do); \
})); \
- } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
+ } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -77,15 +77,12 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos
_end, _subvolid, _flags, _k, _do); \
})
-int bch2_delete_dead_snapshots(struct bch_fs *);
-void bch2_delete_dead_snapshots_async(struct bch_fs *);
-
int bch2_subvolume_unlink(struct btree_trans *, u32);
int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
int bch2_initialize_subvolumes(struct bch_fs *);
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
-int bch2_fs_subvolumes_init(struct bch_fs *);
+void bch2_fs_subvolumes_init_early(struct bch_fs *);
#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 1549d6daf7af..9d634b906dcd 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -2,33 +2,6 @@
#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
#define _BCACHEFS_SUBVOLUME_TYPES_H
-#include "darray.h"
-
-typedef DARRAY(u32) snapshot_id_list;
-
-#define IS_ANCESTOR_BITMAP 128
-
-struct snapshot_t {
- bool live;
- u32 parent;
- u32 skip[3];
- u32 depth;
- u32 children[2];
- u32 subvol; /* Nonzero only if a subvolume points to this node: */
- u32 tree;
- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)];
-};
-
-struct snapshot_table {
- struct rcu_head rcu;
- size_t nr;
-#ifndef RUST_BINDGEN
- DECLARE_FLEX_ARRAY(struct snapshot_t, s);
-#else
- struct snapshot_t s[0];
-#endif
-};
-
typedef struct {
/* we can't have padding in this struct: */
u64 subvol;
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 8037ccbacf6a..6c2e1d647403 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -25,9 +25,6 @@
#include <linux/sort.h>
#include <linux/string_choices.h>
-static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
-};
-
struct bch2_metadata_version {
u16 version;
const char *name;
@@ -69,14 +66,39 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta
return v;
}
-void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
+int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version)
{
+ int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) &&
+ version <= c->sb.version_incompat_allowed)
+ ? 0
+ : -BCH_ERR_may_not_use_incompat_feature;
+
mutex_lock(&c->sb_lock);
- SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
- max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field);
- bch2_write_super(c);
+ if (!ret) {
+ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version));
+ bch2_write_super(c);
+ } else {
+ darray_for_each(c->incompat_versions_requested, i)
+ if (version == *i)
+ goto out;
+
+ darray_push(&c->incompat_versions_requested, version);
+ struct printbuf buf = PRINTBUF;
+ prt_str(&buf, "requested incompat feature ");
+ bch2_version_to_text(&buf, version);
+ prt_str(&buf, " currently not enabled, allowed up to ");
+ bch2_version_to_text(&buf, version);
+ prt_printf(&buf, "\n set version_upgrade=incompat to enable");
+
+ bch_notice(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ }
+
+out:
mutex_unlock(&c->sb_lock);
+
+ return ret;
}
const char * const bch2_sb_fields[] = {
@@ -239,11 +261,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
/* XXX: we're not checking that offline device have enough space */
- for_each_online_member(c, ca) {
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) {
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize);
return NULL;
}
}
@@ -360,39 +382,40 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out)
return 0;
}
-static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
- enum bch_validate_flags flags, struct printbuf *out)
+int bch2_sb_validate(struct bch_sb *sb, u64 read_offset,
+ enum bch_validate_flags flags, struct printbuf *out)
{
- struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
- u16 block_size;
int ret;
ret = bch2_sb_compatible(sb, out);
if (ret)
return ret;
- if (sb->features[1] ||
- (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
- prt_printf(out, "Filesystem has incompatible features");
+ u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR);
+ unsigned incompat_bit = 0;
+ if (incompat)
+ incompat_bit = __ffs64(incompat);
+ else if (sb->features[1])
+ incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1]));
+
+ if (incompat_bit) {
+ prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)",
+ incompat_bit,
+ bch2_sb_features[BCH_FEATURE_NR - 1],
+ BCH_FEATURE_NR - 1);
return -BCH_ERR_invalid_sb_features;
}
if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) ||
BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) {
- prt_printf(out, "Filesystem has incompatible version");
+ prt_str(out, "Filesystem has incompatible version ");
+ bch2_version_to_text(out, le16_to_cpu(sb->version));
+ prt_str(out, ", current version ");
+ bch2_version_to_text(out, bcachefs_metadata_version_current);
return -BCH_ERR_invalid_sb_features;
}
- block_size = le16_to_cpu(sb->block_size);
-
- if (block_size > PAGE_SECTORS) {
- prt_printf(out, "Block size too big (got %u, max %u)",
- block_size, PAGE_SECTORS);
- return -BCH_ERR_invalid_sb_block_size;
- }
-
if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) {
prt_printf(out, "Bad user UUID (got zeroes)");
return -BCH_ERR_invalid_sb_uuid;
@@ -403,6 +426,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
return -BCH_ERR_invalid_sb_uuid;
}
+ if (!(flags & BCH_VALIDATE_write) &&
+ le64_to_cpu(sb->offset) != read_offset) {
+ prt_printf(out, "Bad sb offset (got %llu, read from %llu)",
+ le64_to_cpu(sb->offset), read_offset);
+ return -BCH_ERR_invalid_sb_offset;
+ }
+
if (!sb->nr_devices ||
sb->nr_devices > BCH_SB_MEMBERS_MAX) {
prt_printf(out, "Bad number of member devices %u (max %u)",
@@ -438,6 +468,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb));
}
+ if (sb->nr_devices > 1)
+ SET_BCH_SB_MULTI_DEVICE(sb, true);
+
if (!flags) {
/*
* Been seeing a bug where these are getting inexplicably
@@ -458,6 +491,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
+
+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
+ !BCH_SB_CSUM_ERR_RETRY_NR(sb))
+ SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
}
#ifdef __KERNEL__
@@ -468,8 +508,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) {
const struct bch_option *opt = bch2_opt_table + opt_id;
- if (opt->get_sb != BCH2_NO_SB_OPT) {
- u64 v = bch2_opt_from_sb(sb, opt_id);
+ if (opt->get_sb) {
+ u64 v = bch2_opt_from_sb(sb, opt_id, -1);
prt_printf(out, "Invalid option ");
ret = bch2_opt_validate(opt, v, out);
@@ -499,14 +539,17 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
}
}
+ struct bch_sb_field *mi =
+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?:
+ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1);
+
/* members must be validated first: */
- mi = bch2_sb_field_get(sb, members_v1);
if (!mi) {
prt_printf(out, "Invalid superblock: member info area missing");
return -BCH_ERR_invalid_sb_members_missing;
}
- ret = bch2_sb_field_validate(sb, &mi->field, flags, out);
+ ret = bch2_sb_field_validate(sb, mi, flags, out);
if (ret)
return ret;
@@ -575,11 +618,15 @@ static void bch2_sb_update(struct bch_fs *c)
c->sb.features = le64_to_cpu(src->features[0]);
c->sb.compat = le64_to_cpu(src->compat[0]);
+ c->sb.multi_device = BCH_SB_MULTI_DEVICE(src);
memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
if (ext) {
+ c->sb.recovery_passes_required =
+ bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
@@ -749,7 +796,7 @@ retry:
memset(sb, 0, sizeof(*sb));
sb->mode = BLK_OPEN_READ;
sb->have_bio = true;
- sb->holder = kmalloc(1, GFP_KERNEL);
+ sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL);
if (!sb->holder)
return -ENOMEM;
@@ -875,7 +922,7 @@ got_super:
sb->have_layout = true;
- ret = bch2_sb_validate(sb, 0, &err);
+ ret = bch2_sb_validate(sb->sb, offset, 0, &err);
if (ret) {
bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
path, err.buf);
@@ -912,19 +959,19 @@ static void write_super_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
+ bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status);
+
/* XXX: return errors directly */
- if (bch2_dev_io_err_on(bio->bi_status, ca,
- bio_data_dir(bio)
- ? BCH_MEMBER_ERROR_write
- : BCH_MEMBER_ERROR_read,
- "superblock %s error: %s",
+ if (bio->bi_status) {
+ bch_err_dev_ratelimited(ca, "superblock %s error: %s",
str_write_read(bio_data_dir(bio)),
- bch2_blk_status_to_str(bio->bi_status)))
+ bch2_blk_status_to_str(bio->bi_status));
ca->sb_write_error = 1;
+ }
closure_put(&ca->fs->sb_write);
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
}
static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
@@ -942,7 +989,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio));
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
closure_bio_submit(bio, &c->sb_write);
}
@@ -968,7 +1015,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
bio_sectors(bio));
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
closure_bio_submit(bio, &c->sb_write);
}
@@ -985,7 +1032,7 @@ int bch2_write_super(struct bch_fs *c)
trace_and_count(c, write_super, c, _RET_IP_);
- if (c->opts.very_degraded)
+ if (c->opts.degraded == BCH_DEGRADED_very)
degraded_flags |= BCH_FORCE_IF_LOST;
lockdep_assert_held(&c->sb_lock);
@@ -993,13 +1040,20 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
- for_each_online_member(c, ca) {
+ /*
+ * Note: we do writes to RO devices here, and we might want to change
+ * that in the future.
+ *
+ * For now, we expect to be able to call write_super() when we're not
+ * yet RW:
+ */
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) {
ret = darray_push(&online_devices, ca);
if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) {
- percpu_ref_put(&ca->io_ref);
+ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
goto out;
}
- percpu_ref_get(&ca->io_ref);
+ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super);
}
/* Make sure we're using the new magic numbers: */
@@ -1032,7 +1086,7 @@ int bch2_write_super(struct bch_fs *c)
darray_for_each(online_devices, ca) {
printbuf_reset(&err);
- ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err);
+ ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err);
if (ret) {
bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf);
goto out;
@@ -1058,7 +1112,8 @@ int bch2_write_super(struct bch_fs *c)
prt_str(&buf, ")");
bch2_fs_fatal_error(c, ": %s", buf.buf);
printbuf_exit(&buf);
- return -BCH_ERR_sb_not_downgraded;
+ ret = bch_err_throw(c, sb_not_downgraded);
+ goto out;
}
darray_for_each(online_devices, ca) {
@@ -1087,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c)
if (c->opts.errors != BCH_ON_ERROR_continue &&
c->opts.errors != BCH_ON_ERROR_fix_safe) {
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
bch2_fs_fatal_error(c, "%s", buf.buf);
} else {
bch_err(c, "%s", buf.buf);
@@ -1106,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c)
ca->disk_sb.seq);
bch2_fs_fatal_error(c, "%s", buf.buf);
printbuf_exit(&buf);
- ret = -BCH_ERR_erofs_sb_err;
+ ret = bch_err_throw(c, erofs_sb_err);
}
}
@@ -1160,12 +1215,12 @@ int bch2_write_super(struct bch_fs *c)
!can_mount_with_written), c,
": Unable to write superblock to sufficient devices (from %ps)",
(void *) _RET_IP_))
- ret = -1;
+ ret = bch_err_throw(c, erofs_sb_err);
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
darray_for_each(online_devices, ca)
- percpu_ref_put(&(*ca)->io_ref);
+ enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super);
darray_exit(&online_devices);
printbuf_exit(&err);
return ret;
@@ -1217,11 +1272,37 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat)
bch2_sb_field_resize(&c->disk_sb, downgrade, 0);
c->disk_sb.sb->version = cpu_to_le16(new_version);
- c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
- if (incompat)
+ if (incompat) {
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version));
+ }
+}
+
+void bch2_sb_upgrade_incompat(struct bch_fs *c)
+{
+ mutex_lock(&c->sb_lock);
+ if (c->sb.version == c->sb.version_incompat_allowed)
+ goto unlock;
+
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Now allowing incompatible features up to ");
+ bch2_version_to_text(&buf, c->sb.version);
+ prt_str(&buf, ", previously allowed up to ");
+ bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
+ prt_newline(&buf);
+
+ bch_notice(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+
+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
+ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
+ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
+ bch2_write_super(c);
+unlock:
+ mutex_unlock(&c->sb_lock);
}
static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f,
@@ -1451,8 +1532,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
for (id = 0; id < bch2_opts_nr; id++) {
const struct bch_option *opt = bch2_opt_table + id;
- if (opt->get_sb != BCH2_NO_SB_OPT) {
- u64 v = bch2_opt_from_sb(sb, id);
+ if (opt->get_sb) {
+ u64 v = bch2_opt_from_sb(sb, id, -1);
prt_printf(out, "%s:\t", opt->attr.name);
bch2_opt_to_text(out, NULL, sb, opt, v,
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index f1ab4f943720..a3b7a90f2533 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version);
enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version);
-void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
+int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version);
-static inline bool bch2_request_incompat_feature(struct bch_fs *c,
- enum bcachefs_metadata_version version)
+static inline int bch2_request_incompat_feature(struct bch_fs *c,
+ enum bcachefs_metadata_version version)
{
- if (unlikely(version > c->sb.version_incompat)) {
- if (version > c->sb.version_incompat_allowed)
- return false;
- bch2_set_version_incompat(c, version);
- }
- return true;
+ return likely(version <= c->sb.version_incompat)
+ ? 0
+ : bch2_set_version_incompat(c, version);
}
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
@@ -95,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
+int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *);
+
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
@@ -108,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
bool bch2_check_version_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned, bool);
+void bch2_sb_upgrade_incompat(struct bch_fs *);
void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index d97ea7bd1171..c46b1053a02c 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -10,6 +10,8 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
+#include "async_objs.h"
+#include "backpointers.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
@@ -28,6 +30,7 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
@@ -48,6 +51,7 @@
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
+#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
@@ -70,26 +74,37 @@
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>
-#include <crypto/hash.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");
-MODULE_SOFTDEP("pre: crc32c");
-MODULE_SOFTDEP("pre: crc64");
-MODULE_SOFTDEP("pre: sha256");
-MODULE_SOFTDEP("pre: chacha20");
-MODULE_SOFTDEP("pre: poly1305");
-MODULE_SOFTDEP("pre: xxhash");
-const char * const bch2_fs_flag_strs[] = {
+typedef DARRAY(struct bch_sb_handle) bch_sb_handles;
+
#define x(n) #n,
+const char * const bch2_fs_flag_strs[] = {
BCH_FS_FLAGS()
-#undef x
NULL
};
-void bch2_print_str(struct bch_fs *c, const char *str)
+const char * const bch2_write_refs[] = {
+ BCH_WRITE_REFS()
+ NULL
+};
+
+const char * const bch2_dev_read_refs[] = {
+ BCH_DEV_READ_REFS()
+ NULL
+};
+
+const char * const bch2_dev_write_refs[] = {
+ BCH_DEV_WRITE_REFS()
+ NULL
+};
+#undef x
+
+static void __bch2_print_str(struct bch_fs *c, const char *prefix,
+ const char *str)
{
#ifdef __KERNEL__
struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
@@ -102,6 +117,11 @@ void bch2_print_str(struct bch_fs *c, const char *str)
bch2_print_string_as_lines(KERN_ERR, str);
}
+void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
+{
+ __bch2_print_str(c, prefix, str);
+}
+
__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
@@ -188,27 +208,22 @@ static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
+static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
- struct bch_fs *c;
-
- mutex_lock(&bch_fs_list_lock);
- rcu_read_lock();
+ guard(mutex)(&bch_fs_list_lock);
+ guard(rcu)();
+ struct bch_fs *c;
list_for_each_entry(c, &bch_fs_list, list)
for_each_member_device_rcu(c, ca, NULL)
if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
- goto found;
+ return c;
}
- c = NULL;
-found:
- rcu_read_unlock();
- mutex_unlock(&bch_fs_list_lock);
-
- return c;
+ return NULL;
}
static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
@@ -297,19 +312,19 @@ static void __bch2_fs_read_only(struct bch_fs *c)
/*
* After stopping journal:
*/
- for_each_member_device(c, ca)
+ for_each_member_device(c, ca) {
+ bch2_dev_io_ref_stop(ca, WRITE);
bch2_dev_allocator_remove(c, ca);
+ }
}
-#ifndef BCH_WRITE_REF_DEBUG
-static void bch2_writes_disabled(struct percpu_ref *writes)
+static void bch2_writes_disabled(struct enumerated_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
}
-#endif
void bch2_fs_read_only(struct bch_fs *c)
{
@@ -327,12 +342,7 @@ void bch2_fs_read_only(struct bch_fs *c)
* writes will return -EROFS:
*/
set_bit(BCH_FS_going_ro, &c->flags);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_kill(&c->writes);
-#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
- bch2_write_ref_put(c, i);
-#endif
+ enumerated_ref_stop_async(&c->writes);
/*
* If we're not doing an emergency shutdown, we want to wait on
@@ -370,7 +380,7 @@ void bch2_fs_read_only(struct bch_fs *c)
!test_bit(BCH_FS_emergency_ro, &c->flags) &&
test_bit(BCH_FS_started, &c->flags) &&
test_bit(BCH_FS_clean_shutdown, &c->flags) &&
- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
+ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -381,6 +391,11 @@ void bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
} else {
+ /* Make sure error counts/counters are persisted */
+ mutex_lock(&c->sb_lock);
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
bch_verbose(c, "done going read-only, filesystem not clean");
}
}
@@ -411,30 +426,39 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
return ret;
}
-static int bch2_fs_read_write_late(struct bch_fs *c)
+static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
+ bool locked)
{
- int ret;
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
- /*
- * Data move operations can't run until after check_snapshots has
- * completed, and bch2_snapshot_is_ancestor() is available.
- *
- * Ideally we'd start copygc/rebalance earlier instead of waiting for
- * all of recovery/fsck to complete:
- */
- ret = bch2_copygc_start(c);
- if (ret) {
- bch_err(c, "error starting copygc thread");
- return ret;
- }
+ if (!locked)
+ bch2_journal_halt(&c->journal);
+ else
+ bch2_journal_halt_locked(&c->journal);
+ bch2_fs_read_only_async(c);
+ wake_up(&bch2_read_only_wait);
- ret = bch2_rebalance_start(c);
- if (ret) {
- bch_err(c, "error starting rebalance thread");
- return ret;
- }
+ if (ret)
+ prt_printf(out, "emergency read only at seq %llu\n",
+ journal_cur_seq(&c->journal));
- return 0;
+ return ret;
+}
+
+bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
+{
+ return __bch2_fs_emergency_read_only2(c, out, false);
+}
+
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
+{
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
+
+ bch2_journal_halt_locked(&c->journal);
+ bch2_fs_read_only_async(c);
+
+ wake_up(&bch2_read_only_wait);
+ return ret;
}
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
@@ -443,9 +467,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));
+ if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
+ return bch_err_throw(c, erofs_no_alloc_info);
+
if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
- return -BCH_ERR_erofs_unfixed_errors;
+ return bch_err_throw(c, erofs_unfixed_errors);
+ }
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
+ bch_err(c, "cannot go rw, filesystem is an unresized image file");
+ return bch_err_throw(c, erofs_filesystem_full);
}
if (test_bit(BCH_FS_rw, &c->flags))
@@ -453,49 +485,60 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch_info(c, "going read-write");
- ret = bch2_sb_members_v2_init(c);
+ ret = bch2_fs_init_rw(c);
if (ret)
goto err;
- ret = bch2_fs_mark_dirty(c);
+ ret = bch2_sb_members_v2_init(c);
if (ret)
goto err;
clear_bit(BCH_FS_clean_shutdown, &c->flags);
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
+ bch2_dev_allocator_add(c, ca);
+ enumerated_ref_start(&ca->io_ref[WRITE]);
+ }
+
+ bch2_recalc_capacity(c);
+
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
+ spin_lock(&c->journal.lock);
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
+ bch2_journal_space_available(&c->journal);
+ spin_unlock(&c->journal.lock);
- for_each_rw_member(c, ca)
- bch2_dev_allocator_add(c, ca);
- bch2_recalc_capacity(c);
+ ret = bch2_fs_mark_dirty(c);
+ if (ret)
+ goto err;
+
+ ret = bch2_journal_reclaim_start(&c->journal);
+ if (ret)
+ goto err;
set_bit(BCH_FS_rw, &c->flags);
set_bit(BCH_FS_was_rw, &c->flags);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_reinit(&c->writes);
-#else
- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
- BUG_ON(atomic_long_read(&c->writes[i]));
- atomic_long_inc(&c->writes[i]);
- }
-#endif
+ enumerated_ref_start(&c->writes);
- ret = bch2_journal_reclaim_start(&c->journal);
- if (ret)
+ ret = bch2_copygc_start(c);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting copygc thread");
goto err;
+ }
- if (!early) {
- ret = bch2_fs_read_write_late(c);
- if (ret)
- goto err;
+ ret = bch2_rebalance_start(c);
+ if (ret) {
+ bch_err_msg(c, ret, "error starting rebalance thread");
+ goto err;
}
bch2_do_discards(c);
@@ -515,19 +558,24 @@ int bch2_fs_read_write(struct bch_fs *c)
{
if (c->opts.recovery_pass_last &&
c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
- return -BCH_ERR_erofs_norecovery;
+ return bch_err_throw(c, erofs_norecovery);
if (c->opts.nochanges)
- return -BCH_ERR_erofs_nochanges;
+ return bch_err_throw(c, erofs_nochanges);
+
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
+ return bch_err_throw(c, erofs_no_alloc_info);
return __bch2_fs_read_write(c, false);
}
int bch2_fs_read_write_early(struct bch_fs *c)
{
- lockdep_assert_held(&c->state_lock);
+ down_write(&c->state_lock);
+ int ret = __bch2_fs_read_write(c, true);
+ up_write(&c->state_lock);
- return __bch2_fs_read_write(c, true);
+ return ret;
}
/* Filesystem startup/shutdown: */
@@ -537,37 +585,44 @@ static void __bch2_fs_free(struct bch_fs *c)
for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
+#ifdef CONFIG_UNICODE
+ utf8_unload(c->cf_encoding);
+#endif
+
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
- bch2_fs_accounting_exit(c);
- bch2_fs_sb_errors_exit(c);
- bch2_fs_counters_exit(c);
+ bch2_free_fsck_errs(c);
+ bch2_fs_vfs_exit(c);
bch2_fs_snapshots_exit(c);
+ bch2_fs_sb_errors_exit(c);
+ bch2_fs_replicas_exit(c);
+ bch2_fs_rebalance_exit(c);
bch2_fs_quota_exit(c);
+ bch2_fs_nocow_locking_exit(c);
+ bch2_fs_journal_exit(&c->journal);
bch2_fs_fs_io_direct_exit(c);
bch2_fs_fs_io_buffered_exit(c);
bch2_fs_fsio_exit(c);
- bch2_fs_vfs_exit(c);
- bch2_fs_ec_exit(c);
- bch2_fs_encryption_exit(c);
- bch2_fs_nocow_locking_exit(c);
bch2_fs_io_write_exit(c);
bch2_fs_io_read_exit(c);
+ bch2_fs_encryption_exit(c);
+ bch2_fs_ec_exit(c);
+ bch2_fs_counters_exit(c);
+ bch2_fs_compress_exit(c);
+ bch2_io_clock_exit(&c->io_clock[WRITE]);
+ bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_buckets_waiting_for_journal_exit(c);
- bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_write_buffer_exit(c);
bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
- bch2_fs_btree_cache_exit(c);
bch2_fs_btree_iter_exit(c);
- bch2_fs_replicas_exit(c);
- bch2_fs_journal_exit(&c->journal);
- bch2_io_clock_exit(&c->io_clock[WRITE]);
- bch2_io_clock_exit(&c->io_clock[READ]);
- bch2_fs_compress_exit(c);
- bch2_fs_btree_gc_exit(c);
+ bch2_fs_btree_interior_update_exit(c);
+ bch2_fs_btree_cache_exit(c);
+ bch2_fs_accounting_exit(c);
+ bch2_fs_async_obj_exit(c);
bch2_journal_keys_put_initial(c);
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+
BUG_ON(atomic_read(&c->journal_keys.ref));
- bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
if (c->online_reserved) {
u64 v = percpu_u64_get(c->online_reserved);
@@ -575,6 +630,7 @@ static void __bch2_fs_free(struct bch_fs *c)
free_percpu(c->online_reserved);
}
+ darray_exit(&c->incompat_versions_requested);
darray_exit(&c->btree_roots_extra);
free_percpu(c->pcpu);
free_percpu(c->usage);
@@ -582,9 +638,7 @@ static void __bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
mempool_exit(&c->fill_iter);
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_exit(&c->writes);
-#endif
+ enumerated_ref_exit(&c->writes);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
@@ -596,8 +650,8 @@ static void __bch2_fs_free(struct bch_fs *c)
destroy_workqueue(c->btree_read_complete_wq);
if (c->copygc_wq)
destroy_workqueue(c->copygc_wq);
- if (c->btree_io_complete_wq)
- destroy_workqueue(c->btree_io_complete_wq);
+ if (c->btree_write_complete_wq)
+ destroy_workqueue(c->btree_write_complete_wq);
if (c->btree_update_wq)
destroy_workqueue(c->btree_update_wq);
@@ -623,6 +677,12 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_read_only(c);
up_write(&c->state_lock);
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+ if (ca)
+ bch2_dev_io_ref_stop(ca, READ);
+ }
+
for_each_member_device(c, ca)
bch2_dev_unlink(ca);
@@ -651,8 +711,6 @@ void __bch2_fs_stop(struct bch_fs *c)
void bch2_fs_free(struct bch_fs *c)
{
- unsigned i;
-
mutex_lock(&bch_fs_list_lock);
list_del(&c->list);
mutex_unlock(&bch_fs_list_lock);
@@ -660,11 +718,12 @@ void bch2_fs_free(struct bch_fs *c)
closure_sync(&c->cl);
closure_debug_destroy(&c->cl);
- for (i = 0; i < c->sb.nr_devices; i++) {
+ for (unsigned i = 0; i < c->sb.nr_devices; i++) {
struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
if (ca) {
EBUG_ON(atomic_long_read(&ca->ref) != 1);
+ bch2_dev_io_ref_stop(ca, READ);
bch2_free_super(&ca->disk_sb);
bch2_dev_free(ca);
}
@@ -687,9 +746,10 @@ static int bch2_fs_online(struct bch_fs *c)
lockdep_assert_held(&bch_fs_list_lock);
- if (__bch2_uuid_to_fs(c->sb.uuid)) {
+ if (c->sb.multi_device &&
+ __bch2_uuid_to_fs(c->sb.uuid)) {
bch_err(c, "filesystem UUID already open");
- return -EINVAL;
+ return bch_err_throw(c, filesystem_uuid_already_open);
}
ret = bch2_fs_chardev_init(c);
@@ -700,14 +760,16 @@ static int bch2_fs_online(struct bch_fs *c)
bch2_fs_debug_init(c);
- ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+ ret = (c->sb.multi_device
+ ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
+ : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
- bch2_opts_create_sysfs_files(&c->opts_dir);
+ bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
if (ret) {
bch_err(c, "error creating sysfs objects");
return ret;
@@ -731,7 +793,37 @@ err:
return ret;
}
-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
+int bch2_fs_init_rw(struct bch_fs *c)
+{
+ if (test_bit(BCH_FS_rw_init_done, &c->flags))
+ return 0;
+
+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
+ !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
+ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
+ WQ_FREEZABLE, 0)))
+ return bch_err_throw(c, ENOMEM_fs_other_alloc);
+
+ int ret = bch2_fs_btree_interior_update_init(c) ?:
+ bch2_fs_btree_write_buffer_init(c) ?:
+ bch2_fs_fs_io_buffered_init(c) ?:
+ bch2_fs_io_write_init(c) ?:
+ bch2_fs_journal_init(&c->journal);
+ if (ret)
+ return ret;
+
+ set_bit(BCH_FS_rw_init_done, &c->flags);
+ return 0;
+}
+
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
+ bch_sb_handles *sbs)
{
struct bch_fs *c;
struct printbuf name = PRINTBUF;
@@ -744,7 +836,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto out;
}
- c->stdio = (void *)(unsigned long) opts.stdio;
+ c->stdio = (void *)(unsigned long) opts->stdio;
__module_get(THIS_MODULE);
@@ -768,24 +860,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
refcount_set(&c->ro_ref, 1);
init_waitqueue_head(&c->ro_ref_wait);
- spin_lock_init(&c->recovery_pass_lock);
- sema_init(&c->online_fsck_mutex, 1);
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
- bch2_fs_copygc_init(c);
- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
- bch2_fs_btree_iter_init_early(c);
- bch2_fs_btree_interior_update_init_early(c);
- bch2_fs_journal_keys_init(c);
bch2_fs_allocator_background_init(c);
bch2_fs_allocator_foreground_init(c);
- bch2_fs_rebalance_init(c);
- bch2_fs_quota_init(c);
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
+ bch2_fs_btree_gc_init_early(c);
+ bch2_fs_btree_interior_update_init_early(c);
+ bch2_fs_btree_iter_init_early(c);
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
+ bch2_fs_btree_write_buffer_init_early(c);
+ bch2_fs_copygc_init(c);
bch2_fs_ec_init_early(c);
+ bch2_fs_journal_init_early(&c->journal);
+ bch2_fs_journal_keys_init(c);
bch2_fs_move_init(c);
+ bch2_fs_nocow_locking_init_early(c);
+ bch2_fs_quota_init(c);
+ bch2_fs_recovery_passes_init(c);
bch2_fs_sb_errors_init_early(c);
+ bch2_fs_snapshots_init_early(c);
+ bch2_fs_subvolumes_init_early(c);
INIT_LIST_HEAD(&c->list);
@@ -811,8 +908,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
- bch2_fs_btree_cache_init_early(&c->btree_cache);
-
mutex_init(&c->sectors_available_lock);
ret = percpu_init_rwsem(&c->mark_lock);
@@ -826,14 +921,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- pr_uuid(&name, c->sb.user_uuid.b);
- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
- if (ret)
- goto err;
-
- strscpy(c->name, name.buf, sizeof(c->name));
- printbuf_exit(&name);
-
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -848,7 +935,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
if (ret)
goto err;
- bch2_opts_apply(&c->opts, opts);
+ bch2_opts_apply(&c->opts, *opts);
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ c->opts.block_size > PAGE_SIZE) {
+ bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
+ ret = -EINVAL;
+ goto err;
+ }
c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
if (c->opts.inodes_use_key_cache)
@@ -864,26 +958,26 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto err;
}
+ if (c->sb.multi_device)
+ pr_uuid(&name, c->sb.user_uuid.b);
+ else
+ prt_bdevname(&name, sbs->data[0].bdev);
+
+ ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
+ if (ret)
+ goto err;
+
+ strscpy(c->name, name.buf, sizeof(c->name));
+ printbuf_exit(&name);
+
iter_size = sizeof(struct sort_iter) +
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
- if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
- !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
+ if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit",
- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
- WQ_FREEZABLE, 0)) ||
-#ifndef BCH_WRITE_REF_DEBUG
- percpu_ref_init(&c->writes, bch2_writes_disabled,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
-#endif
+ enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
+ bch2_writes_disabled) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_bio, 1,
max(offsetof(struct btree_read_bio, bio),
@@ -895,36 +989,62 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
- ret = -BCH_ERR_ENOMEM_fs_other_alloc;
+ ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
goto err;
}
- ret = bch2_fs_counters_init(c) ?:
- bch2_fs_sb_errors_init(c) ?:
- bch2_io_clock_init(&c->io_clock[READ]) ?:
- bch2_io_clock_init(&c->io_clock[WRITE]) ?:
- bch2_fs_journal_init(&c->journal) ?:
- bch2_fs_btree_iter_init(c) ?:
+ ret =
+ bch2_fs_async_obj_init(c) ?:
bch2_fs_btree_cache_init(c) ?:
+ bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
- bch2_fs_btree_interior_update_init(c) ?:
- bch2_fs_btree_gc_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
- bch2_fs_btree_write_buffer_init(c) ?:
- bch2_fs_subvolumes_init(c) ?:
- bch2_fs_io_read_init(c) ?:
- bch2_fs_io_write_init(c) ?:
- bch2_fs_nocow_locking_init(c) ?:
- bch2_fs_encryption_init(c) ?:
+ bch2_io_clock_init(&c->io_clock[READ]) ?:
+ bch2_io_clock_init(&c->io_clock[WRITE]) ?:
bch2_fs_compress_init(c) ?:
+ bch2_fs_counters_init(c) ?:
bch2_fs_ec_init(c) ?:
- bch2_fs_vfs_init(c) ?:
+ bch2_fs_encryption_init(c) ?:
bch2_fs_fsio_init(c) ?:
- bch2_fs_fs_io_buffered_init(c) ?:
- bch2_fs_fs_io_direct_init(c);
+ bch2_fs_fs_io_direct_init(c) ?:
+ bch2_fs_io_read_init(c) ?:
+ bch2_fs_rebalance_init(c) ?:
+ bch2_fs_sb_errors_init(c) ?:
+ bch2_fs_vfs_init(c);
if (ret)
goto err;
+ if (go_rw_in_recovery(c)) {
+ /*
+ * start workqueues/kworkers early - kthread creation checks for
+ * pending signals, which is _very_ annoying
+ */
+ ret = bch2_fs_init_rw(c);
+ if (ret)
+ goto err;
+ }
+
+#ifdef CONFIG_UNICODE
+ if (bch2_fs_casefold_enabled(c)) {
+ /* Default encoding until we can potentially have more as an option. */
+ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
+ if (IS_ERR(c->cf_encoding)) {
+ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+#else
+ if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
+ printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
+ ret = -EINVAL;
+ goto err;
+ }
+#endif
+
for (i = 0; i < c->sb.nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
@@ -958,18 +1078,13 @@ noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
enum bch_opt_id i;
- struct printbuf p = PRINTBUF;
- bool first = true;
+ CLASS(printbuf, p)();
+ bch2_log_msg_start(c, &p);
prt_str(&p, "starting version ");
bch2_version_to_text(&p, c->sb.version);
- if (c->opts.read_only) {
- prt_str(&p, " opts=");
- first = false;
- prt_printf(&p, "ro");
- }
-
+ bool first = true;
for (i = 0; i < bch2_opts_nr; i++) {
const struct bch_option *opt = &bch2_opt_table[i];
u64 v = bch2_opt_get_by_id(&c->opts, i);
@@ -985,45 +1100,113 @@ static void print_mount_opts(struct bch_fs *c)
bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
}
- bch_info(c, "%s", p.buf);
- printbuf_exit(&p);
+ if (c->sb.version_incompat_allowed != c->sb.version) {
+ prt_printf(&p, "\nallowing incompatible features above ");
+ bch2_version_to_text(&p, c->sb.version_incompat_allowed);
+ }
+
+ if (c->opts.verbose) {
+ prt_printf(&p, "\nfeatures: ");
+ prt_bitflags(&p, bch2_sb_features, c->sb.features);
+ }
+
+ if (c->sb.multi_device) {
+ prt_printf(&p, "\nwith devices");
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
+ prt_char(&p, ' ');
+ prt_str(&p, ca->name);
+ }
+ }
+
+ bch2_print_str(c, KERN_INFO, p.buf);
+}
+
+static bool bch2_fs_may_start(struct bch_fs *c)
+{
+ struct bch_dev *ca;
+ unsigned flags = 0;
+
+ switch (c->opts.degraded) {
+ case BCH_DEGRADED_very:
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
+ break;
+ case BCH_DEGRADED_yes:
+ flags |= BCH_FORCE_IF_DEGRADED;
+ break;
+ default:
+ mutex_lock(&c->sb_lock);
+ for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+ if (!bch2_member_exists(c->disk_sb.sb, i))
+ continue;
+
+ ca = bch2_dev_locked(c, i);
+
+ if (!bch2_dev_is_online(ca) &&
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
+ mutex_unlock(&c->sb_lock);
+ return false;
+ }
+ }
+ mutex_unlock(&c->sb_lock);
+ break;
+ }
+
+ return bch2_have_enough_devs(c, c->online_devs, flags, true);
}
int bch2_fs_start(struct bch_fs *c)
{
time64_t now = ktime_get_real_seconds();
- int ret;
+ int ret = 0;
print_mount_opts(c);
+ if (c->cf_encoding)
+ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
+ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
+ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
+
+ if (!bch2_fs_may_start(c))
+ return bch_err_throw(c, insufficient_devices_to_start);
+
down_write(&c->state_lock);
+ mutex_lock(&c->sb_lock);
BUG_ON(test_bit(BCH_FS_started, &c->flags));
- mutex_lock(&c->sb_lock);
+ if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
+ sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
+ mutex_unlock(&c->sb_lock);
+ up_write(&c->state_lock);
+ ret = bch_err_throw(c, ENOSPC_sb);
+ goto err;
+ }
ret = bch2_sb_members_v2_init(c);
if (ret) {
mutex_unlock(&c->sb_lock);
+ up_write(&c->state_lock);
goto err;
}
- for_each_online_member(c, ca)
- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+ cpu_to_le64(now);
- struct bch_sb_field_ext *ext =
- bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
+ /*
+ * Dno't write superblock yet: recovery might have to downgrade
+ */
mutex_unlock(&c->sb_lock);
- if (!ext) {
- bch_err(c, "insufficient space in superblock for sb_field_ext");
- ret = -BCH_ERR_ENOSPC_sb;
- goto err;
- }
-
- for_each_rw_member(c, ca)
- bch2_dev_allocator_add(c, ca);
+ scoped_guard(rcu)
+ for_each_online_member_rcu(c, ca)
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+ up_write(&c->state_lock);
c->recovery_task = current;
ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
@@ -1034,35 +1217,30 @@ int bch2_fs_start(struct bch_fs *c)
if (ret)
goto err;
- ret = bch2_opts_check_may_set(c);
+ ret = bch2_opts_hooks_pre_set(c);
if (ret)
goto err;
if (bch2_fs_init_fault("fs_start")) {
- bch_err(c, "fs_start fault injected");
- ret = -EINVAL;
+ ret = bch_err_throw(c, injected_fs_start);
goto err;
}
set_bit(BCH_FS_started, &c->flags);
+ wake_up(&c->ro_ref_wait);
- if (c->opts.read_only) {
+ down_write(&c->state_lock);
+ if (c->opts.read_only)
bch2_fs_read_only(c);
- } else {
- ret = !test_bit(BCH_FS_rw, &c->flags)
- ? bch2_fs_read_write(c)
- : bch2_fs_read_write_late(c);
- if (ret)
- goto err;
- }
+ else if (!test_bit(BCH_FS_rw, &c->flags))
+ ret = bch2_fs_read_write(c);
+ up_write(&c->state_lock);
- ret = 0;
err:
if (ret)
bch_err_msg(c, ret, "starting filesystem");
else
bch_verbose(c, "done starting filesystem");
- up_write(&c->state_lock);
return ret;
}
@@ -1071,11 +1249,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
if (le16_to_cpu(sb->block_size) != block_sectors(c))
- return -BCH_ERR_mismatched_block_size;
+ return bch_err_throw(c, mismatched_block_size);
if (le16_to_cpu(m.bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
- return -BCH_ERR_bucket_size_too_small;
+ return bch_err_throw(c, bucket_size_too_small);
return 0;
}
@@ -1171,6 +1349,18 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
/* Device startup/shutdown: */
+static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
+{
+ if (rw == READ)
+ clear_bit(ca->dev_idx, ca->fs->online_devs.d);
+
+ if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
+ enumerated_ref_stop(&ca->io_ref[rw],
+ rw == READ
+ ? bch2_dev_read_refs
+ : bch2_dev_write_refs);
+}
+
static void bch2_dev_release(struct kobject *kobj)
{
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
@@ -1180,6 +1370,9 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca)
{
+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
+ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
+
cancel_work_sync(&ca->io_error_work);
bch2_dev_unlink(ca);
@@ -1187,6 +1380,9 @@ static void bch2_dev_free(struct bch_dev *ca)
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
+ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);
+
bch2_free_super(&ca->disk_sb);
bch2_dev_allocator_background_exit(ca);
bch2_dev_journal_exit(ca);
@@ -1198,7 +1394,8 @@ static void bch2_dev_free(struct bch_dev *ca)
bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
- percpu_ref_exit(&ca->io_ref);
+ enumerated_ref_exit(&ca->io_ref[WRITE]);
+ enumerated_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
percpu_ref_exit(&ca->ref);
#endif
@@ -1210,14 +1407,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
lockdep_assert_held(&c->state_lock);
- if (percpu_ref_is_zero(&ca->io_ref))
+ if (enumerated_ref_is_zero(&ca->io_ref[READ]))
return;
__bch2_dev_read_only(c, ca);
- reinit_completion(&ca->io_ref_completion);
- percpu_ref_kill(&ca->io_ref);
- wait_for_completion(&ca->io_ref_completion);
+ bch2_dev_io_ref_stop(ca, READ);
bch2_dev_unlink(ca);
@@ -1234,13 +1429,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref)
}
#endif
-static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
-{
- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref);
-
- complete(&ca->io_ref_completion);
-}
-
static void bch2_dev_unlink(struct bch_dev *ca)
{
struct kobject *b;
@@ -1269,8 +1457,8 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
return 0;
if (!ca->kobj.state_in_sysfs) {
- ret = kobject_add(&ca->kobj, &c->kobj,
- "dev-%u", ca->dev_idx);
+ ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
+ bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
if (ret)
return ret;
}
@@ -1302,7 +1490,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
- init_completion(&ca->io_ref_completion);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
@@ -1326,10 +1513,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
atomic_long_set(&ca->ref, 1);
#endif
+ mutex_init(&ca->bucket_backpointer_mismatch.lock);
+ mutex_init(&ca->bucket_backpointer_empty.lock);
+
bch2_dev_allocator_background_init(ca);
- if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
- PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
+ if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
+ enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
!(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
bch2_dev_buckets_alloc(c, ca) ||
!(ca->io_done = alloc_percpu(*ca->io_done)))
@@ -1346,7 +1536,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
{
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
+ if (!ca->name[0])
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
@@ -1372,7 +1564,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
bch2_dev_attach(c, ca, dev_idx);
return 0;
err:
- return -BCH_ERR_ENOMEM_dev_alloc;
+ return bch_err_throw(c, ENOMEM_dev_alloc);
}
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
@@ -1382,28 +1574,41 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
sb->sb->dev_idx);
- return -BCH_ERR_device_already_online;
+ return bch_err_throw(ca->fs, device_already_online);
}
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(ca, "cannot online: device too small");
- return -BCH_ERR_device_size_too_small;
+ return bch_err_throw(ca->fs, device_size_too_small);
}
- BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
+ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
ret = bch2_dev_journal_init(ca, sb->sb);
if (ret)
return ret;
+ struct printbuf name = PRINTBUF;
+ prt_bdevname(&name, sb->bdev);
+ strscpy(ca->name, name.buf, sizeof(ca->name));
+ printbuf_exit(&name);
+
/* Commit: */
ca->disk_sb = *sb;
memset(sb, 0, sizeof(*sb));
+ /*
+ * Stash pointer to the filesystem for blk_holder_ops - note that once
+ * attached to a filesystem, we will always close the block device
+ * before tearing down the filesystem object.
+ */
+ ca->disk_sb.holder->c = ca->fs;
+
ca->dev = ca->disk_sb.bdev->bd_dev;
- percpu_ref_reinit(&ca->io_ref);
+ enumerated_ref_start(&ca->io_ref[READ]);
return 0;
}
@@ -1427,18 +1632,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
- bch2_dev_sysfs_online(c, ca);
-
- struct printbuf name = PRINTBUF;
- prt_bdevname(&name, ca->disk_sb.bdev);
+ set_bit(ca->dev_idx, c->online_devs.d);
- if (c->sb.nr_devices == 1)
- strscpy(c->name, name.buf, sizeof(c->name));
- strscpy(ca->name, name.buf, sizeof(ca->name));
-
- printbuf_exit(&name);
+ bch2_dev_sysfs_online(c, ca);
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
return 0;
}
@@ -1488,7 +1686,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true;
/* do we have enough devices to read from? */
- new_online_devs = bch2_online_devs(c);
+ new_online_devs = c->online_devs;
__clear_bit(ca->dev_idx, new_online_devs.d);
return bch2_have_enough_devs(c, new_online_devs, flags, false);
@@ -1497,42 +1695,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
}
}
-static bool bch2_fs_may_start(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i, flags = 0;
-
- if (c->opts.very_degraded)
- flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
-
- if (c->opts.degraded)
- flags |= BCH_FORCE_IF_DEGRADED;
-
- if (!c->opts.degraded &&
- !c->opts.very_degraded) {
- mutex_lock(&c->sb_lock);
-
- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
- if (!bch2_member_exists(c->disk_sb.sb, i))
- continue;
-
- ca = bch2_dev_locked(c, i);
-
- if (!bch2_dev_is_online(ca) &&
- (ca->mi.state == BCH_MEMBER_STATE_rw ||
- ca->mi.state == BCH_MEMBER_STATE_ro)) {
- mutex_unlock(&c->sb_lock);
- return false;
- }
- }
- mutex_unlock(&c->sb_lock);
- }
-
- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
-}
-
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
+ bch2_dev_io_ref_stop(ca, WRITE);
+
/*
* The allocator thread itself allocates btree nodes, so stop it first:
*/
@@ -1549,6 +1715,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
+
+ if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
+ enumerated_ref_start(&ca->io_ref[WRITE]);
+
bch2_dev_do_discards(ca);
}
@@ -1562,7 +1732,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
return 0;
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
- return -BCH_ERR_device_state_not_allowed;
+ return bch_err_throw(c, device_state_not_allowed);
if (new_state != BCH_MEMBER_STATE_rw)
__bch2_dev_read_only(c, ca);
@@ -1578,7 +1748,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
- rebalance_wakeup(c);
+ bch2_rebalance_wakeup(c);
return ret;
}
@@ -1601,6 +1771,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
unsigned dev_idx = ca->dev_idx, data;
+ bool fast_device_removal = !bch2_request_incompat_feature(c,
+ bcachefs_metadata_version_fast_device_removal);
int ret;
down_write(&c->state_lock);
@@ -1613,17 +1785,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot remove without losing data");
- ret = -BCH_ERR_device_state_not_allowed;
+ ret = bch_err_throw(c, device_state_not_allowed);
goto err;
}
__bch2_dev_read_only(c, ca);
- ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- bch_err_msg(ca, ret, "bch2_dev_data_drop()");
+ ret = fast_device_removal
+ ? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
+ : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
+ bch2_dev_remove_stripes(c, ca->dev_idx, flags));
if (ret)
goto err;
+ /* Check if device still has data before blowing away alloc info */
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
+ if (!data_type_is_empty(i) &&
+ !data_type_is_hidden(i) &&
+ usage.buckets[i]) {
+ bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
+ __bch2_data_types[i], usage.buckets[i]);
+ ret = -EBUSY;
+ goto err;
+ }
+
ret = bch2_dev_remove_alloc(c, ca);
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
@@ -1687,7 +1873,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
- memset(&m->uuid, 0, sizeof(m->uuid));
+
+ if (fast_device_removal)
+ m->uuid = BCH_SB_MEMBER_DELETED_UUID;
+ else
+ memset(&m->uuid, 0, sizeof(m->uuid));
bch2_write_super(c);
@@ -1695,8 +1885,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
up_write(&c->state_lock);
return 0;
err:
- if (ca->mi.state == BCH_MEMBER_STATE_rw &&
- !percpu_ref_is_zero(&ca->io_ref))
+ if (test_bit(BCH_FS_rw, &c->flags) &&
+ ca->mi.state == BCH_MEMBER_STATE_rw &&
+ !enumerated_ref_is_zero(&ca->io_ref[READ]))
__bch2_dev_read_write(c, ca);
up_write(&c->state_lock);
return ret;
@@ -1706,11 +1897,11 @@ err:
int bch2_dev_add(struct bch_fs *c, const char *path)
{
struct bch_opts opts = bch2_opts_empty();
- struct bch_sb_handle sb;
+ struct bch_sb_handle sb = {};
struct bch_dev *ca = NULL;
struct printbuf errbuf = PRINTBUF;
struct printbuf label = PRINTBUF;
- int ret;
+ int ret = 0;
ret = bch2_read_super(path, &opts, &sb);
bch_err_msg(c, ret, "reading super");
@@ -1727,6 +1918,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
}
+ if (list_empty(&c->list)) {
+ mutex_lock(&bch_fs_list_lock);
+ if (__bch2_uuid_to_fs(c->sb.uuid))
+ ret = bch_err_throw(c, filesystem_uuid_already_open);
+ else
+ list_add(&c->list, &bch_fs_list);
+ mutex_unlock(&bch_fs_list_lock);
+
+ if (ret) {
+ bch_err(c, "filesystem UUID already open");
+ goto err;
+ }
+ }
+
ret = bch2_dev_may_add(sb.sb, c);
if (ret)
goto err;
@@ -1743,6 +1948,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
+ SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
ret = bch2_sb_from_fs(c, ca);
bch_err_msg(c, ret, "setting up new superblock");
@@ -1758,6 +1964,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err_unlock;
}
unsigned dev_idx = ret;
+ ret = 0;
/* success: */
@@ -1777,30 +1984,52 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
- ret = bch2_dev_usage_init(ca, false);
- if (ret)
- goto err_late;
+ if (test_bit(BCH_FS_started, &c->flags)) {
+ ret = bch2_dev_usage_init(ca, false);
+ if (ret)
+ goto err_late;
- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
- bch_err_msg(ca, ret, "marking new superblock");
- if (ret)
- goto err_late;
+ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
+ bch_err_msg(ca, ret, "marking new superblock");
+ if (ret)
+ goto err_late;
- ret = bch2_fs_freespace_init(c);
- bch_err_msg(ca, ret, "initializing free space");
- if (ret)
- goto err_late;
+ ret = bch2_fs_freespace_init(c);
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
+ goto err_late;
- if (ca->mi.state == BCH_MEMBER_STATE_rw)
- __bch2_dev_read_write(c, ca);
+ if (ca->mi.state == BCH_MEMBER_STATE_rw)
+ __bch2_dev_read_write(c, ca);
- ret = bch2_dev_journal_alloc(ca, false);
- bch_err_msg(c, ret, "allocating journal");
- if (ret)
- goto err_late;
+ ret = bch2_dev_journal_alloc(ca, false);
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
+ goto err_late;
+ }
+
+ /*
+ * We just changed the superblock UUID, invalidate cache and send a
+ * uevent to update /dev/disk/by-uuid
+ */
+ invalidate_bdev(ca->disk_sb.bdev);
+
+ char uuid_str[37];
+ snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);
+
+ char *envp[] = {
+ "CHANGE=uuid",
+ uuid_str,
+ NULL,
+ };
+ kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
up_write(&c->state_lock);
- return 0;
+out:
+ printbuf_exit(&label);
+ printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
+ return ret;
err_unlock:
mutex_unlock(&c->sb_lock);
@@ -1809,10 +2038,7 @@ err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
- printbuf_exit(&label);
- printbuf_exit(&errbuf);
- bch_err_fn(c, ret);
- return ret;
+ goto out;
err_late:
up_write(&c->state_lock);
ca = NULL;
@@ -1898,7 +2124,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
bch_err(ca, "Cannot offline required disk");
up_write(&c->state_lock);
- return -BCH_ERR_device_state_not_allowed;
+ return bch_err_throw(c, device_state_not_allowed);
}
__bch2_dev_offline(c, ca);
@@ -1907,6 +2133,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
return 0;
}
+static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
+{
+ struct bch_fs *c = ca->fs;
+ u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };
+
+ return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
+ bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
+ .dev = ca->dev_idx,
+ .data_type = BCH_DATA_free)) ?:
+ bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
+}
+
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bch_member *m;
@@ -1925,7 +2163,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
bch_err(ca, "New device size too big (%llu greater than max %u)",
nbuckets, BCH_MEMBER_NBUCKETS_MAX);
- ret = -BCH_ERR_device_size_too_big;
+ ret = bch_err_throw(c, device_size_too_big);
goto err;
}
@@ -1933,7 +2171,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
get_capacity(ca->disk_sb.bdev->bd_disk) <
ca->mi.bucket_size * nbuckets) {
bch_err(ca, "New size larger than device");
- ret = -BCH_ERR_device_size_too_small;
+ ret = bch_err_throw(c, device_size_too_small);
goto err;
}
@@ -1954,16 +2192,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
mutex_unlock(&c->sb_lock);
if (ca->mi.freespace_initialized) {
- struct disk_accounting_pos acc = {
- .type = BCH_DISK_ACCOUNTING_dev_data_type,
- .dev_data_type.dev = ca->dev_idx,
- .dev_data_type.data_type = BCH_DATA_free,
- };
- u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
-
- ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
- bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
if (ret)
goto err;
}
@@ -1974,6 +2203,49 @@ err:
return ret;
}
+int bch2_fs_resize_on_mount(struct bch_fs *c)
+{
+ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
+ u64 old_nbuckets = ca->mi.nbuckets;
+ u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
+ ca->mi.bucket_size);
+
+ if (ca->mi.resize_on_mount &&
+ new_nbuckets > ca->mi.nbuckets) {
+ bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
+ int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
+ bch_err_fn(ca, ret);
+ if (ret) {
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_fs_resize_on_mount);
+ up_write(&c->state_lock);
+ return ret;
+ }
+
+ mutex_lock(&c->sb_lock);
+ struct bch_member *m =
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ m->nbuckets = cpu_to_le64(new_nbuckets);
+ SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);
+
+ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
+
+ if (ca->mi.freespace_initialized) {
+ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
+ if (ret) {
+ enumerated_ref_put(&ca->io_ref[READ],
+ BCH_DEV_READ_REF_fs_resize_on_mount);
+ up_write(&c->state_lock);
+ return ret;
+ }
+ }
+ }
+ }
+ return 0;
+}
+
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
@@ -1986,6 +2258,114 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
+/* blk_holder_ops: */
+
+static struct bch_fs *bdev_get_fs(struct block_device *bdev)
+ __releases(&bdev->bd_holder_lock)
+{
+ struct bch_sb_handle_holder *holder = bdev->bd_holder;
+ struct bch_fs *c = holder->c;
+
+ if (c && !bch2_ro_ref_tryget(c))
+ c = NULL;
+
+ mutex_unlock(&bdev->bd_holder_lock);
+
+ if (c)
+ wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
+ return c;
+}
+
+/* returns with ref on ca->ref */
+static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
+{
+ for_each_member_device(c, ca)
+ if (ca->disk_sb.bdev == bdev)
+ return ca;
+ return NULL;
+}
+
+static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ }
+
+ down_write(&c->state_lock);
+ struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
+ if (!ca)
+ goto unlock;
+
+ bool dev = bch2_dev_state_allowed(c, ca,
+ BCH_MEMBER_STATE_failed,
+ BCH_FORCE_IF_DEGRADED);
+
+ if (!dev && sb) {
+ if (!surprise)
+ sync_filesystem(sb);
+ shrink_dcache_sb(sb);
+ evict_inodes(sb);
+ }
+
+ struct printbuf buf = PRINTBUF;
+ __bch2_log_msg_start(ca->name, &buf);
+
+ prt_printf(&buf, "offline from block layer");
+
+ if (dev) {
+ __bch2_dev_offline(c, ca);
+ } else {
+ bch2_journal_flush(&c->journal);
+ bch2_fs_emergency_read_only2(c, &buf);
+ }
+
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+
+ bch2_dev_put(ca);
+unlock:
+ if (sb)
+ up_read(&sb->s_umount);
+ up_write(&c->state_lock);
+ bch2_ro_ref_put(c);
+}
+
+static void bch2_fs_bdev_sync(struct block_device *bdev)
+{
+ struct bch_fs *c = bdev_get_fs(bdev);
+ if (!c)
+ return;
+
+ struct super_block *sb = c->vfs_sb;
+ if (sb) {
+ /*
+ * Not necessary, c->ro_ref guards against the filesystem being
+ * unmounted - we only take this to avoid a warning in
+ * sync_filesystem:
+ */
+ down_read(&sb->s_umount);
+ sync_filesystem(sb);
+ up_read(&sb->s_umount);
+ }
+
+ bch2_ro_ref_put(c);
+}
+
+const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+ .mark_dead = bch2_fs_bdev_mark_dead,
+ .sync = bch2_fs_bdev_sync,
+};
+
/* Filesystem open: */
static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
@@ -1994,10 +2374,10 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
}
-struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
- struct bch_opts opts)
+struct bch_fs *bch2_fs_open(darray_const_str *devices,
+ struct bch_opts *opts)
{
- DARRAY(struct bch_sb_handle) sbs = { 0 };
+ bch_sb_handles sbs = {};
struct bch_fs *c = NULL;
struct bch_sb_handle *best = NULL;
struct printbuf errbuf = PRINTBUF;
@@ -2006,27 +2386,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
if (!try_module_get(THIS_MODULE))
return ERR_PTR(-ENODEV);
- if (!nr_devices) {
+ if (!devices->nr) {
ret = -EINVAL;
goto err;
}
- ret = darray_make_room(&sbs, nr_devices);
+ ret = darray_make_room(&sbs, devices->nr);
if (ret)
goto err;
- for (unsigned i = 0; i < nr_devices; i++) {
+ darray_for_each(*devices, i) {
struct bch_sb_handle sb = { NULL };
- ret = bch2_read_super(devices[i], &opts, &sb);
+ ret = bch2_read_super(*i, opts, &sb);
if (ret)
goto err;
BUG_ON(darray_push(&sbs, sb));
}
- if (opts.nochanges && !opts.read_only) {
- ret = -BCH_ERR_erofs_nochanges;
+ if (opts->nochanges && !opts->read_only) {
+ ret = bch_err_throw(c, erofs_nochanges);
goto err_print;
}
@@ -2035,7 +2415,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
best = sb;
darray_for_each_reverse(sbs, sb) {
- ret = bch2_dev_in_fs(best, sb, &opts);
+ ret = bch2_dev_in_fs(best, sb, opts);
if (ret == -BCH_ERR_device_has_been_removed ||
ret == -BCH_ERR_device_splitbrain) {
@@ -2050,7 +2430,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err_print;
}
- c = bch2_fs_alloc(best->sb, opts);
+ c = bch2_fs_alloc(best->sb, opts, &sbs);
ret = PTR_ERR_OR_ZERO(c);
if (ret)
goto err;
@@ -2065,11 +2445,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
up_write(&c->state_lock);
- if (!bch2_fs_may_start(c)) {
- ret = -BCH_ERR_insufficient_devices_to_start;
- goto err_print;
- }
-
if (!c->opts.nostart) {
ret = bch2_fs_start(c);
if (ret)
@@ -2084,7 +2459,7 @@ out:
return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
- devices[0], bch2_err_str(ret));
+ devices->data[0], bch2_err_str(ret));
err:
if (!IS_ERR_OR_NULL(c))
bch2_fs_stop(c);
@@ -2121,16 +2496,52 @@ err:
return -ENOMEM;
}
-#define BCH_DEBUG_PARAM(name, description) \
- bool bch2_##name; \
- module_param_named(name, bch2_##name, bool, 0644); \
+#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
+BCH_DEBUG_PARAMS_ALL()
+#undef BCH_DEBUG_PARAM
+
+static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
+{
+ /* Match bool exactly, by re-using it. */
+ struct static_key *key = kp->arg;
+ struct kernel_param boolkp = *kp;
+ bool v;
+ int ret;
+
+ boolkp.arg = &v;
+
+ ret = param_set_bool(val, &boolkp);
+ if (ret)
+ return ret;
+ if (v)
+ static_key_enable(key);
+ else
+ static_key_disable(key);
+ return 0;
+}
+
+static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
+{
+ struct static_key *key = kp->arg;
+ return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'N' : 'Y');
+}
+
+static const struct kernel_param_ops bch2_param_ops_static_key_t = {
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
+ .set = bch2_param_set_static_key_t,
+ .get = bch2_param_get_static_key_t,
+};
+
+#define BCH_DEBUG_PARAM(name, description) \
+ module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
+ __MODULE_PARM_TYPE(name, "static_key_t"); \
MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
__maybe_unused
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
-module_param_named(version, bch2_metadata_version, uint, 0400);
+module_param_named(version, bch2_metadata_version, uint, 0444);
module_exit(bcachefs_exit);
module_init(bcachefs_init);
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index fa6d52216510..e90bab9afe78 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -9,6 +9,9 @@
#include <linux/math64.h>
extern const char * const bch2_fs_flag_strs[];
+extern const char * const bch2_write_refs[];
+extern const char * const bch2_dev_read_refs[];
+extern const char * const bch2_dev_write_refs[];
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
@@ -29,16 +32,24 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
+bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *);
+
+bool bch2_fs_emergency_read_only_locked(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
+int bch2_fs_resize_on_mount(struct bch_fs *);
+
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);
+int bch2_fs_init_rw(struct bch_fs *);
int bch2_fs_start(struct bch_fs *);
-struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
+struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *);
+
+extern const struct blk_holder_ops bch2_sb_handle_bdev_ops;
#endif /* _BCACHEFS_SUPER_H */
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 368a63d938cf..3a899f799d1d 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -2,13 +2,19 @@
#ifndef _BCACHEFS_SUPER_TYPES_H
#define _BCACHEFS_SUPER_TYPES_H
+struct bch_fs;
+
+struct bch_sb_handle_holder {
+ struct bch_fs *c;
+};
+
struct bch_sb_handle {
struct bch_sb *sb;
struct file *s_bdev_file;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
- void *holder;
+ struct bch_sb_handle_holder *holder;
size_t buffer_size;
blk_mode_t mode;
unsigned have_layout:1;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index a7eb1f511484..05848375cea2 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -25,6 +25,8 @@
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
+#include "enumerated_ref.h"
+#include "error.h"
#include "inode.h"
#include "journal.h"
#include "journal_reclaim.h"
@@ -34,7 +36,9 @@
#include "nocow_locking.h"
#include "opts.h"
#include "rebalance.h"
+#include "recovery_passes.h"
#include "replicas.h"
+#include "sb-errors.h"
#include "super-io.h"
#include "tests.h"
@@ -141,20 +145,23 @@ do { \
write_attribute(trigger_gc);
write_attribute(trigger_discards);
write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_commit);
write_attribute(trigger_journal_flush);
write_attribute(trigger_journal_writes);
write_attribute(trigger_btree_cache_shrink);
write_attribute(trigger_btree_key_cache_shrink);
+write_attribute(trigger_btree_updates);
write_attribute(trigger_freelist_wakeup);
+write_attribute(trigger_recalc_capacity);
+write_attribute(trigger_delete_dead_snapshots);
+write_attribute(trigger_emergency_read_only);
read_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
read_attribute(flags);
-read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
-rw_attribute(durability);
read_attribute(io_done);
read_attribute(io_errors);
write_attribute(io_errors_reset);
@@ -169,35 +176,18 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
+read_attribute(errors);
read_attribute(journal_debug);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(btree_reserve_cache);
-read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
-read_attribute(write_points);
read_attribute(nocow_lock_table);
-#ifdef BCH_WRITE_REF_DEBUG
+read_attribute(read_refs);
read_attribute(write_refs);
-static const char * const bch2_write_refs[] = {
-#define x(n) #n,
- BCH_WRITE_REFS()
-#undef x
- NULL
-};
-
-static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
-{
- bch2_printbuf_tabstop_push(out, 24);
-
- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++)
- prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i]));
-}
-#endif
-
read_attribute(internal_uuid);
read_attribute(disk_groups);
@@ -209,14 +199,14 @@ read_attribute(usage_base);
BCH_PERSISTENT_COUNTERS()
#undef x
-rw_attribute(discard);
-read_attribute(state);
rw_attribute(label);
read_attribute(copy_gc_wait);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
+read_attribute(snapshot_delete_status);
+read_attribute(recovery_status);
read_attribute(new_stripes);
@@ -262,10 +252,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n");
for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) {
- struct disk_accounting_pos a = {
- .type = BCH_DISK_ACCOUNTING_compression,
- .compression.type = i,
- };
+ struct disk_accounting_pos a;
+ disk_accounting_key_init(a, compression, .type = i);
struct bpos p = disk_accounting_pos_to_bpos(&a);
u64 v[3];
bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v));
@@ -341,6 +329,12 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
+ if (attr == &sysfs_snapshot_delete_status)
+ bch2_snapshot_delete_status_to_text(out, c);
+
+ if (attr == &sysfs_recovery_status)
+ bch2_recovery_pass_status_to_text(out, c);
+
/* Debugging: */
if (attr == &sysfs_journal_debug)
@@ -355,21 +349,18 @@ SHOW(bch2_fs)
if (attr == &sysfs_btree_reserve_cache)
bch2_btree_reserve_cache_to_text(out, c);
- if (attr == &sysfs_stripes_heap)
- bch2_stripes_heap_to_text(out, c);
-
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, NULL);
if (attr == &sysfs_open_buckets_partial)
bch2_open_buckets_partial_to_text(out, c);
- if (attr == &sysfs_write_points)
- bch2_write_points_to_text(out, c);
-
if (attr == &sysfs_compression_stats)
bch2_compression_stats_to_text(out, c);
+ if (attr == &sysfs_errors)
+ bch2_fs_errors_to_text(out, c);
+
if (attr == &sysfs_new_stripes)
bch2_new_stripes_to_text(out, c);
@@ -382,10 +373,8 @@ SHOW(bch2_fs)
if (attr == &sysfs_moving_ctxts)
bch2_fs_moving_ctxts_to_text(out, c);
-#ifdef BCH_WRITE_REF_DEBUG
if (attr == &sysfs_write_refs)
- bch2_write_refs_to_text(out, c);
-#endif
+ enumerated_ref_to_text(out, &c->writes, bch2_write_refs);
if (attr == &sysfs_nocow_lock_table)
bch2_nocow_locks_to_text(out, &c->nocow_locks);
@@ -415,7 +404,10 @@ STORE(bch2_fs)
/* Debugging: */
- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
+ if (attr == &sysfs_trigger_btree_updates)
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
+
+ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))
return -EROFS;
if (attr == &sysfs_trigger_btree_cache_shrink) {
@@ -444,6 +436,9 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_invalidates)
bch2_do_invalidates(c);
+ if (attr == &sysfs_trigger_journal_commit)
+ bch2_journal_flush(&c->journal);
+
if (attr == &sysfs_trigger_journal_flush) {
bch2_journal_flush_all_pins(&c->journal);
bch2_journal_meta(&c->journal);
@@ -455,6 +450,25 @@ STORE(bch2_fs)
if (attr == &sysfs_trigger_freelist_wakeup)
closure_wake_up(&c->freelist_wait);
+ if (attr == &sysfs_trigger_recalc_capacity) {
+ down_read(&c->state_lock);
+ bch2_recalc_capacity(c);
+ up_read(&c->state_lock);
+ }
+
+ if (attr == &sysfs_trigger_delete_dead_snapshots)
+ __bch2_delete_dead_snapshots(c);
+
+ if (attr == &sysfs_trigger_emergency_read_only) {
+ struct printbuf buf = PRINTBUF;
+ bch2_log_msg_start(c, &buf);
+
+ prt_printf(&buf, "shutdown by sysfs\n");
+ bch2_fs_emergency_read_only2(c, &buf);
+ bch2_print_str(c, KERN_ERR, buf.buf);
+ printbuf_exit(&buf);
+ }
+
#ifdef CONFIG_BCACHEFS_TESTS
if (attr == &sysfs_perf_test) {
char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -475,7 +489,7 @@ STORE(bch2_fs)
size = ret;
}
#endif
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
return size;
}
SYSFS_OPS(bch2_fs);
@@ -486,8 +500,11 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_write_stats,
&sysfs_rebalance_status,
+ &sysfs_snapshot_delete_status,
+ &sysfs_recovery_status,
&sysfs_compression_stats,
+ &sysfs_errors,
#ifdef CONFIG_BCACHEFS_TESTS
&sysfs_perf_test,
@@ -566,13 +583,9 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_btree_key_cache,
&sysfs_btree_reserve_cache,
&sysfs_new_stripes,
- &sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_open_buckets_partial,
- &sysfs_write_points,
-#ifdef BCH_WRITE_REF_DEBUG
&sysfs_write_refs,
-#endif
&sysfs_nocow_lock_table,
&sysfs_io_timers_read,
&sysfs_io_timers_write,
@@ -580,11 +593,16 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_trigger_gc,
&sysfs_trigger_discards,
&sysfs_trigger_invalidates,
+ &sysfs_trigger_journal_commit,
&sysfs_trigger_journal_flush,
&sysfs_trigger_journal_writes,
&sysfs_trigger_btree_cache_shrink,
&sysfs_trigger_btree_key_cache_shrink,
+ &sysfs_trigger_btree_updates,
&sysfs_trigger_freelist_wakeup,
+ &sysfs_trigger_recalc_capacity,
+ &sysfs_trigger_delete_dead_snapshots,
+ &sysfs_trigger_emergency_read_only,
&sysfs_gc_gens_pos,
@@ -604,87 +622,115 @@ struct attribute *bch2_fs_internal_files[] = {
/* options */
-SHOW(bch2_fs_opts_dir)
+static ssize_t sysfs_opt_show(struct bch_fs *c,
+ struct bch_dev *ca,
+ enum bch_opt_id id,
+ struct printbuf *out)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int id = opt - bch2_opt_table;
- u64 v = bch2_opt_get_by_id(&c->opts, id);
+ const struct bch_option *opt = bch2_opt_table + id;
+ u64 v;
+
+ if (opt->flags & OPT_FS) {
+ v = bch2_opt_get_by_id(&c->opts, id);
+ } else if ((opt->flags & OPT_DEVICE) && opt->get_member) {
+ v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx);
+ } else {
+ return -EINVAL;
+ }
bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST);
prt_char(out, '\n');
-
return 0;
}
-STORE(bch2_fs_opts_dir)
+static ssize_t sysfs_opt_store(struct bch_fs *c,
+ struct bch_dev *ca,
+ enum bch_opt_id id,
+ const char *buf, size_t size)
{
- struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
- const struct bch_option *opt = container_of(attr, struct bch_option, attr);
- int ret, id = opt - bch2_opt_table;
- char *tmp;
- u64 v;
+ const struct bch_option *opt = bch2_opt_table + id;
+ int ret = 0;
/*
* We don't need to take c->writes for correctness, but it eliminates an
* unsightly error message in the dmesg log when we're RO:
*/
- if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
+ if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)))
return -EROFS;
- tmp = kstrdup(buf, GFP_KERNEL);
+ char *tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp) {
ret = -ENOMEM;
goto err;
}
- ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL);
+ u64 v;
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?:
+ bch2_opt_hook_pre_set(c, ca, id, v);
kfree(tmp);
if (ret < 0)
goto err;
- ret = bch2_opt_check_may_set(c, id, v);
- if (ret < 0)
- goto err;
-
- bch2_opt_set_sb(c, NULL, opt, v);
- bch2_opt_set_by_id(&c->opts, id, v);
-
- if (v &&
- (id == Opt_background_target ||
- id == Opt_background_compression ||
- (id == Opt_compression && !c->opts.background_compression)))
- bch2_set_rebalance_needs_scan(c, 0);
+ bool is_sb = opt->get_sb || opt->get_member;
+ bool changed = false;
+
+ if (is_sb) {
+ changed = bch2_opt_set_sb(c, ca, opt, v);
+ } else if (!ca) {
+ changed = bch2_opt_get_by_id(&c->opts, id) != v;
+ } else {
+ /* device options that aren't superblock options aren't
+ * supported */
+ BUG();
+ }
- if (v && id == Opt_rebalance_enabled)
- rebalance_wakeup(c);
+ if (!ca)
+ bch2_opt_set_by_id(&c->opts, id, v);
- if (v && id == Opt_copygc_enabled &&
- c->copygc_thread)
- wake_up_process(c->copygc_thread);
+ if (changed)
+ bch2_opt_hook_post_set(c, ca, 0, &c->opts, id);
ret = size;
err:
- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
+ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs);
return ret;
}
+
+SHOW(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ int id = bch2_opt_lookup(attr->name);
+ if (id < 0)
+ return 0;
+
+ return sysfs_opt_show(c, NULL, id, out);
+}
+
+STORE(bch2_fs_opts_dir)
+{
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
+ int id = bch2_opt_lookup(attr->name);
+ if (id < 0)
+ return 0;
+
+ return sysfs_opt_store(c, NULL, id, buf, size);
+}
SYSFS_OPS(bch2_fs_opts_dir);
struct attribute *bch2_fs_opts_dir_files[] = { NULL };
-int bch2_opts_create_sysfs_files(struct kobject *kobj)
+int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
{
- const struct bch_option *i;
- int ret;
-
- for (i = bch2_opt_table;
+ for (const struct bch_option *i = bch2_opt_table;
i < bch2_opt_table + bch2_opts_nr;
i++) {
- if (!(i->flags & OPT_FS))
+ if (i->flags & OPT_HIDDEN)
+ continue;
+ if (!(i->flags & type))
continue;
- ret = sysfs_create_file(kobj, &i->attr);
+ int ret = sysfs_create_file(kobj, &i->attr);
if (ret)
return ret;
}
@@ -755,11 +801,8 @@ SHOW(bch2_dev)
sysfs_printf(uuid, "%pU\n", ca->uuid.b);
- sysfs_print(bucket_size, bucket_bytes(ca));
sysfs_print(first_bucket, ca->mi.first_bucket);
sysfs_print(nbuckets, ca->mi.nbuckets);
- sysfs_print(durability, ca->mi.durability);
- sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_label) {
if (ca->mi.group)
@@ -772,11 +815,6 @@ SHOW(bch2_dev)
prt_char(out, '\n');
}
- if (attr == &sysfs_state) {
- prt_string_option(out, bch2_member_states, ca->mi.state);
- prt_char(out, '\n');
- }
-
if (attr == &sysfs_io_done)
dev_io_done_to_text(out, ca);
@@ -802,6 +840,16 @@ SHOW(bch2_dev)
if (attr == &sysfs_open_buckets)
bch2_open_buckets_to_text(out, c, ca);
+ int opt_id = bch2_opt_lookup(attr->name);
+ if (opt_id >= 0)
+ return sysfs_opt_show(c, ca, opt_id, out);
+
+ if (attr == &sysfs_read_refs)
+ enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs);
+
+ if (attr == &sysfs_write_refs)
+ enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs);
+
return 0;
}
@@ -810,18 +858,6 @@ STORE(bch2_dev)
struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
struct bch_fs *c = ca->fs;
- if (attr == &sysfs_discard) {
- bool v = strtoul_or_return(buf);
-
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v);
- }
-
- if (attr == &sysfs_durability) {
- u64 v = strtoul_or_return(buf);
-
- bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v);
- }
-
if (attr == &sysfs_label) {
char *tmp;
int ret;
@@ -839,20 +875,20 @@ STORE(bch2_dev)
if (attr == &sysfs_io_errors_reset)
bch2_dev_errors_reset(ca);
+ int opt_id = bch2_opt_lookup(attr->name);
+ if (opt_id >= 0)
+ return sysfs_opt_store(c, ca, opt_id, buf, size);
+
return size;
}
SYSFS_OPS(bch2_dev);
struct attribute *bch2_dev_files[] = {
&sysfs_uuid,
- &sysfs_bucket_size,
&sysfs_first_bucket,
&sysfs_nbuckets,
- &sysfs_durability,
/* settings: */
- &sysfs_discard,
- &sysfs_state,
&sysfs_label,
&sysfs_has_data,
@@ -869,6 +905,9 @@ struct attribute *bch2_dev_files[] = {
/* debug: */
&sysfs_alloc_debug,
&sysfs_open_buckets,
+
+ &sysfs_read_refs,
+ &sysfs_write_refs,
NULL
};
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
index 222cd5062702..303e0433c702 100644
--- a/fs/bcachefs/sysfs.h
+++ b/fs/bcachefs/sysfs.h
@@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
extern const struct sysfs_ops bch2_dev_sysfs_ops;
-int bch2_opts_create_sysfs_files(struct kobject *);
+int bch2_opts_create_sysfs_files(struct kobject *, unsigned);
#else
@@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
static const struct sysfs_ops bch2_dev_sysfs_ops;
-static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type)
+{ return 0; }
#endif /* NO_BCACHEFS_SYSFS */
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 6c6469814637..782a05fe7656 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &k.k_i, 0));
bch_err_msg(c, ret, "update error");
if (ret)
@@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
pr_info("deleting once");
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error (first)");
if (ret)
@@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr)
pr_info("deleting twice");
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error (second)");
if (ret)
@@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
BTREE_ITER_intent);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_trans_update(trans, &iter, &k.k_i, 0));
bch_err_msg(c, ret, "update error");
if (ret)
@@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
bch2_journal_flush_all_pins(&c->journal);
ret = commit_do(trans, NULL, NULL, 0,
- bch2_btree_iter_traverse(&iter) ?:
+ bch2_btree_iter_traverse(trans, &iter) ?:
bch2_btree_delete_at(trans, &iter, 0));
bch_err_msg(c, ret, "delete error");
if (ret)
@@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
*/
static int test_peek_end(struct bch_fs *c, u64 nr)
{
+ delete_test_keys(c);
+
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
@@ -349,10 +351,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
+ delete_test_keys(c);
+
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
@@ -369,10 +373,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr)
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(0, 0, U32_MAX), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k);
bch2_trans_iter_exit(trans, &iter);
@@ -488,7 +492,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
SPOS(0, 0, snapid_lo), 0);
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX))));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX))));
BUG_ON(k.k->p.snapshot != U32_MAX);
@@ -602,9 +606,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
SPOS(0, 0, U32_MAX), 0);
for (i = 0; i < nr; i++) {
- bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
+ bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX));
- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
ret = bkey_err(k);
if (ret)
break;
@@ -623,9 +627,9 @@ static int rand_mixed_trans(struct btree_trans *trans,
struct bkey_s_c k;
int ret;
- bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+ bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX));
- k = bch2_btree_iter_peek(iter);
+ k = bch2_btree_iter_peek(trans, iter);
ret = bkey_err(k);
bch_err_msg(trans->c, ret, "lookup error");
if (ret)
@@ -672,7 +676,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_intent);
- k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX));
+ k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index dea73bc1cb51..314a24d15d4e 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki
struct stdio_buf *buf = &stdio->output;
unsigned long flags;
ssize_t ret;
-
again:
+ if (stdio->done)
+ return -EPIPE;
+
spin_lock_irqsave(&buf->lock, flags);
ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
spin_unlock_irqrestore(&buf->lock, flags);
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
index 3fe82757f93a..2c34fe4be912 100644
--- a/fs/bcachefs/time_stats.c
+++ b/fs/bcachefs/time_stats.c
@@ -10,6 +10,9 @@
#include "eytzinger.h"
#include "time_stats.h"
+/* disable automatic switching to percpu mode */
+#define TIME_STATS_NONPCPU ((unsigned long) 1)
+
static const struct time_unit time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
@@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
- if (!stats->buffer) {
+ if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) {
spin_lock_irqsave(&stats->lock, flags);
time_stats_update_one(stats, start, end);
- if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+ if (!stats->buffer &&
+ mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
stats->duration_stats.n > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
@@ -157,7 +161,7 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
unsigned offset = offsetof(struct bch2_time_stats, min_duration);
memset((void *) stats + offset, 0, sizeof(*stats) - offset);
- if (stats->buffer) {
+ if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) {
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(stats->buffer, cpu)->nr = 0;
@@ -167,7 +171,9 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats)
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
- free_percpu(stats->buffer);
+ if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU)
+ free_percpu(stats->buffer);
+ stats->buffer = NULL;
}
void bch2_time_stats_init(struct bch2_time_stats *stats)
@@ -177,3 +183,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
stats->min_freq = U64_MAX;
spin_lock_init(&stats->lock);
}
+
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats)
+{
+ bch2_time_stats_init(stats);
+ stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU;
+}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
index dc6493f7bbab..eddb0985bab4 100644
--- a/fs/bcachefs/time_stats.h
+++ b/fs/bcachefs/time_stats.h
@@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
void bch2_time_stats_reset(struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
void bch2_time_stats_init(struct bch2_time_stats *);
+void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *);
static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
{
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 56a5a7fbc0fd..9c5a9c551f03 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio,
(unsigned long long)__entry->sector, __entry->nr_sector)
);
+/* errors */
+
+TRACE_EVENT(error_throw,
+ TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip),
+ TP_ARGS(c, bch_err, ip),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(int, err )
+ __array(char, err_str, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->err = bch_err;
+ strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ip, __entry->err_str)
+);
+
+TRACE_EVENT(error_downcast,
+ TP_PROTO(int bch_err, int std_err, unsigned long ip),
+ TP_ARGS(bch_err, std_err, ip),
+
+ TP_STRUCT__entry(
+ __array(char, bch_err, 32 )
+ __array(char, std_err, 32 )
+ __array(char, ip, 32 )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+ ),
+
+ TP_printk("%s ret %s -> %s %s", __entry->ip,
+ __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
/* disk_accounting.c */
TRACE_EVENT(accounting_mem_insert,
@@ -295,12 +339,12 @@ TRACE_EVENT(write_super,
/* io.c: */
-DEFINE_EVENT(bio, read_promote,
+DEFINE_EVENT(bio, io_read_promote,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-TRACE_EVENT(read_nopromote,
+TRACE_EVENT(io_read_nopromote,
TP_PROTO(struct bch_fs *c, int ret),
TP_ARGS(c, ret),
@@ -319,26 +363,55 @@ TRACE_EVENT(read_nopromote,
__entry->ret)
);
-DEFINE_EVENT(bio, read_bounce,
+DEFINE_EVENT(bio, io_read_bounce,
+ TP_PROTO(struct bio *bio),
+ TP_ARGS(bio)
+);
+
+DEFINE_EVENT(bio, io_read_split,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, read_split,
+DEFINE_EVENT(bio, io_read_retry,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, read_retry,
+DEFINE_EVENT(bio, io_read_reuse_race,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
-DEFINE_EVENT(bio, read_reuse_race,
+DEFINE_EVENT(bio, io_read_fail_and_poison,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
);
+/* ec.c */
+
+TRACE_EVENT(stripe_create,
+ TP_PROTO(struct bch_fs *c, u64 idx, int ret),
+ TP_ARGS(c, idx, ret),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u64, idx )
+ __field(int, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->idx = idx;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("%d,%d idx %llu ret %i",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->idx,
+ __entry->ret)
+);
+
/* Journal */
DEFINE_EVENT(bch_fs, journal_full,
@@ -727,7 +800,7 @@ DEFINE_EVENT(fs_str, bucket_alloc_fail,
TP_ARGS(c, str)
);
-TRACE_EVENT(discard_buckets,
+DECLARE_EVENT_CLASS(discard_buckets_class,
TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
u64 need_journal_commit, u64 discarded, const char *err),
TP_ARGS(c, seen, open, need_journal_commit, discarded, err),
@@ -759,6 +832,18 @@ TRACE_EVENT(discard_buckets,
__entry->err)
);
+DEFINE_EVENT(discard_buckets_class, discard_buckets,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
+DEFINE_EVENT(discard_buckets_class, discard_buckets_fast,
+ TP_PROTO(struct bch_fs *c, u64 seen, u64 open,
+ u64 need_journal_commit, u64 discarded, const char *err),
+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err)
+);
+
TRACE_EVENT(bucket_invalidate,
TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors),
TP_ARGS(c, dev, bucket, sectors),
@@ -785,53 +870,37 @@ TRACE_EVENT(bucket_invalidate,
/* Moving IO */
-TRACE_EVENT(bucket_evacuate,
- TP_PROTO(struct bch_fs *c, struct bpos *bucket),
- TP_ARGS(c, bucket),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u32, dev_idx )
- __field(u64, bucket )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->dev_idx = bucket->inode;
- __entry->bucket = bucket->offset;
- ),
-
- TP_printk("%d:%d %u:%llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->dev_idx, __entry->bucket)
+DEFINE_EVENT(fs_str, io_move,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent,
+DEFINE_EVENT(fs_str, io_move_read,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_read,
+DEFINE_EVENT(fs_str, io_move_write,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_write,
+DEFINE_EVENT(fs_str, io_move_finish,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_finish,
+DEFINE_EVENT(fs_str, io_move_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_fail,
+DEFINE_EVENT(fs_str, io_move_write_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
-DEFINE_EVENT(fs_str, move_extent_start_fail,
+DEFINE_EVENT(fs_str, io_move_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@@ -869,37 +938,6 @@ TRACE_EVENT(move_data,
__entry->sectors_raced)
);
-TRACE_EVENT(evacuate_bucket,
- TP_PROTO(struct bch_fs *c, struct bpos *bucket,
- unsigned sectors, unsigned bucket_size,
- int ret),
- TP_ARGS(c, bucket, sectors, bucket_size, ret),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __field(u64, member )
- __field(u64, bucket )
- __field(u32, sectors )
- __field(u32, bucket_size )
- __field(int, ret )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __entry->member = bucket->inode;
- __entry->bucket = bucket->offset;
- __entry->sectors = sectors;
- __entry->bucket_size = bucket_size;
- __entry->ret = ret;
- ),
-
- TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->member, __entry->bucket,
- __entry->sectors, __entry->bucket_size,
- __entry->ret)
-);
-
TRACE_EVENT(copygc,
TP_PROTO(struct bch_fs *c,
u64 buckets,
@@ -1042,34 +1080,14 @@ TRACE_EVENT(trans_blocked_journal_reclaim,
__entry->must_wait)
);
-TRACE_EVENT(trans_restart_journal_preres_get,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- unsigned flags),
- TP_ARGS(trans, caller_ip, flags),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(unsigned, flags )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->flags = flags;
- ),
-
- TP_printk("%s %pS %x", __entry->trans_fn,
- (void *) __entry->caller_ip,
- __entry->flags)
-);
-
+#if 0
+/* todo: bring back dynamic fault injection */
DEFINE_EVENT(transaction_event, trans_restart_fault_inject,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
);
+#endif
DEFINE_EVENT(transaction_event, trans_traverse_all,
TP_PROTO(struct btree_trans *trans,
@@ -1133,51 +1151,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-TRACE_EVENT(trans_restart_upgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_locks_want,
- unsigned new_locks_want,
- struct get_locks_fail *f),
- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(u8, btree_id )
- __field(u8, old_locks_want )
- __field(u8, new_locks_want )
- __field(u8, level )
- __field(u32, path_seq )
- __field(u32, node_seq )
- TRACE_BPOS_entries(pos)
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
- __entry->btree_id = path->btree_id;
- __entry->old_locks_want = old_locks_want;
- __entry->new_locks_want = new_locks_want;
- __entry->level = f->l;
- __entry->path_seq = path->l[f->l].lock_seq;
- __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
- TRACE_BPOS_assign(pos, path->pos)
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->old_locks_want,
- __entry->new_locks_want,
- __entry->level,
- __entry->path_seq,
- __entry->node_seq)
+DEFINE_EVENT(fs_str, trans_restart_upgrade,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(trans_str, trans_restart_relock,
@@ -1199,19 +1175,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
-DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
-);
-
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1233,13 +1196,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
-);
-
DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1298,44 +1254,6 @@ TRACE_EVENT(trans_restart_mem_realloced,
__entry->bytes)
);
-TRACE_EVENT(trans_restart_key_cache_key_realloced,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path,
- unsigned old_u64s,
- unsigned new_u64s),
- TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s),
-
- TP_STRUCT__entry(
- __array(char, trans_fn, 32 )
- __field(unsigned long, caller_ip )
- __field(enum btree_id, btree_id )
- TRACE_BPOS_entries(pos)
- __field(u32, old_u64s )
- __field(u32, new_u64s )
- ),
-
- TP_fast_assign(
- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
- __entry->caller_ip = caller_ip;
-
- __entry->btree_id = path->btree_id;
- TRACE_BPOS_assign(pos, path->pos);
- __entry->old_u64s = old_u64s;
- __entry->new_u64s = new_u64s;
- ),
-
- TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
- __entry->trans_fn,
- (void *) __entry->caller_ip,
- bch2_btree_id_str(__entry->btree_id),
- __entry->pos_inode,
- __entry->pos_offset,
- __entry->pos_snapshot,
- __entry->old_u64s,
- __entry->new_u64s)
-);
-
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
@@ -1479,23 +1397,44 @@ DEFINE_EVENT(fs_str, data_update,
TP_ARGS(c, str)
);
-TRACE_EVENT(error_downcast,
- TP_PROTO(int bch_err, int std_err, unsigned long ip),
- TP_ARGS(bch_err, std_err, ip),
+DEFINE_EVENT(fs_str, io_move_pred,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_STRUCT__entry(
- __array(char, bch_err, 32 )
- __array(char, std_err, 32 )
- __array(char, ip, 32 )
- ),
+DEFINE_EVENT(fs_str, io_move_created_rebalance,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_fast_assign(
- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
- ),
+DEFINE_EVENT(fs_str, io_move_evacuate_bucket,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
- TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+DEFINE_EVENT(fs_str, extent_trim_atomic,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, btree_iter_peek_slot,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, __btree_iter_peek,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, btree_iter_peek_max,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, btree_iter_peek_prev_min,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
#ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS
@@ -1910,21 +1849,6 @@ TRACE_EVENT(btree_path_free,
__entry->dup_locked)
);
-TRACE_EVENT(btree_path_free_trans_begin,
- TP_PROTO(btree_path_idx_t path),
- TP_ARGS(path),
-
- TP_STRUCT__entry(
- __field(btree_path_idx_t, idx )
- ),
-
- TP_fast_assign(
- __entry->idx = path;
- ),
-
- TP_printk(" path %3u", __entry->idx)
-);
-
#else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
#ifndef _TRACE_BCACHEFS_H
@@ -1942,7 +1866,6 @@ static inline void trace_btree_path_traverse_start(struct btree_trans *trans, st
static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {}
static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {}
static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {}
-static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {}
#endif
#endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index e0a876cbaa6b..df9a6071fe18 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -252,8 +252,17 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
}
-static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
- bool nonblocking)
+static bool string_is_spaces(const char *str)
+{
+ while (*str) {
+ if (*str != ' ')
+ return false;
+ str++;
+ }
+ return true;
+}
+
+void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
bool locked = false;
const char *p;
@@ -263,15 +272,13 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
return;
}
- if (!nonblocking) {
- console_lock();
- locked = true;
- } else {
- locked = console_trylock();
- }
+ locked = console_trylock();
- while (1) {
+ while (*lines) {
p = strchrnul(lines, '\n');
+ if (!*p && string_is_spaces(lines))
+ break;
+
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
if (!*p)
break;
@@ -281,16 +288,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines,
console_unlock();
}
-void bch2_print_string_as_lines(const char *prefix, const char *lines)
-{
- return __bch2_print_string_as_lines(prefix, lines, false);
-}
-
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines)
-{
- return __bch2_print_string_as_lines(prefix, lines, true);
-}
-
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
gfp_t gfp)
{
@@ -473,10 +470,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
u64 last_q = 0;
prt_printf(out, "quantiles (%s):\t", u->name);
- eytzinger0_for_each(i, NR_QUANTILES) {
- bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+ eytzinger0_for_each(j, NR_QUANTILES) {
+ bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1;
- u64 q = max(quantiles->entries[i].m, last_q);
+ u64 q = max(quantiles->entries[j].m, last_q);
prt_printf(out, "%llu ", div64_u64(q, u->nsecs));
if (is_last)
prt_newline(out);
@@ -653,19 +650,25 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
return 0;
}
-size_t bch2_rand_range(size_t max)
+u64 bch2_get_random_u64_below(u64 ceil)
{
- size_t rand;
-
- if (!max)
- return 0;
-
- do {
- rand = get_random_long();
- rand &= roundup_pow_of_two(max) - 1;
- } while (rand >= max);
+ if (ceil <= U32_MAX)
+ return __get_random_u32_below(ceil);
+
+ /* this is the same (clever) algorithm as in __get_random_u32_below() */
+ u64 rand = get_random_u64();
+ u64 mult = ceil * rand;
+
+ if (unlikely(mult < ceil)) {
+ u64 bound;
+ div64_u64_rem(-ceil, ceil, &bound);
+ while (unlikely(mult < bound)) {
+ rand = get_random_u64();
+ mult = ceil * rand;
+ }
+ }
- return rand;
+ return mul_u64_u64_shr(ceil, rand, 64);
}
void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
@@ -698,12 +701,43 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
}
}
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *bio)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64));
+
+ bio_for_each_segment(bv, bio, iter) {
+ unsigned u64s = bv.bv_len / sizeof(u64);
+
+ if (offset < u64s) {
+ u64 *segment = bvec_kmap_local(&bv);
+ segment[offset] = get_random_u64();
+ kunmap_local(segment);
+ return;
+ }
+ offset -= u64s;
+ }
+}
+#endif
+
+void bch2_bio_to_text(struct printbuf *out, struct bio *bio)
+{
+ prt_printf(out, "bi_remaining:\t%u\n",
+ atomic_read(&bio->__bi_remaining));
+ prt_printf(out, "bi_end_io:\t%ps\n",
+ bio->bi_end_io);
+ prt_printf(out, "bi_status:\t%u\n",
+ bio->bi_status);
+}
+
#if 0
void eytzinger1_test(void)
{
- unsigned inorder, eytz, size;
+ unsigned inorder, size;
- pr_info("1 based eytzinger test:");
+ pr_info("1 based eytzinger test:\n");
for (size = 2;
size < 65536;
@@ -711,13 +745,7 @@ void eytzinger1_test(void)
unsigned extra = eytzinger1_extra(size);
if (!(size % 4096))
- pr_info("tree size %u", size);
-
- BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
- BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
-
- BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0);
- BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0);
+ pr_info("tree size %u\n", size);
inorder = 1;
eytzinger1_for_each(eytz, size) {
@@ -728,15 +756,16 @@ void eytzinger1_test(void)
inorder++;
}
+ BUG_ON(inorder - 1 != size);
}
}
void eytzinger0_test(void)
{
- unsigned inorder, eytz, size;
+ unsigned inorder, size;
- pr_info("0 based eytzinger test:");
+ pr_info("0 based eytzinger test:\n");
for (size = 1;
size < 65536;
@@ -744,13 +773,7 @@ void eytzinger0_test(void)
unsigned extra = eytzinger0_extra(size);
if (!(size % 4096))
- pr_info("tree size %u", size);
-
- BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
- BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
-
- BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1);
- BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1);
+ pr_info("tree size %u\n", size);
inorder = 0;
eytzinger0_for_each(eytz, size) {
@@ -761,54 +784,191 @@ void eytzinger0_test(void)
inorder++;
}
+ BUG_ON(inorder != size);
+
+ inorder = size - 1;
+ eytzinger0_for_each_prev(eytz, size) {
+ BUG_ON(eytz != eytzinger0_first(size) &&
+ eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz);
+
+ inorder--;
+ }
+ BUG_ON(inorder != -1);
}
}
-static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+static inline int cmp_u16(const void *_l, const void *_r)
{
const u16 *l = _l, *r = _r;
- return (*l > *r) - (*r - *l);
+ return (*l > *r) - (*r > *l);
}
-static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search)
{
- int i, c1 = -1, c2 = -1;
- ssize_t r;
+ int r, s;
+ bool bad;
r = eytzinger0_find_le(test_array, nr,
sizeof(test_array[0]),
cmp_u16, &search);
- if (r >= 0)
- c1 = test_array[r];
-
- for (i = 0; i < nr; i++)
- if (test_array[i] <= search && test_array[i] > c2)
- c2 = test_array[i];
-
- if (c1 != c2) {
- eytzinger0_for_each(i, nr)
- pr_info("[%3u] = %12u", i, test_array[i]);
- pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
- i, r, c1, c2);
+ if (r >= 0) {
+ if (test_array[r] > search) {
+ bad = true;
+ } else {
+ s = eytzinger0_next(r, nr);
+ bad = s >= 0 && test_array[s] <= search;
+ }
+ } else {
+ s = eytzinger0_last(nr);
+ bad = s >= 0 && test_array[s] <= search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each_prev(j, nr) {
+ if (test_array[j] <= search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_le(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
}
}
+static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search)
+{
+ int r, s;
+ bool bad;
+
+ r = eytzinger0_find_gt(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0) {
+ if (test_array[r] <= search) {
+ bad = true;
+ } else {
+ s = eytzinger0_prev(r, nr);
+ bad = s >= 0 && test_array[s] > search;
+ }
+ } else {
+ s = eytzinger0_first(nr);
+ bad = s >= 0 && test_array[s] > search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each(j, nr) {
+ if (test_array[j] > search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_gt(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search)
+{
+ int r, s;
+ bool bad;
+
+ r = eytzinger0_find_ge(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ if (r >= 0) {
+ if (test_array[r] < search) {
+ bad = true;
+ } else {
+ s = eytzinger0_prev(r, nr);
+ bad = s >= 0 && test_array[s] >= search;
+ }
+ } else {
+ s = eytzinger0_first(nr);
+ bad = s >= 0 && test_array[s] >= search;
+ }
+
+ if (bad) {
+ s = -1;
+ eytzinger0_for_each(j, nr) {
+ if (test_array[j] >= search) {
+ s = j;
+ break;
+ }
+ }
+
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find_ge(%12u) = %3i should be %3i\n",
+ search, r, s);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search)
+{
+ unsigned r;
+ int s;
+ bool bad;
+
+ r = eytzinger0_find(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+
+ if (r < nr) {
+ bad = test_array[r] != search;
+ } else {
+ s = eytzinger0_find_le(test_array, nr,
+ sizeof(test_array[0]),
+ cmp_u16, &search);
+ bad = s >= 0 && test_array[s] == search;
+ }
+
+ if (bad) {
+ eytzinger0_for_each(j, nr)
+ pr_info("[%3u] = %12u\n", j, test_array[j]);
+ pr_info("find(%12u) = %3i is incorrect\n",
+ search, r);
+ BUG();
+ }
+}
+
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+ eytzinger0_find_test_le(test_array, nr, search);
+ eytzinger0_find_test_gt(test_array, nr, search);
+ eytzinger0_find_test_ge(test_array, nr, search);
+ eytzinger0_find_test_eq(test_array, nr, search);
+}
+
void eytzinger0_find_test(void)
{
unsigned i, nr, allocated = 1 << 12;
u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
for (nr = 1; nr < allocated; nr++) {
- pr_info("testing %u elems", nr);
+ u16 prev = 0;
+
+ pr_info("testing %u elems\n", nr);
get_random_bytes(test_array, nr * sizeof(test_array[0]));
eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
/* verify array is sorted correctly: */
- eytzinger0_for_each(i, nr)
- BUG_ON(i != eytzinger0_last(nr) &&
- test_array[i] > test_array[eytzinger0_next(i, nr)]);
+ eytzinger0_for_each(j, nr) {
+ BUG_ON(test_array[j] < prev);
+ prev = test_array[j];
+ }
for (i = 0; i < U16_MAX; i += 1 << 12)
eytzinger0_find_test_val(test_array, nr, i);
@@ -850,14 +1010,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
return ret;
}
-void bch2_darray_str_exit(darray_str *d)
+void bch2_darray_str_exit(darray_const_str *d)
{
darray_for_each(*d, i)
kfree(*i);
darray_exit(d);
}
-int bch2_split_devs(const char *_dev_name, darray_str *ret)
+int bch2_split_devs(const char *_dev_name, darray_const_str *ret)
{
darray_init(ret);
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 1a1720116071..6488f098d140 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -14,8 +14,10 @@
#include <linux/log2.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
+#include <linux/random.h>
#include <linux/ratelimit.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
@@ -55,15 +57,16 @@ static inline size_t buf_pages(void *p, size_t len)
PAGE_SIZE);
}
-static inline void *bch2_kvmalloc(size_t n, gfp_t flags)
+static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags)
{
void *p = unlikely(n >= INT_MAX)
- ? vmalloc(n)
- : kvmalloc(n, flags & ~__GFP_ZERO);
+ ? vmalloc_noprof(n)
+ : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
if (p && (flags & __GFP_ZERO))
memset(p, 0, n);
return p;
}
+#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
#define init_heap(heap, _size, gfp) \
({ \
@@ -94,6 +97,7 @@ do { \
#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n)
#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n)
+#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n)
#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n)
#define prt_newline(_out) bch2_prt_newline(_out)
@@ -210,8 +214,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]);
void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
void bch2_prt_u64_base2(struct printbuf *, u64);
-void bch2_print_string_as_lines(const char *prefix, const char *lines);
-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
+void bch2_print_string_as_lines(const char *, const char *);
typedef DARRAY(unsigned long) bch_stacktrace;
int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
@@ -401,11 +404,25 @@ do { \
_ret; \
})
-size_t bch2_rand_range(size_t);
+u64 bch2_get_random_u64_below(u64);
void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_corrupt_bio(struct bio *);
+
+static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
+{
+ if (ratio && !get_random_u32_below(ratio))
+ bch2_corrupt_bio(bio);
+}
+#else
+#define bch2_maybe_corrupt_bio(...) do {} while (0)
+#endif
+
+void bch2_bio_to_text(struct printbuf *, struct bio *);
+
static inline void memcpy_u64s_small(void *dst, const void *src,
unsigned u64s)
{
@@ -419,7 +436,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src,
static inline void __memcpy_u64s(void *dst, const void *src,
unsigned u64s)
{
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;
asm volatile("rep ; movsq"
@@ -496,7 +513,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
u64 *dst = (u64 *) _dst + u64s - 1;
u64 *src = (u64 *) _src + u64s - 1;
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN)
long d0, d1, d2;
asm volatile("std ;\n"
@@ -609,7 +626,7 @@ do { \
#define per_cpu_sum(_p) \
({ \
- typeof(*_p) _ret = 0; \
+ TYPEOF_UNQUAL(*_p) _ret = 0; \
\
int cpu; \
for_each_possible_cpu(cpu) \
@@ -656,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
-#define cmp_int(l, r) ((l > r) - (l < r))
-
static inline int u8_cmp(u8 l, u8 r)
{
return cmp_int(l, r);
@@ -670,15 +685,13 @@ static inline int cmp_le32(__le32 l, __le32 r)
#include <linux/uuid.h>
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
static inline bool qstr_eq(const struct qstr l, const struct qstr r)
{
return l.len == r.len && !memcmp(l.name, r.name, l.len);
}
-void bch2_darray_str_exit(darray_str *);
-int bch2_split_devs(const char *, darray_str *);
+void bch2_darray_str_exit(darray_const_str *);
+int bch2_split_devs(const char *, darray_const_str *);
#ifdef __KERNEL__
@@ -728,4 +741,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len)
*--dst = *src++;
}
+#define set_flags(_map, _in, _out) \
+do { \
+ unsigned _i; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & (1 << _i)) \
+ (_out) |= _map[_i]; \
+ else \
+ (_out) &= ~_map[_i]; \
+} while (0)
+
+#define map_flags(_map, _in) \
+({ \
+ unsigned _out = 0; \
+ \
+ set_flags(_map, _in, _out); \
+ _out; \
+})
+
+#define map_flags_rev(_map, _in) \
+({ \
+ unsigned _i, _out = 0; \
+ \
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
+ if ((_in) & _map[_i]) { \
+ (_out) |= 1 << _i; \
+ (_in) &= ~_map[_i]; \
+ } \
+ (_out); \
+})
+
+#define map_defined(_map) \
+({ \
+ unsigned _in = ~0; \
+ \
+ map_flags_rev(_map, _in); \
+})
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index aed7c6984173..627f153798c6 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
return bch2_xattr_hash(info,
- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
+ &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
}
static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
@@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
return l.v->x_type != r->type ||
l.v->x_name_len != r->name.len ||
- memcmp(l.v->x_name, r->name.name, r->name.len);
+ memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
}
static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
@@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
return l.v->x_type != r.v->x_type ||
l.v->x_name_len != r.v->x_name_len ||
- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
+ memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
}
const struct bch_hash_desc bch2_xattr_hash_desc = {
@@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
c, xattr_invalid_type,
"invalid type (%u)", xattr.v->x_type);
- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
+ bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
c, xattr_name_invalid_chars,
"xattr name has invalid characters");
fsck_err:
@@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
unsigned name_len = xattr.v->x_name_len;
unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
- offsetof(struct bch_xattr, x_name);
+ offsetof(struct bch_xattr, x_name_and_value);
val_len = min_t(int, val_len, max_name_val_bytes - name_len);
name_len = min(name_len, max_name_val_bytes);
prt_printf(out, "%.*s:%.*s",
- name_len, xattr.v->x_name,
+ name_len, xattr.v->x_name_and_value,
val_len, (char *) xattr_val(xattr.v));
if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
int type, int flags)
{
struct bch_fs *c = trans->c;
- struct btree_iter inode_iter = { NULL };
+ struct btree_iter inode_iter = {};
int ret;
ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
@@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
if (ret)
return ret;
+ /*
+ * Besides the ctime update, extents, dirents and xattrs updates require
+ * that an inode update also happens - to ensure that if a key exists in
+ * one of those btrees with a given snapshot ID an inode is also present
+ */
inode_u->bi_ctime = bch2_current_time(c);
ret = bch2_inode_write(trans, &inode_iter, inode_u);
@@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
xattr->v.x_type = type;
xattr->v.x_name_len = namelen;
xattr->v.x_val_len = cpu_to_le16(size);
- memcpy(xattr->v.x_name, name, namelen);
+ memcpy(xattr->v.x_name_and_value, name, namelen);
memcpy(xattr_val(&xattr->v), value, size);
ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
@@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry,
if (!prefix)
return 0;
- return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
+ return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
}
static int bch2_xattr_list_bcachefs(struct bch_fs *c,
@@ -473,6 +478,12 @@ static int inode_opt_set_fn(struct btree_trans *trans,
{
struct inode_opt_set *s = p;
+ if (s->id == Inode_opt_casefold) {
+ int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v);
+ if (ret)
+ return ret;
+ }
+
if (s->defined)
bi->bi_fields_set |= 1U << s->id;
else
@@ -523,7 +534,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
if (ret < 0)
goto err_class_exit;
- ret = bch2_opt_check_may_set(c, opt_id, v);
+ ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
if (ret < 0)
goto err_class_exit;
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
index 132fbbd15a66..1139bf345f70 100644
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h
@@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
{
- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
name_len + val_len, sizeof(u64));
}
#define xattr_val(_xattr) \
- ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
+ ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
struct xattr_search_key {
u8 type;
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
index c7916011ef34..4121b78d9a92 100644
--- a/fs/bcachefs/xattr_format.h
+++ b/fs/bcachefs/xattr_format.h
@@ -13,7 +13,13 @@ struct bch_xattr {
__u8 x_type;
__u8 x_name_len;
__le16 x_val_len;
- __u8 x_name[] __counted_by(x_name_len);
+ /*
+ * x_name contains the name and value counted by
+ * x_name_len + x_val_len. The introduction of
+ * __counted_by(x_name_len) previously caused a false positive
+ * detection of an out of bounds write.
+ */
+ __u8 x_name_and_value[];
} __packed __aligned(8);
#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index fa66a09e496a..d33d6bde992b 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -27,7 +27,7 @@ const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
};
@@ -170,9 +170,10 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
truncate_pagecache(inode, inode->i_size);
}
-static int bfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int bfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index db81570c9637..1d41ce477df5 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -17,6 +17,7 @@
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/fs_context.h>
#include "bfs.h"
MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
@@ -305,7 +306,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s)
#endif
}
-static int bfs_fill_super(struct super_block *s, void *data, int silent)
+static int bfs_fill_super(struct super_block *s, struct fs_context *fc)
{
struct buffer_head *bh, *sbh;
struct bfs_super_block *bfs_sb;
@@ -314,6 +315,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
struct bfs_sb_info *info;
int ret = -EINVAL;
unsigned long i_sblock, i_eblock, i_eoff, s_size;
+ int silent = fc->sb_flags & SB_SILENT;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info)
@@ -446,18 +448,28 @@ out:
return ret;
}
-static struct dentry *bfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int bfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super);
+ return get_tree_bdev(fc, bfs_fill_super);
+}
+
+static const struct fs_context_operations bfs_context_ops = {
+ .get_tree = bfs_get_tree,
+};
+
+static int bfs_init_fs_context(struct fs_context *fc)
+{
+ fc->ops = &bfs_context_ops;
+
+ return 0;
}
static struct file_system_type bfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "bfs",
- .mount = bfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "bfs",
+ .init_fs_context = bfs_init_fs_context,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("bfs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 8054f44d39cf..264fba0d44bd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -68,12 +68,6 @@
static int load_elf_binary(struct linux_binprm *bprm);
-#ifdef CONFIG_USELIB
-static int load_elf_library(struct file *);
-#else
-#define load_elf_library NULL
-#endif
-
/*
* If we don't support core dumping, then supply a NULL so we
* don't even try.
@@ -101,7 +95,6 @@ static int elf_core_dump(struct coredump_params *cprm);
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
- .load_shlib = load_elf_library,
#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
@@ -526,7 +519,7 @@ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex,
/* Sanity check the number of program headers... */
/* ...and their total size. */
size = sizeof(struct elf_phdr) * elf_ex->e_phnum;
- if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN)
+ if (size == 0 || size > 65536)
goto out;
elf_phdata = kmalloc(size, GFP_KERNEL);
@@ -653,7 +646,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
if (!elf_check_arch(interp_elf_ex) ||
elf_check_fdpic(interp_elf_ex))
goto out;
- if (!interpreter->f_op->mmap)
+ if (!can_mmap_file(interpreter))
goto out;
total_size = total_mapping_size(interp_elf_phdata,
@@ -762,8 +755,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz,
}
#define NOTE_DATA_SZ SZ_1K
-#define GNU_PROPERTY_TYPE_0_NAME "GNU"
-#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME))
+#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0))
static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
struct arch_elf_state *arch)
@@ -800,7 +792,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr,
if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 ||
note.nhdr.n_namesz != NOTE_NAME_SZ ||
strncmp(note.data + sizeof(note.nhdr),
- GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr)))
+ NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr)))
return -ENOEXEC;
off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ,
@@ -831,6 +823,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
unsigned long elf_brk;
+ bool brk_moved = false;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
@@ -855,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
if (elf_check_fdpic(elf_ex))
goto out;
- if (!bprm->file->f_op->mmap)
+ if (!can_mmap_file(bprm->file))
goto out;
elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
@@ -1098,15 +1091,19 @@ out_free_interp:
/* Calculate any requested alignment. */
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
- /*
- * There are effectively two types of ET_DYN
- * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP)
- * and loaders (ET_DYN without PT_INTERP, since they
- * _are_ the ELF interpreter). The loaders must
- * be loaded away from programs since the program
- * may otherwise collide with the loader (especially
- * for ET_EXEC which does not have a randomized
- * position). For example to handle invocations of
+ /**
+ * DOC: PIE handling
+ *
+ * There are effectively two types of ET_DYN ELF
+ * binaries: programs (i.e. PIE: ET_DYN with
+ * PT_INTERP) and loaders (i.e. static PIE: ET_DYN
+ * without PT_INTERP, usually the ELF interpreter
+ * itself). Loaders must be loaded away from programs
+ * since the program may otherwise collide with the
+ * loader (especially for ET_EXEC which does not have
+ * a randomized position).
+ *
+ * For example, to handle invocations of
* "./ld.so someprog" to test out a new version of
* the loader, the subsequent program that the
* loader loads must avoid the loader itself, so
@@ -1119,6 +1116,9 @@ out_free_interp:
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
+ *
+ * See below for "brk" handling details, which is
+ * also affected by program vs loader and ASLR.
*/
if (interpreter) {
/* On ET_DYN with PT_INTERP, we do the ASLR. */
@@ -1235,8 +1235,6 @@ out_free_interp:
start_data += load_bias;
end_data += load_bias;
- current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
-
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
@@ -1292,27 +1290,44 @@ out_free_interp:
mm->end_data = end_data;
mm->start_stack = bprm->p;
- if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) {
+ /**
+ * DOC: "brk" handling
+ *
+ * For architectures with ELF randomization, when executing a
+ * loader directly (i.e. static PIE: ET_DYN without PT_INTERP),
+ * move the brk area out of the mmap region and into the unused
+ * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide
+ * early with the stack growing down or other regions being put
+ * into the mmap region by the kernel (e.g. vdso).
+ *
+ * In the CONFIG_COMPAT_BRK case, though, everything is turned
+ * off because we're not allowed to move the brk at all.
+ */
+ if (!IS_ENABLED(CONFIG_COMPAT_BRK) &&
+ IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
+ elf_ex->e_type == ET_DYN && !interpreter) {
+ elf_brk = ELF_ET_DYN_BASE;
+ /* This counts as moving the brk, so let brk(2) know. */
+ brk_moved = true;
+ }
+ mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk);
+
+ if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) {
/*
- * For architectures with ELF randomization, when executing
- * a loader directly (i.e. no interpreter listed in ELF
- * headers), move the brk area out of the mmap region
- * (since it grows up, and may collide early with the stack
- * growing down), and into the unused ELF_ET_DYN_BASE region.
+ * If we didn't move the brk to ELF_ET_DYN_BASE (above),
+ * leave a gap between .bss and brk.
*/
- if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
- elf_ex->e_type == ET_DYN && !interpreter) {
- mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
- } else {
- /* Otherwise leave a gap between .bss and brk. */
+ if (!brk_moved)
mm->brk = mm->start_brk = mm->brk + PAGE_SIZE;
- }
mm->brk = mm->start_brk = arch_randomize_brk(mm);
+ brk_moved = true;
+ }
+
#ifdef compat_brk_randomized
+ if (brk_moved)
current->brk_randomized = 1;
#endif
- }
if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
@@ -1362,75 +1377,6 @@ out_free_ph:
goto out;
}
-#ifdef CONFIG_USELIB
-/* This is really simpleminded and specialized - we are loading an
- a.out library that is given an ELF header. */
-static int load_elf_library(struct file *file)
-{
- struct elf_phdr *elf_phdata;
- struct elf_phdr *eppnt;
- int retval, error, i, j;
- struct elfhdr elf_ex;
-
- error = -ENOEXEC;
- retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0);
- if (retval < 0)
- goto out;
-
- if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
- goto out;
-
- /* First of all, some simple consistency checks */
- if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
- !elf_check_arch(&elf_ex) || !file->f_op->mmap)
- goto out;
- if (elf_check_fdpic(&elf_ex))
- goto out;
-
- /* Now read in all of the header information */
-
- j = sizeof(struct elf_phdr) * elf_ex.e_phnum;
- /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */
-
- error = -ENOMEM;
- elf_phdata = kmalloc(j, GFP_KERNEL);
- if (!elf_phdata)
- goto out;
-
- eppnt = elf_phdata;
- error = -ENOEXEC;
- retval = elf_read(file, eppnt, j, elf_ex.e_phoff);
- if (retval < 0)
- goto out_free_ph;
-
- for (j = 0, i = 0; i<elf_ex.e_phnum; i++)
- if ((eppnt + i)->p_type == PT_LOAD)
- j++;
- if (j != 1)
- goto out_free_ph;
-
- while (eppnt->p_type != PT_LOAD)
- eppnt++;
-
- /* Now use mmap to map the library into memory. */
- error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr),
- eppnt,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FIXED_NOREPLACE | MAP_PRIVATE,
- 0);
-
- if (error != ELF_PAGESTART(eppnt->p_vaddr))
- goto out_free_ph;
-
- error = 0;
-
-out_free_ph:
- kfree(elf_phdata);
-out:
- return error;
-}
-#endif /* #ifdef CONFIG_USELIB */
-
#ifdef CONFIG_ELF_CORE
/*
* ELF core dumper
@@ -1504,8 +1450,8 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
phdr->p_align = 4;
}
-static void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static void __fill_note(struct memelfnote *note, const char *name, int type,
+ unsigned int sz, void *data)
{
note->name = name;
note->type = type;
@@ -1513,6 +1459,9 @@ static void fill_note(struct memelfnote *note, const char *name, int type,
note->data = data;
}
+#define fill_note(note, type, sz, data) \
+ __fill_note(note, NN_ ## type, NT_ ## type, sz, data)
+
/*
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
@@ -1603,14 +1552,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(note, AUXV, i * sizeof(elf_addr_t), auxv);
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
const kernel_siginfo_t *siginfo)
{
copy_siginfo_to_external(csigdata, siginfo);
- fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata);
+ fill_note(note, SIGINFO, sizeof(*csigdata), csigdata);
}
/*
@@ -1706,7 +1655,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm
}
size = name_curpos - (char *)data;
- fill_note(note, "CORE", NT_FILE, size, data);
+ fill_note(note, FILE, size, data);
return 0;
}
@@ -1767,8 +1716,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
regset_get(t->task, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
- PRSTATUS_SIZE, &t->prstatus);
+ fill_note(&t->notes[0], PRSTATUS, PRSTATUS_SIZE, &t->prstatus);
info->size += notesize(&t->notes[0]);
do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1781,6 +1729,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
for (view_iter = 1; view_iter < view->n; ++view_iter) {
const struct user_regset *regset = &view->regsets[view_iter];
int note_type = regset->core_note_type;
+ const char *note_name = regset->core_note_name;
bool is_fpreg = note_type == NT_PRFPREG;
void *data;
int ret;
@@ -1801,8 +1750,16 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
if (is_fpreg)
SET_PR_FPVALID(&t->prstatus);
- fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX",
- note_type, ret, data);
+ /* There should be a note name, but if not, guess: */
+ if (WARN_ON_ONCE(!note_name))
+ note_name = "LINUX";
+ else
+ /* Warn on non-legacy-compatible names, for now. */
+ WARN_ON_ONCE(strcmp(note_name,
+ is_fpreg ? "CORE" : "LINUX"));
+
+ __fill_note(&t->notes[note_iter], note_name, note_type,
+ ret, data);
info->size += notesize(&t->notes[note_iter]);
note_iter++;
@@ -1821,8 +1778,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
fill_prstatus(&t->prstatus.common, p, signr);
elf_core_copy_task_regs(p, &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
- &(t->prstatus));
+ fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus);
info->size += notesize(&t->notes[0]);
fpu = kzalloc(sizeof(elf_fpregset_t), GFP_KERNEL);
@@ -1832,7 +1788,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
}
t->prstatus.pr_fpvalid = 1;
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
+ fill_note(&t->notes[1], PRFPREG, sizeof(*fpu), fpu);
info->size += notesize(&t->notes[1]);
return 1;
@@ -1852,7 +1808,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL);
if (!psinfo)
return 0;
- fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&info->psinfo, PRPSINFO, sizeof(*psinfo), psinfo);
#ifdef CORE_DUMP_USE_REGSET
view = task_user_regset_view(dump_task);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c13ee8180b17..48fd2de3bca0 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -109,7 +109,7 @@ static int is_elf(struct elfhdr *hdr, struct file *file)
return 0;
if (!elf_check_arch(hdr))
return 0;
- if (!file->f_op->mmap)
+ if (!can_mmap_file(file))
return 0;
return 1;
}
@@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
/* deal with each load segment separately */
phdr = params->phdrs;
for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) {
- unsigned long maddr, disp, excess, excess1;
+ unsigned long maddr, disp, excess;
int prot = 0, flags;
if (phdr->p_type != PT_LOAD)
@@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
* extant in the file
*/
excess = phdr->p_memsz - phdr->p_filesz;
- excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
#ifdef CONFIG_MMU
+ unsigned long excess1
+ = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK);
if (excess > excess1) {
unsigned long xaddr = maddr + phdr->p_filesz + excess1;
unsigned long xmaddr;
@@ -1274,8 +1275,8 @@ static inline void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offs
return;
}
-static inline void fill_note(struct memelfnote *note, const char *name, int type,
- unsigned int sz, void *data)
+static inline void __fill_note(struct memelfnote *note, const char *name, int type,
+ unsigned int sz, void *data)
{
note->name = name;
note->type = type;
@@ -1284,6 +1285,9 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type
return;
}
+#define fill_note(note, type, sz, data) \
+ __fill_note(note, NN_ ## type, NT_ ## type, sz, data)
+
/*
* fill up all the fields in prstatus from the given task struct, except
* registers which need to be filled up separately.
@@ -1397,8 +1401,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
regset_get(p, &view->regsets[0],
sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg);
- fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus),
- &t->prstatus);
+ fill_note(&t->notes[0], PRSTATUS, sizeof(t->prstatus), &t->prstatus);
t->num_notes++;
*sz += notesize(&t->notes[0]);
@@ -1415,8 +1418,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_
}
if (t->prstatus.pr_fpvalid) {
- fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu),
- &t->fpu);
+ fill_note(&t->notes[1], PRFPREG, sizeof(t->fpu), &t->fpu);
t->num_notes++;
*sz += notesize(&t->notes[1]);
}
@@ -1530,7 +1532,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
*/
fill_psinfo(psinfo, current->group_leader, current->mm);
- fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo);
+ fill_note(&psinfo_note, PRPSINFO, sizeof(*psinfo), psinfo);
thread_status_size += notesize(&psinfo_note);
auxv = (elf_addr_t *) current->mm->saved_auxv;
@@ -1538,7 +1540,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
do
i += 2;
while (auxv[i - 2] != AT_NULL);
- fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv);
+ fill_note(&auxv_note, AUXV, i * sizeof(elf_addr_t), auxv);
thread_status_size += notesize(&auxv_note);
offset = sizeof(*elf); /* ELF header */
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 5a7ebd160724..a839f960cd4a 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -675,44 +675,6 @@ static void bm_evict_inode(struct inode *inode)
}
/**
- * unlink_binfmt_dentry - remove the dentry for the binary type handler
- * @dentry: dentry associated with the binary type handler
- *
- * Do the actual filesystem work to remove a dentry for a registered binary
- * type handler. Since binfmt_misc only allows simple files to be created
- * directly under the root dentry of the filesystem we ensure that we are
- * indeed passed a dentry directly beneath the root dentry, that the inode
- * associated with the root dentry is locked, and that it is a regular file we
- * are asked to remove.
- */
-static void unlink_binfmt_dentry(struct dentry *dentry)
-{
- struct dentry *parent = dentry->d_parent;
- struct inode *inode, *parent_inode;
-
- /* All entries are immediate descendants of the root dentry. */
- if (WARN_ON_ONCE(dentry->d_sb->s_root != parent))
- return;
-
- /* We only expect to be called on regular files. */
- inode = d_inode(dentry);
- if (WARN_ON_ONCE(!S_ISREG(inode->i_mode)))
- return;
-
- /* The parent inode must be locked. */
- parent_inode = d_inode(parent);
- if (WARN_ON_ONCE(!inode_is_locked(parent_inode)))
- return;
-
- if (simple_positive(dentry)) {
- dget(dentry);
- simple_unlink(parent_inode, dentry);
- d_delete(dentry);
- dput(dentry);
- }
-}
-
-/**
* remove_binfmt_handler - remove a binary type handler
* @misc: handle to binfmt_misc instance
* @e: binary type handler to remove
@@ -729,7 +691,7 @@ static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
write_lock(&misc->entries_lock);
list_del_init(&e->list);
write_unlock(&misc->entries_lock);
- unlink_binfmt_dentry(e->dentry);
+ locked_recursive_removal(e->dentry, NULL);
}
/* /<entry> */
@@ -772,7 +734,7 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
case 3:
/* Delete this handler. */
inode = d_inode(inode->i_sb->s_root);
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
/*
* In order to add new element or remove elements from the list
@@ -842,7 +804,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
}
inode_lock(d_inode(root));
- dentry = lookup_one_len(e->name, root, strlen(e->name));
+ dentry = lookup_noperm(&QSTR(e->name), root);
err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out;
@@ -922,7 +884,7 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
case 3:
/* Delete all handlers. */
inode = d_inode(file_inode(file)->i_sb->s_root);
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_PARENT);
/*
* In order to add new element or remove elements from the list
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 3fe9f59ef867..1e36a12b88f7 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -2,11 +2,14 @@
/* Copyright (c) 2024 Google LLC. */
#include <linux/bpf.h>
+#include <linux/bpf_lsm.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/fs.h>
+#include <linux/fsnotify.h>
#include <linux/file.h>
+#include <linux/kernfs.h>
#include <linux/mm.h>
#include <linux/xattr.h>
@@ -93,6 +96,24 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
return len;
}
+static bool match_security_bpf_prefix(const char *name__str)
+{
+ return !strncmp(name__str, XATTR_NAME_BPF_LSM, XATTR_NAME_BPF_LSM_LEN);
+}
+
+static int bpf_xattr_read_permission(const char *name, struct inode *inode)
+{
+ if (WARN_ON(!inode))
+ return -EINVAL;
+
+ /* Allow reading xattr with user. and security.bpf. prefix */
+ if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) &&
+ !match_security_bpf_prefix(name))
+ return -EPERM;
+
+ return inode_permission(&nop_mnt_idmap, inode, MAY_READ);
+}
+
/**
* bpf_get_dentry_xattr - get xattr of a dentry
* @dentry: dentry to get xattr from
@@ -101,9 +122,10 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz)
*
* Get xattr *name__str* of *dentry* and store the output in *value_ptr*.
*
- * For security reasons, only *name__str* with prefix "user." is allowed.
+ * For security reasons, only *name__str* with prefixes "user." or
+ * "security.bpf." are allowed.
*
- * Return: 0 on success, a negative value on error.
+ * Return: length of the xattr value on success, a negative value on error.
*/
__bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str,
struct bpf_dynptr *value_p)
@@ -114,18 +136,12 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st
void *value;
int ret;
- if (WARN_ON(!inode))
- return -EINVAL;
-
- if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EPERM;
-
value_len = __bpf_dynptr_size(value_ptr);
value = __bpf_dynptr_data_rw(value_ptr, value_len);
if (!value)
return -EINVAL;
- ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
+ ret = bpf_xattr_read_permission(name__str, inode);
if (ret)
return ret;
return __vfs_getxattr(dentry, inode, name__str, value, value_len);
@@ -139,9 +155,10 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st
*
* Get xattr *name__str* of *file* and store the output in *value_ptr*.
*
- * For security reasons, only *name__str* with prefix "user." is allowed.
+ * For security reasons, only *name__str* with prefixes "user." or
+ * "security.bpf." are allowed.
*
- * Return: 0 on success, a negative value on error.
+ * Return: length of the xattr value on success, a negative value on error.
*/
__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
struct bpf_dynptr *value_p)
@@ -154,6 +171,193 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
__bpf_kfunc_end_defs();
+static int bpf_xattr_write_permission(const char *name, struct inode *inode)
+{
+ if (WARN_ON(!inode))
+ return -EINVAL;
+
+ /* Only allow setting and removing security.bpf. xattrs */
+ if (!match_security_bpf_prefix(name))
+ return -EPERM;
+
+ return inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
+}
+
+/**
+ * bpf_set_dentry_xattr_locked - set a xattr of a dentry
+ * @dentry: dentry to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: xattr value
+ * @flags: flags to pass into filesystem operations
+ *
+ * Set xattr *name__str* of *dentry* to the value in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller already locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str,
+ const struct bpf_dynptr *value_p, int flags)
+{
+
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ struct inode *inode = d_inode(dentry);
+ const void *value;
+ u32 value_len;
+ int ret;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ ret = bpf_xattr_write_permission(name__str, inode);
+ if (ret)
+ return ret;
+
+ ret = __vfs_setxattr(&nop_mnt_idmap, dentry, inode, name__str,
+ value, value_len, flags);
+ if (!ret) {
+ fsnotify_xattr(dentry);
+
+ /* This xattr is set by BPF LSM, so we do not call
+ * security_inode_post_setxattr. Otherwise, we would
+ * risk deadlocks by calling back to the same kfunc.
+ *
+ * This is the same as security_inode_setsecurity().
+ */
+ }
+ return ret;
+}
+
+/**
+ * bpf_remove_dentry_xattr_locked - remove a xattr of a dentry
+ * @dentry: dentry to get xattr from
+ * @name__str: name of the xattr
+ *
+ * Rmove xattr *name__str* of *dentry*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller already locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ ret = bpf_xattr_write_permission(name__str, inode);
+ if (ret)
+ return ret;
+
+ ret = __vfs_removexattr(&nop_mnt_idmap, dentry, name__str);
+ if (!ret) {
+ fsnotify_xattr(dentry);
+
+ /* This xattr is removed by BPF LSM, so we do not call
+ * security_inode_post_removexattr. Otherwise, we would
+ * risk deadlocks by calling back to the same kfunc.
+ */
+ }
+ return ret;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_set_dentry_xattr - set a xattr of a dentry
+ * @dentry: dentry to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: xattr value
+ * @flags: flags to pass into filesystem operations
+ *
+ * Set xattr *name__str* of *dentry* to the value in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller has not locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str,
+ const struct bpf_dynptr *value_p, int flags)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ inode_lock(inode);
+ ret = bpf_set_dentry_xattr_locked(dentry, name__str, value_p, flags);
+ inode_unlock(inode);
+ return ret;
+}
+
+/**
+ * bpf_remove_dentry_xattr - remove a xattr of a dentry
+ * @dentry: dentry to get xattr from
+ * @name__str: name of the xattr
+ *
+ * Rmove xattr *name__str* of *dentry*.
+ *
+ * For security reasons, only *name__str* with prefix "security.bpf."
+ * is allowed.
+ *
+ * The caller has not locked dentry->d_inode.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name__str)
+{
+ struct inode *inode = d_inode(dentry);
+ int ret;
+
+ inode_lock(inode);
+ ret = bpf_remove_dentry_xattr_locked(dentry, name__str);
+ inode_unlock(inode);
+ return ret;
+}
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_read_xattr - read xattr of a cgroup's node in cgroupfs
+ * @cgroup: cgroup to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *cgroup* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: length of the xattr value on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
+ struct bpf_dynptr *value_p)
+{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ u32 value_len;
+ void *value;
+
+ /* Only allow reading "user.*" xattrs */
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ return kernfs_xattr_get(cgroup->kn, name__str, value, value_len);
+}
+#endif /* CONFIG_CGROUPS */
+
+__bpf_kfunc_end_defs();
+
BTF_KFUNCS_START(bpf_fs_kfunc_set_ids)
BTF_ID_FLAGS(func, bpf_get_task_exe_file,
KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
@@ -161,6 +365,8 @@ BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_fs_kfunc_set_ids)
static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
@@ -171,6 +377,37 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
return -EACCES;
}
+/* bpf_[set|remove]_dentry_xattr.* hooks have KF_TRUSTED_ARGS and
+ * KF_SLEEPABLE, so they are only available to sleepable hooks with
+ * dentry arguments.
+ *
+ * Setting and removing xattr requires exclusive lock on dentry->d_inode.
+ * Some hooks already locked d_inode, while some hooks have not locked
+ * d_inode. Therefore, we need different kfuncs for different hooks.
+ * Specifically, hooks in the following list (d_inode_locked_hooks)
+ * should call bpf_[set|remove]_dentry_xattr_locked; while other hooks
+ * should call bpf_[set|remove]_dentry_xattr.
+ */
+BTF_SET_START(d_inode_locked_hooks)
+BTF_ID(func, bpf_lsm_inode_post_removexattr)
+BTF_ID(func, bpf_lsm_inode_post_setattr)
+BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_removexattr)
+BTF_ID(func, bpf_lsm_inode_rmdir)
+BTF_ID(func, bpf_lsm_inode_setattr)
+BTF_ID(func, bpf_lsm_inode_setxattr)
+BTF_ID(func, bpf_lsm_inode_unlink)
+#ifdef CONFIG_SECURITY_PATH
+BTF_ID(func, bpf_lsm_path_unlink)
+BTF_ID(func, bpf_lsm_path_rmdir)
+#endif /* CONFIG_SECURITY_PATH */
+BTF_SET_END(d_inode_locked_hooks)
+
+bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog)
+{
+ return btf_id_set_contains(&d_inode_locked_hooks, prog->aux->attach_btf_id);
+}
+
static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
.owner = THIS_MODULE,
.set = &bpf_fs_kfunc_set_ids,
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index fa8515598341..ea95c90c8474 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -3,9 +3,9 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
select BLK_CGROUP_PUNT_BIO
+ select CRC32
select CRYPTO
select CRYPTO_CRC32C
- select LIBCRC32C
select CRYPTO_XXHASH
select CRYPTO_SHA256
select CRYPTO_BLAKE2B
@@ -52,10 +52,10 @@ config BTRFS_FS_RUN_SANITY_TESTS
bool "Btrfs will run sanity tests upon loading"
depends on BTRFS_FS
help
- This will run some basic sanity tests on the free space cache
- code to make sure it is acting as it should. These are mostly
- regression tests and are only really interesting to btrfs
- developers.
+ This will run sanity tests for core functionality like free space,
+ extent maps, extent io, extent buffers, inodes, qgroups and others,
+ at module load time. These are mostly regression tests and are only
+ interesting to developers.
If unsure, say N.
@@ -63,9 +63,12 @@ config BTRFS_DEBUG
bool "Btrfs debugging support"
depends on BTRFS_FS
help
- Enable run-time debugging support for the btrfs filesystem. This may
- enable additional and expensive checks with negative impact on
- performance, or export extra information via sysfs.
+ Enable run-time debugging support for the btrfs filesystem.
+
+ Additional potentially expensive checks, debugging functionality or
+ sysfs exported information is enabled, like leak checks of internal
+ objects, optional forced space fragmentation and /sys/fs/btrfs/debug .
+ This has negative impact on performance.
If unsure, say N.
@@ -73,8 +76,10 @@ config BTRFS_ASSERT
bool "Btrfs assert support"
depends on BTRFS_FS
help
- Enable run-time assertion checking. This will result in panics if
- any of the assertions trip. This is meant for btrfs developers only.
+ Enable run-time assertion checking. Additional safety checks are
+ done, simple enough not to affect performance but verify invariants
+ and assumptions of code to run properly. This may result in panics,
+ and is meant for developers but can be enabled in general.
If unsure, say N.
@@ -89,7 +94,14 @@ config BTRFS_EXPERIMENTAL
Current list:
- - extent map shrinker - performance problems with too frequent shrinks
+ - COW fixup worker warning - last warning before removing the
+ functionality catching out-of-band page
+ dirtying, not necessary since 5.8
+
+ - RAID mirror read policy - additional read policies for balancing
+ reading from redundant block group
+ profiles (currently: pid, round-robin,
+ fixed devid)
- send stream protocol v3 - fs-verity support
@@ -102,6 +114,8 @@ config BTRFS_EXPERIMENTAL
- extent tree v2 - complex rework of extent tracking
+ - large folio support
+
If unsure, say N.
config BTRFS_FS_REF_VERIFY
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index e3716516ca38..861c7d92c437 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -9,27 +9,24 @@
#include "fs.h"
#include "accessors.h"
-static bool check_setget_bounds(const struct extent_buffer *eb,
- const void *ptr, unsigned off, int size)
+static void __cold report_setget_bounds(const struct extent_buffer *eb,
+ const void *ptr, unsigned off, int size)
{
- const unsigned long member_offset = (unsigned long)ptr + off;
+ unsigned long member_offset = (unsigned long)ptr + off;
- if (unlikely(member_offset + size > eb->len)) {
- btrfs_warn(eb->fs_info,
- "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
- (member_offset > eb->len ? "start" : "end"),
- (unsigned long)ptr, eb->start, member_offset, size);
- return false;
- }
-
- return true;
+ btrfs_warn(eb->fs_info,
+ "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d",
+ (member_offset > eb->len ? "start" : "end"),
+ (unsigned long)ptr, eb->start, member_offset, size);
}
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
+/* Copy bytes from @src1 and @src2 to @dest. */
+static __always_inline void memcpy_split_src(char *dest, const char *src1,
+ const char *src2, const size_t len1,
+ const size_t total)
{
- token->eb = eb;
- token->kaddr = folio_address(eb->folios[0]);
- token->offset = 0;
+ memcpy(dest, src1, len1);
+ memcpy(dest + len1, src2, total - len1);
}
/*
@@ -41,11 +38,6 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* - btrfs_set_8 (for 8/16/32/64)
* - btrfs_get_8 (for 8/16/32/64)
*
- * Generic helpers with a token (cached address of the most recently accessed
- * page):
- * - btrfs_set_token_8 (for 8/16/32/64)
- * - btrfs_get_token_8 (for 8/16/32/64)
- *
* The set/get functions handle data spanning two pages transparently, in case
* metadata block size is larger than page. Every pointer to metadata items is
* an offset into the extent buffer page array, cast to a specific type. This
@@ -57,118 +49,66 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
*/
#define DEFINE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
- member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
- u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
- \
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- return get_unaligned_le##bits(token->kaddr + oil); \
- } \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(token->kaddr + oil); \
- \
- memcpy(lebytes, token->kaddr + oil, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(lebytes + part, token->kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
- member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
- u8 lebytes[sizeof(u##bits)]; \
- \
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
- return get_unaligned_le##bits(kaddr + oil); \
- \
- memcpy(lebytes, kaddr + oil, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(lebytes + part, kaddr, size - part); \
- return get_unaligned_le##bits(lebytes); \
-} \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val) \
-{ \
- const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
- const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = token->eb->folio_size; \
- const int unit_shift = token->eb->folio_shift; \
- const int size = sizeof(u##bits); \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
- const int part = unit_size - oil; \
\
- ASSERT(token); \
- ASSERT(token->kaddr); \
- ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
- if (token->offset <= member_offset && \
- member_offset + size <= token->offset + unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return 0; \
} \
- token->kaddr = folio_address(token->eb->folios[idx]); \
- token->offset = idx << unit_shift; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, token->kaddr + oil); \
- return; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) \
+ return get_unaligned_le##bits(kaddr); \
+ \
+ if (sizeof(u##bits) == 2) { \
+ lebytes[0] = *kaddr; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ lebytes[1] = *kaddr; \
+ } else { \
+ memcpy_split_src(lebytes, kaddr, \
+ folio_address(eb->folios[idx + 1]), \
+ part, sizeof(u##bits)); \
} \
- put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oil, lebytes, part); \
- token->kaddr = folio_address(token->eb->folios[idx + 1]); \
- token->offset = (idx + 1) << unit_shift; \
- memcpy(token->kaddr, lebytes + part, size - part); \
+ return get_unaligned_le##bits(lebytes); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
const unsigned long idx = get_eb_folio_index(eb, member_offset);\
- const unsigned long oil = get_eb_offset_in_folio(eb, \
+ const unsigned long oif = get_eb_offset_in_folio(eb, \
member_offset);\
- const int unit_size = eb->folio_size; \
- char *kaddr = folio_address(eb->folios[idx]); \
- const int size = sizeof(u##bits); \
- const int part = unit_size - oil; \
+ char *kaddr = folio_address(eb->folios[idx]) + oif; \
+ const int part = eb->folio_size - oif; \
u8 lebytes[sizeof(u##bits)]; \
\
- ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
- oil + size <= unit_size) { \
- put_unaligned_le##bits(val, kaddr + oil); \
+ if (unlikely(member_offset + sizeof(u##bits) > eb->len)) { \
+ report_setget_bounds(eb, ptr, off, sizeof(u##bits)); \
+ return; \
+ } \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || sizeof(u##bits) == 1 || \
+ likely(sizeof(u##bits) <= part)) { \
+ put_unaligned_le##bits(val, kaddr); \
return; \
} \
- \
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oil, lebytes, part); \
- kaddr = folio_address(eb->folios[idx + 1]); \
- memcpy(kaddr, lebytes + part, size - part); \
+ if (sizeof(u##bits) == 2) { \
+ *kaddr = lebytes[0]; \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ *kaddr = lebytes[1]; \
+ } else { \
+ memcpy(kaddr, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
+ memcpy(kaddr, lebytes + part, sizeof(u##bits) - part); \
+ } \
}
DEFINE_BTRFS_SETGET_BITS(8)
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 7a7e0ef69973..99b3ced12805 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -12,17 +12,10 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <uapi/linux/btrfs_tree.h>
+#include "extent_io.h"
struct extent_buffer;
-struct btrfs_map_token {
- struct extent_buffer *eb;
- char *kaddr;
- unsigned long offset;
-};
-
-void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb);
-
/*
* Some macros to generate set/get functions for the struct fields. This
* assumes there is a lefoo_to_cpu for every type, so lets make a simple one
@@ -55,11 +48,6 @@ static inline void put_unaligned_le8(u8 val, void *p)
sizeof_field(type, member)))
#define DECLARE_BTRFS_SETGET_BITS(bits) \
-u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off); \
-void btrfs_set_token_##bits(struct btrfs_map_token *token, \
- const void *ptr, unsigned long off, \
- u##bits val); \
u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off); \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
@@ -82,18 +70,6 @@ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \
{ \
static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
btrfs_set_##bits(eb, s, offsetof(type, member), val); \
-} \
-static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \
- const type *s) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- return btrfs_get_token_##bits(token, s, offsetof(type, member));\
-} \
-static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
- type *s, u##bits val) \
-{ \
- static_assert(sizeof(u##bits) == sizeof_field(type, member)); \
- btrfs_set_token_##bits(token, s, offsetof(type, member), val); \
}
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
@@ -478,18 +454,6 @@ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \
int slot, u32 val) \
{ \
btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \
-} \
-static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \
- int slot) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- return btrfs_token_raw_item_##member(token, item); \
-} \
-static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \
- int slot, u32 val) \
-{ \
- struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \
- btrfs_set_token_raw_item_##member(token, item, val); \
}
BTRFS_ITEM_SETGET_FUNCS(offset)
diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h
index 48b9ddae4a46..0458cd51ed48 100644
--- a/fs/btrfs/acl.h
+++ b/fs/btrfs/acl.h
@@ -3,6 +3,8 @@
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
+#include <linux/types.h>
+
struct posix_acl;
struct inode;
struct btrfs_trans_handle;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index a4c51600a408..6c6f3bb58f4e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
- int need_change = 0;
+ bool need_change = false;
if (wq->thresh == NO_THRESHOLD)
return;
@@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
new_current_active--;
new_current_active = clamp_val(new_current_active, 1, wq->limit_active);
if (new_current_active != wq->current_active) {
- need_change = 1;
+ need_change = true;
wq->current_active = new_current_active;
}
out:
spin_unlock(&wq->thres_lock);
- if (need_change) {
+ if (need_change)
workqueue_set_max_active(wq->normal_wq, wq->current_active);
- }
}
static void run_ordered_work(struct btrfs_workqueue *wq,
@@ -220,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq,
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
- work = list_entry(list->next, struct btrfs_work,
- ordered_list);
+ work = list_first_entry(list, struct btrfs_work, ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
/*
@@ -296,7 +294,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct btrfs_workqueue *wq = work->wq;
- int need_order = 0;
+ bool need_order = false;
/*
* We should not touch things inside work in the following cases:
@@ -307,7 +305,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
* So we save the needed things here.
*/
if (work->ordered_func)
- need_order = 1;
+ need_order = true;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 3d3923cfc357..6a450be293b1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -733,7 +733,6 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
struct preftrees *preftrees,
struct share_check *sc)
{
- int err;
int ret = 0;
struct ulist *parents;
struct ulist_node *node;
@@ -752,6 +751,7 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
*/
while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
+ int ret2;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
if (WARN(ref->parent,
@@ -773,18 +773,18 @@ static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx,
ret = BACKREF_FOUND_SHARED;
goto out;
}
- err = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
+ ret2 = resolve_indirect_ref(ctx, path, preftrees, ref, parents);
/*
* we can only tolerate ENOENT,otherwise,we should catch error
* and return directly.
*/
- if (err == -ENOENT) {
+ if (ret2 == -ENOENT) {
prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref,
NULL);
continue;
- } else if (err) {
+ } else if (ret2) {
free_pref(ref);
- ret = err;
+ ret = ret2;
goto out;
}
@@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx,
ASSERT(ctx->roots == NULL);
key.objectid = ctx->bytenr;
- key.offset = (u64)-1;
if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
path = btrfs_alloc_path();
if (!path)
@@ -2201,16 +2201,15 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
int ret;
u64 flags;
u64 size = 0;
- u32 item_size;
const struct extent_buffer *eb;
struct btrfs_extent_item *ei;
struct btrfs_key key;
+ key.objectid = logical;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = logical;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -2244,7 +2243,6 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
}
eb = path->nodes[0];
- item_size = btrfs_item_size(eb, path->slots[0]);
ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
flags = btrfs_extent_flags(eb, ei);
@@ -2252,7 +2250,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
btrfs_debug(fs_info,
"logical %llu is at position %llu within the extent (%llu EXTENT_ITEM %llu) flags %#llx size %u",
logical, logical - found_key->objectid, found_key->objectid,
- found_key->offset, flags, item_size);
+ found_key->offset, flags, btrfs_item_size(eb, path->slots[0]));
WARN_ON(!flags_ret);
if (flags_ret) {
@@ -2548,17 +2546,20 @@ static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *c
}
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
void *ctx, bool ignore_offset)
{
struct btrfs_backref_walk_ctx walk_ctx = { 0 };
int ret;
u64 flags = 0;
struct btrfs_key found_key;
- int search_commit_root = path->search_commit_root;
+ struct btrfs_path *path;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
- btrfs_release_path(path);
+ btrfs_free_path(path);
if (ret < 0)
return ret;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
@@ -2571,8 +2572,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
walk_ctx.extent_item_pos = logical - found_key.objectid;
walk_ctx.fs_info = fs_info;
- return iterate_extent_inodes(&walk_ctx, search_commit_root,
- build_ino_list, ctx);
+ return iterate_extent_inodes(&walk_ctx, false, build_ino_list, ctx);
}
static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
@@ -2877,7 +2877,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr)
goto release;
}
if (path->slots[0] == 0) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
ret = -EUCLEAN;
goto release;
}
@@ -3134,8 +3134,8 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache,
return;
while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next, struct btrfs_backref_edge,
- list[LOWER]);
+ edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
+ list[LOWER]);
list_del(&edge->list[LOWER]);
list_del(&edge->list[UPPER]);
btrfs_backref_free_edge(cache, edge);
@@ -3161,18 +3161,14 @@ void btrfs_backref_release_cache(struct btrfs_backref_cache *cache)
ASSERT(!cache->nr_edges);
}
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which)
+static void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
+ struct btrfs_backref_node *lower,
+ struct btrfs_backref_node *upper)
{
ASSERT(upper && lower && upper->level == lower->level + 1);
edge->node[LOWER] = lower;
edge->node[UPPER] = upper;
- if (link_which & LINK_LOWER)
- list_add_tail(&edge->list[LOWER], &lower->upper);
- if (link_which & LINK_UPPER)
- list_add_tail(&edge->list[UPPER], &upper->lower);
+ list_add_tail(&edge->list[LOWER], &lower->upper);
}
/*
* Handle direct tree backref
@@ -3242,7 +3238,7 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache,
ASSERT(upper->checked);
INIT_LIST_HEAD(&edge->list[UPPER]);
}
- btrfs_backref_link_edge(edge, cur, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, cur, upper);
return 0;
}
@@ -3412,7 +3408,7 @@ static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans,
if (!upper->owner)
upper->owner = btrfs_header_owner(eb);
}
- btrfs_backref_link_edge(edge, lower, upper, LINK_LOWER);
+ btrfs_backref_link_edge(edge, lower, upper);
if (rb_node) {
btrfs_put_root(root);
@@ -3473,8 +3469,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans,
* type BTRFS_TREE_BLOCK_REF_KEY
*/
ASSERT(list_is_singular(&cur->upper));
- edge = list_entry(cur->upper.next, struct btrfs_backref_edge,
- list[LOWER]);
+ edge = list_first_entry(&cur->upper, struct btrfs_backref_edge,
+ list[LOWER]);
ASSERT(list_empty(&edge->list[UPPER]));
exist = edge->node[UPPER];
/*
@@ -3570,7 +3566,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
ASSERT(start->checked);
- rb_node = rb_simple_insert(&cache->rb_root, start->bytenr, &start->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, &start->simple_node);
if (rb_node)
btrfs_backref_panic(cache->fs_info, start->bytenr, -EEXIST);
@@ -3617,12 +3613,11 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache,
/* Sanity check, we shouldn't have any unchecked nodes */
if (!upper->checked) {
- ASSERT(0);
+ DEBUG_WARN("we should not have any unchecked nodes");
return -EUCLEAN;
}
- rb_node = rb_simple_insert(&cache->rb_root, upper->bytenr,
- &upper->rb_node);
+ rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node);
if (unlikely(rb_node)) {
btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST);
return -EUCLEAN;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 74e614031274..34b0193a181c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -226,8 +226,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path, void *ctx,
- bool ignore_offset);
+ void *ctx, bool ignore_offset);
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
@@ -313,10 +312,15 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter);
* Represent a tree block in the backref cache
*/
struct btrfs_backref_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union{
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
/*
* This is a sanity check, whenever we COW a block we will update
@@ -423,13 +427,6 @@ struct btrfs_backref_node *btrfs_backref_alloc_node(
struct btrfs_backref_edge *btrfs_backref_alloc_edge(
struct btrfs_backref_cache *cache);
-#define LINK_LOWER (1 << 0)
-#define LINK_UPPER (1 << 1)
-
-void btrfs_backref_link_edge(struct btrfs_backref_edge *edge,
- struct btrfs_backref_node *lower,
- struct btrfs_backref_node *upper,
- int link_which);
void btrfs_backref_free_node(struct btrfs_backref_cache *cache,
struct btrfs_backref_node *node);
void btrfs_backref_free_edge(struct btrfs_backref_cache *cache,
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index bc2555c44a12..50b5fc1c06d7 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -27,12 +27,12 @@ struct btrfs_failed_bio {
};
/* Is this a data path I/O that needs storage layer checksum and repair? */
-static inline bool is_data_bbio(struct btrfs_bio *bbio)
+static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
return bbio->inode && is_data_inode(bbio->inode);
}
-static bool bbio_has_ordered_extent(struct btrfs_bio *bbio)
+static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}
@@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
return bbio;
}
-/* Free a bio that was never submitted to the underlying device. */
-static void btrfs_cleanup_bio(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio))
- btrfs_put_ordered_extent(bbio->ordered);
- bio_put(&bbio->bio);
-}
-
-static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
-{
- if (bbio_has_ordered_extent(bbio)) {
- struct btrfs_ordered_extent *ordered = bbio->ordered;
-
- bbio->end_io(bbio);
- btrfs_put_ordered_extent(ordered);
- } else {
- bbio->end_io(bbio);
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- btrfs_cleanup_bio(bbio);
+ /* Free bio that was never submitted to the underlying device. */
+ if (bbio_has_ordered_extent(bbio))
+ btrfs_put_ordered_extent(bbio->ordered);
+ bio_put(&bbio->bio);
+
bbio = orig_bbio;
}
@@ -138,18 +122,26 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
/* Load split bio's error which might be set above. */
if (status == BLK_STS_OK)
bbio->bio.bi_status = READ_ONCE(bbio->status);
- __btrfs_bio_end_io(bbio);
+
+ if (bbio_has_ordered_extent(bbio)) {
+ struct btrfs_ordered_extent *ordered = bbio->ordered;
+
+ bbio->end_io(bbio);
+ btrfs_put_ordered_extent(ordered);
+ } else {
+ bbio->end_io(bbio);
+ }
}
}
-static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == fbio->num_copies)
return cur_mirror + 1 - fbio->num_copies;
return cur_mirror + 1;
}
-static int prev_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
+static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
if (cur_mirror == 1)
return fbio->num_copies;
@@ -173,12 +165,6 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
- /*
- * We can only trigger this for data bio, which doesn't support larger
- * folios yet.
- */
- ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
-
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -200,7 +186,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- page_folio(bv->bv_page), bv->bv_offset, mirror);
+ bvec_phys(bv), mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -309,7 +295,7 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de
btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}
-static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
+static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
return;
@@ -324,8 +310,8 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}
-static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
- struct bio *bio)
+static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
+ const struct bio *bio)
{
if (bio->bi_opf & REQ_META)
return fs_info->endio_meta_workers;
@@ -447,7 +433,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
ASSERT(btrfs_dev_is_sequential(dev, physical));
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
}
- btrfs_debug_in_rcu(dev->fs_info,
+ btrfs_debug(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
@@ -520,7 +506,7 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
}
}
-static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio)
+static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
if (bbio->bio.bi_opf & REQ_META)
return btree_csum_one_bio(bbio);
@@ -551,11 +537,11 @@ static void run_one_async_start(struct btrfs_work *work)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
- blk_status_t ret;
+ int ret;
ret = btrfs_bio_csum(async->bbio);
if (ret)
- async->bbio->bio.bi_status = ret;
+ async->bbio->bio.bi_status = errno_to_blk_status(ret);
}
/*
@@ -581,7 +567,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free)
/* If an error occurred we just want to clean up the bio and move on. */
if (bio->bi_status) {
- btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status);
+ btrfs_bio_end_io(async->bbio, bio->bi_status);
return;
}
@@ -682,8 +668,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bool use_append = btrfs_use_zone_append(bbio);
struct btrfs_io_context *bioc = NULL;
struct btrfs_io_stripe smap;
- blk_status_t ret;
- int error;
+ blk_status_t status;
+ int ret;
if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
smap.rst_search_commit_root = true;
@@ -691,10 +677,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
smap.rst_search_commit_root = false;
btrfs_bio_counter_inc_blocked(fs_info);
- error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
- &bioc, &smap, &mirror_num);
- if (error) {
- ret = errno_to_blk_status(error);
+ ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+ &bioc, &smap, &mirror_num);
+ if (ret) {
+ status = errno_to_blk_status(ret);
btrfs_bio_counter_dec(fs_info);
goto end_bbio;
}
@@ -708,7 +694,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
split = btrfs_split_bio(fs_info, bbio, map_length);
if (IS_ERR(split)) {
- ret = errno_to_blk_status(PTR_ERR(split));
+ status = errno_to_blk_status(PTR_ERR(split));
btrfs_bio_counter_dec(fs_info);
goto end_bbio;
}
@@ -723,7 +709,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
bbio->saved_iter = bio->bi_iter;
ret = btrfs_lookup_bio_sums(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
}
@@ -756,13 +743,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
goto done;
ret = btrfs_bio_csum(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
} else if (use_append ||
(btrfs_is_zoned(fs_info) && inode &&
inode->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_alloc_dummy_sum(bbio);
- if (ret)
+ status = errno_to_blk_status(ret);
+ if (status)
goto fail;
}
}
@@ -783,10 +772,10 @@ fail:
ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
ASSERT(remaining);
- btrfs_bio_end_io(remaining, ret);
+ btrfs_bio_end_io(remaining, status);
}
end_bbio:
- btrfs_bio_end_io(bbio, ret);
+ btrfs_bio_end_io(bbio, status);
/* Do not submit another chunk */
return true;
}
@@ -811,8 +800,7 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num)
+ u64 length, u64 logical, phys_addr_t paddr, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -843,8 +831,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- ret = bio_add_folio(&bio, folio, length, folio_offset);
- ASSERT(ret);
+ __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr));
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
@@ -852,7 +839,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
goto out_bio_uninit;
}
- btrfs_info_rl_in_rcu(fs_info,
+ btrfs_info_rl(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start, btrfs_dev_name(smap.dev),
smap.physical >> SECTOR_SHIFT);
@@ -908,22 +895,18 @@ int __init btrfs_bioset_init(void)
return -ENOMEM;
if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio), 0))
- goto out_free_bioset;
+ goto out;
if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
- goto out_free_clone_bioset;
+ goto out;
if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
sizeof(struct btrfs_failed_bio)))
- goto out_free_repair_bioset;
+ goto out;
return 0;
-out_free_repair_bioset:
- bioset_exit(&btrfs_repair_bioset);
-out_free_clone_bioset:
- bioset_exit(&btrfs_clone_bioset);
-out_free_bioset:
- bioset_exit(&btrfs_bioset);
+out:
+ btrfs_bioset_exit();
return -ENOMEM;
}
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e2fe16074ad6..dc2eb43b7097 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -110,7 +110,6 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct folio *folio,
- unsigned int folio_offset, int mirror_num);
+ u64 length, u64 logical, phys_addr_t paddr, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c0a8f7d92acc..9bf282d2453c 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group
}
#endif
+static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
+{
+ /* The meta_write_pointer is available only on the zoned setup. */
+ if (!btrfs_is_zoned(block_group->fs_info))
+ return false;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
+ return false;
+
+ return block_group->start + block_group->alloc_offset >
+ block_group->meta_write_pointer;
+}
+
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
* is not in progress
@@ -191,21 +204,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new,
/*
* This adds the block group to the fs_info rb tree for the block group cache
*/
-static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
- struct btrfs_block_group *block_group)
+static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct rb_node *exist;
int ret = 0;
ASSERT(block_group->length != 0);
- write_lock(&info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
exist = rb_find_add_cached(&block_group->cache_node,
- &info->block_group_cache_tree, btrfs_bg_start_cmp);
+ &fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
if (exist)
ret = -EEXIST;
- write_unlock(&info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
return ret;
}
@@ -525,10 +538,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
*total_added_ret = 0;
while (start < end) {
- if (!find_first_extent_bit(&info->excluded_extents, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY | EXTENT_UPTODATE,
- NULL))
+ if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
break;
if (extent_start <= start) {
@@ -584,7 +596,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
struct btrfs_root *extent_root;
u64 search_offset;
u64 search_end = block_group->start + block_group->length;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key search_key;
int ret = 0;
@@ -626,7 +638,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_
lockdep_assert_held(&caching_ctl->mutex);
lockdep_assert_held_read(&fs_info->commit_root_sem);
- btrfs_free_path(path);
return ret;
}
@@ -702,7 +713,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
struct btrfs_block_group *block_group = caching_ctl->block_group;
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *extent_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
u64 total_found = 0;
@@ -738,8 +749,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
path->reada = READA_FORWARD;
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -785,8 +796,8 @@ next:
if (key.objectid < last) {
key.objectid = last;
- key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = 0;
btrfs_release_path(path);
goto next;
}
@@ -829,14 +840,13 @@ next:
block_group->start + block_group->length,
NULL);
out:
- btrfs_free_path(path);
return ret;
}
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
{
- clear_extent_bits(&bg->fs_info->excluded_extents, bg->start,
- bg->start + bg->length - 1, EXTENT_UPTODATE);
+ btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
+ bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
}
static noinline void caching_thread(struct btrfs_work *work)
@@ -880,7 +890,7 @@ static noinline void caching_thread(struct btrfs_work *work)
*/
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
- ret = load_free_space_tree(caching_ctl);
+ ret = btrfs_load_free_space_tree(caching_ctl);
else
ret = load_extent_tree_free(caching_ctl);
done:
@@ -1238,7 +1248,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
*/
- ret = remove_block_group_free_space(trans, block_group);
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
if (ret)
goto out;
@@ -1247,6 +1257,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
spin_lock(&block_group->lock);
+ /*
+ * Hitting this WARN means we removed a block group with an unwritten
+ * region. It will cause "unable to find chunk map for logical" errors.
+ */
+ if (WARN_ON(has_unwritten_metadata(block_group)))
+ btrfs_warn(fs_info,
+ "block group %llu is removed before metadata write out",
+ block_group->start);
+
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
/*
@@ -1406,7 +1425,7 @@ out:
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
- btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
+ btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false);
}
return ret;
}
@@ -1421,9 +1440,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
int ret;
spin_lock(&fs_info->trans_lock);
- if (trans->transaction->list.prev != &fs_info->trans_list) {
- prev_trans = list_last_entry(&trans->transaction->list,
- struct btrfs_transaction, list);
+ if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
+ prev_trans = list_prev_entry(trans->transaction, list);
refcount_inc(&prev_trans->use_count);
}
spin_unlock(&fs_info->trans_lock);
@@ -1440,14 +1458,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans) {
- ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
if (ret)
goto out;
}
- ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
- EXTENT_DIRTY);
+ ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
+ EXTENT_DIRTY, NULL);
out:
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
if (prev_trans)
@@ -1457,6 +1475,32 @@ out:
}
/*
+ * Link the block_group to a list via bg_list.
+ *
+ * @bg: The block_group to link to the list.
+ * @list: The list to link it to.
+ *
+ * Use this rather than list_add_tail() directly to ensure proper respect
+ * to locking and refcounting.
+ *
+ * Returns: true if the bg was linked with a refcount bump and false otherwise.
+ */
+static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ bool added = false;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ list_add_tail(&bg->bg_list, list);
+ added = true;
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return added;
+}
+
+/*
* Process the unused_bgs list and remove any that don't have any allocated
* space inside of them.
*/
@@ -1564,15 +1608,15 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if (space_info->total_bytes - block_group->length < used &&
- block_group->zone_unusable < block_group->length) {
+ if ((space_info->total_bytes - block_group->length < used &&
+ block_group->zone_unusable < block_group->length) ||
+ has_unwritten_metadata(block_group)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
* fs_info->unused_bgs list.
*/
- btrfs_get_block_group(block_group);
- list_add_tail(&block_group->bg_list, &retry_list);
+ btrfs_link_bg_list(block_group, &retry_list);
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
@@ -1595,8 +1639,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
ret = btrfs_zone_finish(block_group);
if (ret < 0) {
btrfs_dec_block_group_ro(block_group);
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
+ btrfs_link_bg_list(block_group, &retry_list);
ret = 0;
+ }
goto next;
}
@@ -1822,8 +1868,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
- u64 zone_unusable;
- u64 reclaimed;
+ u64 used;
+ u64 reserved;
int ret = 0;
bg = list_first_entry(&fs_info->reclaim_bgs,
@@ -1887,6 +1933,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
+
spin_unlock(&bg->lock);
spin_unlock(&space_info->lock);
@@ -1903,31 +1950,41 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
goto next;
}
- /*
- * Cache the zone_unusable value before turning the block group
- * to read only. As soon as the blog group is read only it's
- * zone_unusable value gets moved to the block group's read-only
- * bytes and isn't available for calculations anymore.
- */
- zone_unusable = bg->zone_unusable;
ret = inc_block_group_ro(bg, 0);
up_write(&space_info->groups_sem);
if (ret < 0)
goto next;
- btrfs_info(fs_info,
- "reclaiming chunk %llu with %llu%% used %llu%% unusable",
- bg->start,
- div64_u64(bg->used * 100, bg->length),
- div64_u64(zone_unusable * 100, bg->length));
+ /*
+ * The amount of bytes reclaimed corresponds to the sum of the
+ * "used" and "reserved" counters. We have set the block group
+ * to RO above, which prevents reservations from happening but
+ * we may have existing reservations for which allocation has
+ * not yet been done - btrfs_update_block_group() was not yet
+ * called, which is where we will transfer a reserved extent's
+ * size from the "reserved" counter to the "used" counter - this
+ * happens when running delayed references. When we relocate the
+ * chunk below, relocation first flushes dellaloc, waits for
+ * ordered extent completion (which is where we create delayed
+ * references for data extents) and commits the current
+ * transaction (which runs delayed references), and only after
+ * it does the actual work to move extents out of the block
+ * group. So the reported amount of reclaimed bytes is
+ * effectively the sum of the 'used' and 'reserved' counters.
+ */
+ spin_lock(&bg->lock);
+ used = bg->used;
+ reserved = bg->reserved;
+ spin_unlock(&bg->lock);
+
trace_btrfs_reclaim_block_group(bg);
- reclaimed = bg->used;
- ret = btrfs_relocate_chunk(fs_info, bg->start);
+ ret = btrfs_relocate_chunk(fs_info, bg->start, false);
if (ret) {
btrfs_dec_block_group_ro(bg);
btrfs_err(fs_info, "error relocating chunk %llu",
bg->start);
- reclaimed = 0;
+ used = 0;
+ reserved = 0;
spin_lock(&space_info->lock);
space_info->reclaim_errors++;
if (READ_ONCE(space_info->periodic_reclaim))
@@ -1936,24 +1993,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
}
spin_lock(&space_info->lock);
space_info->reclaim_count++;
- space_info->reclaim_bytes += reclaimed;
+ space_info->reclaim_bytes += used;
+ space_info->reclaim_bytes += reserved;
spin_unlock(&space_info->lock);
next:
- if (ret && !READ_ONCE(space_info->periodic_reclaim)) {
- /* Refcount held by the reclaim_bgs list after splice. */
- spin_lock(&fs_info->unused_bgs_lock);
- /*
- * This block group might be added to the unused list
- * during the above process. Move it back to the
- * reclaim list otherwise.
- */
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
- list_add_tail(&bg->bg_list, &retry_list);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
- }
+ if (ret && !READ_ONCE(space_info->periodic_reclaim))
+ btrfs_link_bg_list(bg, &retry_list);
btrfs_put_block_group(bg);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -1993,13 +2039,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
- spin_lock(&fs_info->unused_bgs_lock);
- if (list_empty(&bg->bg_list)) {
- btrfs_get_block_group(bg);
+ if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
trace_btrfs_add_reclaim_block_group(bg);
- list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
- }
- spin_unlock(&fs_info->unused_bgs_lock);
}
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
@@ -2182,9 +2223,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
cache->bytes_super += stripe_len;
- ret = set_extent_bit(&fs_info->excluded_extents, cache->start,
- cache->start + stripe_len - 1,
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
+ cache->start + stripe_len - 1,
+ EXTENT_DIRTY, NULL);
if (ret)
return ret;
}
@@ -2210,9 +2251,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache)
cache->start + cache->length - logical[nr]);
cache->bytes_super += len;
- ret = set_extent_bit(&fs_info->excluded_extents, logical[nr],
- logical[nr] + len - 1,
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
+ logical[nr], logical[nr] + len - 1,
+ EXTENT_DIRTY, NULL);
if (ret) {
kfree(logical);
return ret;
@@ -2337,8 +2378,9 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_flags(bgi);
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+ cache->space_info = btrfs_find_space_info(info, cache->flags);
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
if (need_clear) {
/*
@@ -2410,11 +2452,12 @@ static int read_one_block_group(struct btrfs_fs_info *info,
goto error;
}
- ret = btrfs_add_block_group_cache(info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
goto error;
}
+
trace_btrfs_add_block_group(info, cache, 0);
btrfs_add_bg_to_space_info(info, cache);
@@ -2459,7 +2502,8 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
bg->cached = BTRFS_CACHE_FINISHED;
bg->used = map->chunk_len;
bg->flags = map->type;
- ret = btrfs_add_block_group_cache(fs_info, bg);
+ bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
+ ret = btrfs_add_block_group_cache(bg);
/*
* We may have some valid block group cache added already, in
* that case we skip to the next one.
@@ -2509,8 +2553,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
return fill_dummy_bgs(info);
key.objectid = 0;
- key.offset = 0;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+ key.offset = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2641,7 +2685,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = device->fs_info;
struct btrfs_root *root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dev_extent *extent;
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -2658,7 +2702,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
key.offset = start;
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
@@ -2666,10 +2710,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans,
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
-
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2757,7 +2799,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
block_group->length);
if (ret)
btrfs_abort_transaction(trans, ret);
- add_block_group_free_space(trans, block_group);
+ btrfs_add_block_group_free_space(trans, block_group);
/*
* If we restriped during balance, we may have added a new raid
@@ -2771,8 +2813,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
/* Already aborted the transaction if it failed. */
next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
/*
* If the block group is still unused, add it to the list of
@@ -2830,8 +2876,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off
}
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 type,
- u64 chunk_offset, u64 size)
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache;
@@ -2851,7 +2897,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
cache->length = size;
- set_free_space_tree_thresholds(cache);
+ btrfs_set_free_space_tree_thresholds(cache);
cache->flags = type;
cache->cached = BTRFS_CACHE_FINISHED;
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
@@ -2885,10 +2931,10 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
* assigned to our block group. We want our bg to be added to the rbtree
* with its ->space_info set.
*/
- cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
+ cache->space_info = space_info;
ASSERT(cache->space_info);
- ret = btrfs_add_block_group_cache(fs_info, cache);
+ ret = btrfs_add_block_group_cache(cache);
if (ret) {
btrfs_remove_free_space_cache(cache);
btrfs_put_block_group(cache);
@@ -2910,7 +2956,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
}
#endif
- list_add_tail(&cache->bg_list, &trans->new_bgs);
+ btrfs_link_bg_list(cache, &trans->new_bgs);
btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
set_avail_alloc_bits(fs_info, type);
@@ -2930,6 +2976,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
+ struct btrfs_space_info *space_info = cache->space_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_block_group_root(fs_info);
u64 alloc_flags;
@@ -2982,7 +3029,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
*/
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
if (alloc_flags != cache->flags) {
- ret = btrfs_chunk_alloc(trans, alloc_flags,
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
CHUNK_ALLOC_FORCE);
/*
* ENOSPC is allowed here, we may have enough space
@@ -3010,15 +3057,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
goto unlock_out;
- alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
- ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
if (ret < 0)
goto out;
/*
* We have allocated a new chunk. We also need to activate that chunk to
* grant metadata tickets for zoned filesystem.
*/
- ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
if (ret < 0)
goto out;
@@ -3306,7 +3353,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *cache, *tmp;
struct btrfs_transaction *cur_trans = trans->transaction;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
if (list_empty(&cur_trans->dirty_bgs) ||
!btrfs_test_opt(fs_info, SPACE_CACHE))
@@ -3323,7 +3370,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
cache_save_setup(cache, trans, path);
}
- btrfs_free_path(path);
return 0;
}
@@ -3346,7 +3392,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
int loops = 0;
@@ -3501,7 +3547,6 @@ out:
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
}
- btrfs_free_path(path);
return ret;
}
@@ -3512,7 +3557,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct list_head *io = &cur_trans->io_bgs;
path = btrfs_alloc_path();
@@ -3599,9 +3644,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
ret = update_block_group_item(trans, path, cache);
- }
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
}
/* If its not on the io list, we need to put the block group */
@@ -3624,7 +3671,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
btrfs_put_block_group(cache);
}
- btrfs_free_path(path);
return ret;
}
@@ -3703,8 +3749,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- set_extent_bit(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
}
spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -3793,17 +3839,17 @@ out:
/*
* Update the block_group and space info counters.
*
- * @cache: The cache we are manipulating
- * @num_bytes: The number of bytes in question
- * @delalloc: The blocks are allocated for the delalloc write
+ * @cache: The cache we are manipulating.
+ * @num_bytes: The number of bytes in question.
+ * @is_delalloc: Whether the blocks are allocated for a delalloc write.
*
* This is called by somebody who is freeing space that was never actually used
* on disk. For example if you reserve some space for a new leaf in transaction
* A and before transaction A commits you free that leaf, you call this with
* reserve set to 0 in order to clear the reservation.
*/
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc)
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
@@ -3817,7 +3863,7 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
- if (delalloc)
+ if (is_delalloc)
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
@@ -3836,14 +3882,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
}
}
-static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
- const struct btrfs_space_info *sinfo, int force)
+static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
+ const struct btrfs_space_info *sinfo, int force)
{
u64 bytes_used = btrfs_space_info_used(sinfo, false);
u64 thresh;
if (force == CHUNK_ALLOC_FORCE)
- return 1;
+ return true;
/*
* in limited mode, we want to have some free space up to
@@ -3854,22 +3900,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info,
thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
- return 1;
+ return true;
}
if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
- return 0;
- return 1;
+ return false;
+ return true;
}
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
+ struct btrfs_space_info *space_info;
- return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
+ space_info = btrfs_find_space_info(trans->fs_info, type);
+ if (!space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+
+ return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
}
-static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
+static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info,
+ u64 flags)
{
struct btrfs_block_group *bg;
int ret;
@@ -3882,7 +3937,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
*/
check_system_chunk(trans, flags);
- bg = btrfs_create_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
goto out;
@@ -3930,8 +3985,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
+ if (!sys_space_info) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -4062,6 +4125,8 @@ out:
*
* This function, btrfs_chunk_alloc(), belongs to phase 1.
*
+ * @space_info: specify which space_info the new chunk should belong to.
+ *
* If @force is CHUNK_ALLOC_FORCE:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
@@ -4070,11 +4135,11 @@ out:
* - return 1 if it successfully allocates a chunk,
* - return errors including -ENOSPC otherwise.
*/
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_space_info *space_info;
struct btrfs_block_group *ret_bg;
bool wait_for_alloc = false;
bool should_alloc = false;
@@ -4113,9 +4178,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
return -ENOSPC;
- space_info = btrfs_find_space_info(fs_info, flags);
- ASSERT(space_info);
-
do {
spin_lock(&space_info->lock);
if (force < space_info->force_alloc)
@@ -4176,7 +4238,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
force_metadata_allocation(fs_info);
}
- ret_bg = do_chunk_alloc(trans, flags);
+ ret_bg = do_chunk_alloc(trans, space_info, flags);
trans->allocating_chunk = false;
if (IS_ERR(ret_bg)) {
@@ -4246,12 +4308,16 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
left, bytes, type);
- btrfs_dump_space_info(fs_info, info, 0, 0);
+ btrfs_dump_space_info(fs_info, info, 0, false);
}
if (left < bytes) {
u64 flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, flags);
+ ASSERT(space_info);
/*
* Ignore failure to create system chunk. We might end up not
@@ -4259,7 +4325,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
* the paths we visit in the chunk tree (they were already COWed
* or created in the current transaction for example).
*/
- bg = btrfs_create_chunk(trans, flags);
+ bg = btrfs_create_chunk(trans, space_info, flags);
if (IS_ERR(bg)) {
ret = PTR_ERR(bg);
} else {
@@ -4367,6 +4433,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
}
}
+static void check_removing_space_info(struct btrfs_space_info *space_info)
+{
+ struct btrfs_fs_info *info = space_info->fs_info;
+
+ if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
+ /* This is a top space_info, proceed with its children first. */
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i]) {
+ check_removing_space_info(space_info->sub_group[i]);
+ kfree(space_info->sub_group[i]);
+ space_info->sub_group[i] = NULL;
+ }
+ }
+ }
+
+ /*
+ * Do not hide this behind enospc_debug, this is actually important and
+ * indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
+ btrfs_dump_space_info(info, space_info, 0, false);
+
+ /*
+ * If there was a failure to cleanup a log tree, very likely due to an
+ * IO failure on a writeback attempt of one or more of its extent
+ * buffers, we could not do proper (and cheap) unaccounting of their
+ * reserved space, so don't warn on bytes_reserved > 0 in that case.
+ */
+ if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
+ if (WARN_ON(space_info->bytes_reserved > 0))
+ btrfs_dump_space_info(info, space_info, 0, false);
+ }
+
+ WARN_ON(space_info->reclaim_size > 0);
+}
+
/*
* Must be called only after stopping all workers, since we could have block
* group caching kthreads running, and therefore they could race with us if we
@@ -4392,8 +4495,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
- caching_ctl = list_entry(info->caching_block_groups.next,
- struct btrfs_caching_control, list);
+ caching_ctl = list_first_entry(&info->caching_block_groups,
+ struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
@@ -4464,32 +4567,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
btrfs_release_global_block_rsv(info);
while (!list_empty(&info->space_info)) {
- space_info = list_entry(info->space_info.next,
- struct btrfs_space_info,
- list);
-
- /*
- * Do not hide this behind enospc_debug, this is actually
- * important and indicates a real bug if this happens.
- */
- if (WARN_ON(space_info->bytes_pinned > 0 ||
- space_info->bytes_may_use > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
-
- /*
- * If there was a failure to cleanup a log tree, very likely due
- * to an IO failure on a writeback attempt of one or more of its
- * extent buffers, we could not do proper (and cheap) unaccounting
- * of their reserved space, so don't warn on bytes_reserved > 0 in
- * that case.
- */
- if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
- if (WARN_ON(space_info->bytes_reserved > 0))
- btrfs_dump_space_info(info, space_info, 0, 0);
- }
+ space_info = list_first_entry(&info->space_info,
+ struct btrfs_space_info, list);
- WARN_ON(space_info->reclaim_size > 0);
+ check_removing_space_info(space_info);
list_del(&space_info->list);
btrfs_sysfs_remove_space_info(space_info);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 36937eeab9b8..a8bb8429c966 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -83,6 +83,8 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ /* Set after we add a new block group to the free space tree. */
+ BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
/*
@@ -244,6 +246,11 @@ struct btrfs_block_group {
/* Lock for free space tree operations. */
struct mutex free_space_lock;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps;
+ /* Protected by @free_space_lock. */
+ bool using_free_space_bitmaps_cached;
+
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
@@ -326,8 +333,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
- u64 type,
- u64 chunk_offset, u64 size);
+ struct btrfs_space_info *space_info,
+ u64 type, u64 chunk_offset, u64 size);
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans);
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
bool do_chunk_alloc);
@@ -340,9 +347,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
u64 ram_bytes, u64 num_bytes, int delalloc,
bool force_wrong_size_class);
-void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
- u64 num_bytes, int delalloc);
-int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
+void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
+ bool is_delalloc);
+int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
+ struct btrfs_space_info *space_info, u64 flags,
enum btrfs_chunk_alloc_enum force);
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type);
void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 3f3608299c0b..5ad6de738aee 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -418,6 +418,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CHUNK_TREE_OBJECTID:
root->block_rsv = &fs_info->chunk_block_rsv;
break;
+ case BTRFS_TREE_LOG_OBJECTID:
+ root->block_rsv = &fs_info->treelog_rsv;
+ break;
default:
root->block_rsv = NULL;
break;
@@ -438,6 +441,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
fs_info->delayed_block_rsv.space_info = space_info;
fs_info->delayed_refs_rsv.space_info = space_info;
+ /* The treelog_rsv uses a dedicated space_info on the zoned mode. */
+ if (!btrfs_is_zoned(fs_info)) {
+ fs_info->treelog_rsv.space_info = space_info;
+ } else {
+ ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ fs_info->treelog_rsv.space_info = space_info->sub_group[0];
+ }
+
btrfs_update_global_block_rsv(fs_info);
}
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d12b1fac5c74..79ae9d05cd91 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -24,6 +24,7 @@ enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_CHUNK,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
+ BTRFS_BLOCK_RSV_TREELOG,
BTRFS_BLOCK_RSV_EMPTY,
BTRFS_BLOCK_RSV_TEMP,
};
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b2fa33911c28..b99fb0273292 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -145,6 +145,7 @@ struct btrfs_inode {
* different from prop_compress and takes precedence if set.
*/
u8 defrag_compress;
+ s8 defrag_compress_level;
/*
* Lock for counters and all fields used to determine if the inode is in
@@ -516,15 +517,36 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode)
lockdep_assert_held(&inode->vfs_inode.i_rwsem);
}
+static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode)
+{
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ mapping_clear_stable_writes(inode->vfs_inode.i_mapping);
+ else
+ mapping_set_stable_writes(inode->vfs_inode.i_mapping);
+}
+
+static inline void btrfs_set_inode_mapping_order(struct btrfs_inode *inode)
+{
+ /* Metadata inode should not reach here. */
+ ASSERT(is_data_inode(inode));
+
+ /* We only allow BITS_PER_LONGS blocks for each bitmap. */
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
+ mapping_set_folio_order_range(inode->vfs_inode.i_mapping, 0,
+ ilog2(((BITS_PER_LONG << inode->root->fs_info->sectorsize_bits)
+ >> PAGE_SHIFT)));
+#endif
+}
+
/* Array of bytes with variable length, hexadecimal format 0x1234 */
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected);
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
+ const u8 * const csum_expected);
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u32 bio_offset, struct bio_vec *bv);
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait);
@@ -538,8 +560,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const struct fscrypt_str *name, int add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front);
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -584,9 +605,9 @@ void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path);
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root);
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path);
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct folio *folio, u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 0c4d486c3048..d09d622016ef 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -282,15 +282,15 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
struct inode *inode = &cb->bbio.inode->vfs_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long index = cb->start >> PAGE_SHIFT;
- unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
+ pgoff_t index = cb->start >> PAGE_SHIFT;
+ const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
struct folio_batch fbatch;
- const int error = blk_status_to_errno(cb->bbio.bio.bi_status);
int i;
int ret;
- if (error)
- mapping_set_error(inode->i_mapping, error);
+ ret = blk_status_to_errno(cb->bbio.bio.bi_status);
+ if (ret)
+ mapping_set_error(inode->i_mapping, ret);
folio_batch_init(&fbatch);
while (index <= end_index) {
@@ -415,7 +415,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
int *memstall, unsigned long *pflags)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- unsigned long end_index;
+ pgoff_t end_index;
struct bio *orig_bio = &cb->orig_bbio->bio;
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
u64 isize = i_size_read(inode);
@@ -446,8 +446,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
while (cur < compressed_end) {
- u64 page_end;
- u64 pg_index = cur >> PAGE_SHIFT;
+ pgoff_t page_end;
+ pgoff_t pg_index = cur >> PAGE_SHIFT;
u32 add_size;
if (pg_index > end_index)
@@ -499,9 +499,9 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
- lock_extent(tree, cur, page_end, NULL);
+ btrfs_lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
+ em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
/*
@@ -510,20 +510,20 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* to this compressed extent on disk.
*/
if (!em || cur < em->start ||
- (cur + fs_info->sectorsize > extent_map_end(em)) ||
- (extent_map_block_start(em) >> SECTOR_SHIFT) !=
+ (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) ||
+ (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
orig_bio->bi_iter.bi_sector) {
- free_extent_map(em);
- unlock_extent(tree, cur, page_end, NULL);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(tree, cur, page_end, NULL);
folio_unlock(folio);
folio_put(folio);
break;
}
add_size = min(em->start + em->len, page_end + 1) - cur;
- free_extent_map(em);
- unlock_extent(tree, cur, page_end, NULL);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(tree, cur, page_end, NULL);
- if (folio->index == end_index) {
+ if (folio_contains(folio, end_index)) {
size_t zero_offset = offset_in_folio(folio, isize);
if (zero_offset) {
@@ -576,19 +576,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
struct extent_map *em;
unsigned long pflags;
int memstall = 0;
- blk_status_t ret;
- int ret2;
+ blk_status_t status;
+ int ret;
/* we need the actual starting offset of this extent in the file */
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
+ em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
read_unlock(&em_tree->lock);
if (!em) {
- ret = BLK_STS_IOERR;
+ status = BLK_STS_IOERR;
goto out;
}
- ASSERT(extent_map_is_compressed(em));
+ ASSERT(btrfs_extent_map_is_compressed(em));
compressed_len = em->disk_num_bytes;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
@@ -600,21 +600,21 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = extent_map_compression(em);
+ cb->compress_type = btrfs_extent_map_compression(em);
cb->orig_bbio = bbio;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
- cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS);
+ cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
if (!cb->compressed_folios) {
- ret = BLK_STS_RESOURCE;
+ status = BLK_STS_RESOURCE;
goto out_free_bio;
}
- ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
- if (ret2) {
- ret = BLK_STS_RESOURCE;
+ ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios);
+ if (ret) {
+ status = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
}
@@ -637,7 +637,7 @@ out_free_compressed_pages:
out_free_bio:
bio_put(&cb->bbio.bio);
out:
- btrfs_bio_end_io(bbio, ret);
+ btrfs_bio_end_io(bbio, status);
}
/*
@@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zstd_compress,
};
-static struct list_head *alloc_workspace(int type, unsigned int level)
+static struct list_head *alloc_workspace(int type, int level)
{
switch (type) {
case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
@@ -789,8 +789,8 @@ static void btrfs_init_workspace_manager(int type)
*/
workspace = alloc_workspace(type, 0);
if (IS_ERR(workspace)) {
- pr_warn(
- "BTRFS: cannot preallocate compression workspace, will try later\n");
+ btrfs_warn(NULL,
+ "cannot preallocate compression workspace, will try later");
} else {
atomic_set(&wsm->total_ws, 1);
wsm->free_ws = 1;
@@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type)
* Preallocation makes a forward progress guarantees and we do not return
* errors.
*/
-struct list_head *btrfs_get_workspace(int type, unsigned int level)
+struct list_head *btrfs_get_workspace(int type, int level)
{
struct workspace_manager *wsm;
struct list_head *workspace;
@@ -888,9 +888,9 @@ again:
/* once per minute */ 60 * HZ,
/* no burst */ 1);
- if (__ratelimit(&_rs)) {
- pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
- }
+ if (__ratelimit(&_rs))
+ btrfs_warn(NULL,
+ "no compression workspaces, low memory, retrying");
}
goto again;
}
@@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws)
* Adjust @level according to the limits of the compression algorithm or
* fallback to default
*/
-static unsigned int btrfs_compress_set_level(int type, unsigned level)
+static int btrfs_compress_set_level(unsigned int type, int level)
{
const struct btrfs_compress_op *ops = btrfs_compress_op[type];
if (level == 0)
level = ops->default_level;
else
- level = min(level, ops->max_level);
+ level = clamp(level, ops->min_level, ops->max_level);
return level;
}
+/*
+ * Check whether the @level is within the valid range for the given type.
+ */
+bool btrfs_compress_level_valid(unsigned int type, int level)
+{
+ const struct btrfs_compress_op *ops = btrfs_compress_op[type];
+
+ return ops->min_level <= level && level <= ops->max_level;
+}
+
/* Wrapper around find_get_page(), with extra error message. */
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
struct folio **in_folio_ret)
@@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
* @total_out is an in/out parameter, must be set to the input length and will
* be also used to return the total number of compressed bytes
*/
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
{
- int type = btrfs_compress_type(type_level);
- int level = btrfs_compress_level(type_level);
const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
@@ -1130,6 +1138,22 @@ void __cold btrfs_exit_compress(void)
}
/*
+ * The bvec is a single page bvec from a bio that contains folios from a filemap.
+ *
+ * Since the folio may be a large one, and if the bv_page is not a head page of
+ * a large folio, then page->index is unreliable.
+ *
+ * Thus we need this helper to grab the proper file offset.
+ */
+static u64 file_offset_from_bvec(const struct bio_vec *bvec)
+{
+ const struct page *page = bvec->bv_page;
+ const struct folio *folio = page_folio(page);
+
+ return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
+}
+
+/*
* Copy decompressed data from working buffer to pages.
*
* @buf: The decompressed data buffer
@@ -1174,13 +1198,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
u32 copy_start;
/* Offset inside the full decompressed extent */
u32 bvec_offset;
+ void *kaddr;
bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
/*
* cb->start may underflow, but subtracting that value can still
* give us correct offset inside the full decompressed extent.
*/
- bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
+ bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
/* Haven't reached the bvec range, exit */
if (decompressed + buf_len <= bvec_offset)
@@ -1196,10 +1221,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
* @buf + @buf_len.
*/
ASSERT(copy_start - decompressed < buf_len);
- memcpy_to_page(bvec.bv_page, bvec.bv_offset,
- buf + copy_start - decompressed, copy_len);
- cur_offset += copy_len;
+ kaddr = bvec_kmap_local(&bvec);
+ memcpy(kaddr, buf + copy_start - decompressed, copy_len);
+ kunmap_local(kaddr);
+
+ cur_offset += copy_len;
bio_advance(orig_bio, copy_len);
/* Finished the bio */
if (!orig_bio->bi_iter.bi_size)
@@ -1455,7 +1482,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
struct heuristic_ws *ws)
{
struct page *page;
- u64 index, index_end;
+ pgoff_t index, index_end;
u32 i, curr_sample_pos;
u8 *in_data;
@@ -1590,18 +1617,19 @@ out:
/*
* Convert the compression suffix (eg. after "zlib" starting with ":") to
- * level, unrecognized string will set the default level
+ * level, unrecognized string will set the default level. Negative level
+ * numbers are allowed.
*/
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
+int btrfs_compress_str2level(unsigned int type, const char *str)
{
- unsigned int level = 0;
+ int level = 0;
int ret;
if (!type)
return 0;
if (str[0] == ':') {
- ret = kstrtouint(str + 1, 10, &level);
+ ret = kstrtoint(str + 1, 10, &level);
if (ret)
level = 0;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 954034086d0d..1b38e707bbd9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -11,7 +11,10 @@
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
+#include <linux/pagemap.h>
#include "bio.h"
+#include "fs.h"
+#include "messages.h"
struct address_space;
struct page;
@@ -72,28 +75,20 @@ struct compressed_bio {
struct btrfs_bio bbio;
};
-static inline unsigned int btrfs_compress_type(unsigned int type_level)
-{
- return (type_level & 0xF);
-}
-
-static inline unsigned int btrfs_compress_level(unsigned int type_level)
-{
- return ((type_level & 0xF0) >> 4);
-}
-
/* @range_end must be exclusive. */
-static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur)
+static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur)
{
- u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE;
-
- return min(range_end, page_end) - cur;
+ /* @cur must be inside the folio. */
+ ASSERT(folio_pos(folio) <= cur);
+ ASSERT(cur < folio_end(folio));
+ return min(range_end, folio_end(folio)) - cur;
}
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
-int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping,
+bool btrfs_compress_level_valid(unsigned int type, int level);
+int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out);
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
@@ -107,7 +102,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
bool writeback);
void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
-unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+int btrfs_compress_str2level(unsigned int type, const char *str);
struct folio *btrfs_alloc_compr_folio(void);
void btrfs_free_compr_folio(struct folio *folio);
@@ -118,6 +113,8 @@ enum btrfs_compression_type {
BTRFS_COMPRESS_LZO = 2,
BTRFS_COMPRESS_ZSTD = 3,
BTRFS_NR_COMPRESS_TYPES = 4,
+
+ BTRFS_DEFRAG_DONT_COMPRESS,
};
struct workspace_manager {
@@ -131,14 +128,15 @@ struct workspace_manager {
wait_queue_head_t ws_wait;
};
-struct list_head *btrfs_get_workspace(int type, unsigned int level);
+struct list_head *btrfs_get_workspace(int type, int level);
void btrfs_put_workspace(int type, struct list_head *ws);
struct btrfs_compress_op {
struct workspace_manager *workspace_manager;
/* Maximum level supported by the compression algorithm */
- unsigned int max_level;
- unsigned int default_level;
+ int min_level;
+ int max_level;
+ int default_level;
};
/* The heuristic workspaces are managed via the 0th workspace manager */
@@ -187,9 +185,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in,
size_t destlen);
void zstd_init_workspace_manager(void);
void zstd_cleanup_workspace_manager(void);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+struct list_head *zstd_alloc_workspace(int level);
void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_get_workspace(unsigned int level);
+struct list_head *zstd_get_workspace(int level);
void zstd_put_workspace(struct list_head *ws);
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 92071ca0655f..74e6d7f3d266 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -198,7 +198,7 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
* the inc_not_zero dance and if it doesn't work then
* synchronize_rcu and try again.
*/
- if (atomic_inc_not_zero(&eb->refs)) {
+ if (refcount_inc_not_zero(&eb->refs)) {
rcu_read_unlock();
break;
}
@@ -283,15 +283,26 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
- WARN_ON(btrfs_header_generation(buf) > trans->transid);
- if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+ if (unlikely(btrfs_header_generation(buf) > trans->transid)) {
+ btrfs_tree_unlock(cow);
+ free_extent_buffer(cow);
+ ret = -EUCLEAN;
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_inc_ref(trans, root, cow, 1);
- else
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = btrfs_inc_ref(trans, root, cow, 0);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -303,9 +314,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
const u64 buf_gen = btrfs_header_generation(buf);
@@ -549,7 +560,7 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
goto error_unlock_cow;
}
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
rcu_assign_pointer(root->node, cow);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
@@ -602,9 +613,9 @@ error_unlock_cow:
return ret;
}
-static inline int should_cow_block(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+static inline int should_cow_block(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf)
{
if (btrfs_is_testing(root->fs_info))
return 0;
@@ -724,7 +735,7 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke
* Slot may point to the total number of items (i.e. one position beyond the last
* key) if the key is bigger than the last key in the extent buffer.
*/
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot)
{
unsigned long p;
@@ -1081,7 +1092,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the path */
if (left) {
if (btrfs_header_nritems(left) > orig_slot) {
- atomic_inc(&left->refs);
+ refcount_inc(&left->refs);
/* left was locked after cow */
path->nodes[level] = left;
path->slots[level + 1] -= 1;
@@ -1268,7 +1279,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
* to the block in 'slot', and triggering ra on them.
*/
static void reada_for_search(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
+ const struct btrfs_path *path,
int level, int slot, u64 objectid)
{
struct extent_buffer *node;
@@ -1350,7 +1361,7 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
}
}
-static noinline void reada_for_balance(struct btrfs_path *path, int level)
+static noinline void reada_for_balance(const struct btrfs_path *path, int level)
{
struct extent_buffer *parent;
int slot;
@@ -1446,8 +1457,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
u64 blocknr;
struct extent_buffer *tmp = NULL;
int ret = 0;
+ int ret2;
int parent_level;
- int err;
bool read_tmp = false;
bool tmp_locked = false;
bool path_released = false;
@@ -1496,6 +1507,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
btrfs_unlock_up_safe(p, parent_level + 1);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -1504,9 +1516,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1539,6 +1551,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (!p->skip_locking) {
ASSERT(ret == -EAGAIN);
+ btrfs_maybe_reset_lockdep_class(root, tmp);
tmp_locked = true;
btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
@@ -1546,9 +1559,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
}
/* Now we're allowed to do a blocking uptodate check. */
- err = btrfs_read_extent_buffer(tmp, &check);
- if (err) {
- ret = err;
+ ret2 = btrfs_read_extent_buffer(tmp, &check);
+ if (ret2) {
+ ret = ret2;
goto out;
}
@@ -1683,7 +1696,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
if (p->search_commit_root) {
b = root->commit_root;
- atomic_inc(&b->refs);
+ refcount_inc(&b->refs);
level = btrfs_header_level(b);
/*
* Ensure that all callers have set skip_locking when
@@ -1792,7 +1805,7 @@ static int finish_need_commit_sem_search(struct btrfs_path *path)
return 0;
}
-static inline int search_for_key_slot(struct extent_buffer *eb,
+static inline int search_for_key_slot(const struct extent_buffer *eb,
int search_low_slot,
const struct btrfs_key *key,
int prev_cmp,
@@ -1926,15 +1939,14 @@ static int search_leaf(struct btrfs_trans_handle *trans,
ASSERT(leaf_free_space >= 0);
if (leaf_free_space < ins_len) {
- int err;
-
- err = split_leaf(trans, root, key, path, ins_len,
- (ret == 0));
- ASSERT(err <= 0);
- if (WARN_ON(err > 0))
- err = -EUCLEAN;
- if (err)
- ret = err;
+ int ret2;
+
+ ret2 = split_leaf(trans, root, key, path, ins_len, (ret == 0));
+ ASSERT(ret2 <= 0);
+ if (WARN_ON(ret2 > 0))
+ ret2 = -EUCLEAN;
+ if (ret2)
+ ret = ret2;
}
}
@@ -1980,7 +1992,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
/* everything at write_lock_level or lower must be write locked */
@@ -2051,6 +2062,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
@@ -2079,16 +2091,15 @@ again:
}
if (last_level)
- err = btrfs_cow_block(trans, root, b, NULL, 0,
- &b,
- BTRFS_NESTING_COW);
+ ret2 = btrfs_cow_block(trans, root, b, NULL, 0,
+ &b, BTRFS_NESTING_COW);
else
- err = btrfs_cow_block(trans, root, b,
- p->nodes[level + 1],
- p->slots[level + 1], &b,
- BTRFS_NESTING_COW);
- if (err) {
- ret = err;
+ ret2 = btrfs_cow_block(trans, root, b,
+ p->nodes[level + 1],
+ p->slots[level + 1], &b,
+ BTRFS_NESTING_COW);
+ if (ret2) {
+ ret = ret2;
goto done;
}
}
@@ -2136,12 +2147,12 @@ cow_done:
slot--;
}
p->slots[level] = slot;
- err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
- &write_lock_level);
- if (err == -EAGAIN)
+ ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len,
+ &write_lock_level);
+ if (ret2 == -EAGAIN)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
b = p->nodes[level];
@@ -2167,11 +2178,11 @@ cow_done:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2234,7 +2245,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
struct extent_buffer *b;
int slot;
int ret;
- int err;
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
@@ -2259,6 +2269,7 @@ again:
while (b) {
int dec = 0;
+ int ret2;
level = btrfs_header_level(b);
p->nodes[level] = b;
@@ -2294,11 +2305,11 @@ again:
goto done;
}
- err = read_block_for_search(root, p, &b, slot, key);
- if (err == -EAGAIN && !p->nowait)
+ ret2 = read_block_for_search(root, p, &b, slot, key);
+ if (ret2 == -EAGAIN && !p->nowait)
goto again;
- if (err) {
- ret = err;
+ if (ret2) {
+ ret = ret2;
goto done;
}
@@ -2870,6 +2881,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
if (ret < 0) {
int ret2;
+ btrfs_clear_buffer_dirty(trans, c);
ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
if (ret2 < 0)
btrfs_abort_transaction(trans, ret2);
@@ -2883,7 +2895,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
free_extent_buffer(old);
add_root_to_dirty_list(root);
- atomic_inc(&c->refs);
+ refcount_inc(&c->refs);
path->nodes[level] = c;
path->locks[level] = BTRFS_WRITE_LOCK;
path->slots[level] = 0;
@@ -3098,7 +3110,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = right->fs_info;
struct extent_buffer *left = path->nodes[0];
struct extent_buffer *upper = path->nodes[1];
- struct btrfs_map_token token;
struct btrfs_disk_key disk_key;
int slot;
u32 i;
@@ -3172,13 +3183,12 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
- btrfs_init_map_token(&token, right);
right_nritems += push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space -= btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space -= btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
left_nritems -= push_items;
@@ -3321,7 +3331,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
int ret = 0;
u32 this_item_size;
u32 old_left_item_size;
- struct btrfs_map_token token;
if (empty)
nr = min(right_nritems, max_slot);
@@ -3369,13 +3378,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
- btrfs_init_map_token(&token, left);
old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(left, i);
+ btrfs_set_item_offset(left, i,
ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
}
btrfs_set_header_nritems(left, old_left_nritems + push_items);
@@ -3396,13 +3404,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_header_nritems(right) - push_items);
}
- btrfs_init_map_token(&token, right);
right_nritems -= push_items;
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
for (i = 0; i < right_nritems; i++) {
- push_space = push_space - btrfs_token_item_size(&token, i);
- btrfs_set_token_item_offset(&token, i, push_space);
+ push_space = push_space - btrfs_item_size(right, i);
+ btrfs_set_item_offset(right, i, push_space);
}
btrfs_mark_buffer_dirty(trans, left);
@@ -3516,7 +3523,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
int i;
int ret;
struct btrfs_disk_key disk_key;
- struct btrfs_map_token token;
nritems = nritems - mid;
btrfs_set_header_nritems(right, nritems);
@@ -3529,12 +3535,11 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
- btrfs_init_map_token(&token, right);
for (i = 0; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
+ ioff = btrfs_item_offset(right, i);
+ btrfs_set_item_offset(right, i, ioff + rt_data_off);
}
btrfs_set_header_nritems(l, mid);
@@ -4000,7 +4005,6 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
unsigned int old_size;
unsigned int size_diff;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
slot = path->slots[0];
@@ -4023,12 +4027,11 @@ void btrfs_truncate_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + size_diff);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + size_diff);
}
/* shift the data */
@@ -4091,7 +4094,6 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
unsigned int old_data;
unsigned int old_size;
int i;
- struct btrfs_map_token token;
leaf = path->nodes[0];
@@ -4117,12 +4119,11 @@ void btrfs_extend_item(struct btrfs_trans_handle *trans,
* item0..itemN ... dataN.offset..dataN.size .. data0.size
*/
/* first correct the data pointers */
- btrfs_init_map_token(&token, leaf);
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff - data_size);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff - data_size);
}
/* shift the data */
@@ -4162,7 +4163,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
struct extent_buffer *leaf;
int slot;
- struct btrfs_map_token token;
u32 total_size;
/*
@@ -4190,7 +4190,6 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
BUG();
}
- btrfs_init_map_token(&token, leaf);
if (slot != nritems) {
unsigned int old_data = btrfs_item_data_end(leaf, slot);
@@ -4208,8 +4207,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
for (i = slot; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i,
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i,
ioff - batch->total_data_size);
}
/* shift the items */
@@ -4226,8 +4225,8 @@ static void setup_items_for_insert(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
btrfs_set_item_key(leaf, &disk_key, slot + i);
data_end -= batch->data_sizes[i];
- btrfs_set_token_item_offset(&token, slot + i, data_end);
- btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
+ btrfs_set_item_offset(leaf, slot + i, data_end);
+ btrfs_set_item_size(leaf, slot + i, batch->data_sizes[i]);
}
btrfs_set_header_nritems(leaf, nritems + batch->nr);
@@ -4304,7 +4303,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u32 data_size)
{
int ret = 0;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
unsigned long ptr;
@@ -4318,7 +4317,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
write_extent_buffer(leaf, data, ptr, data_size);
btrfs_mark_buffer_dirty(trans, leaf);
}
- btrfs_free_path(path);
return ret;
}
@@ -4441,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
root_sub_used_bytes(root);
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
ret = btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
free_extent_buffer_stale(leaf);
if (ret < 0)
@@ -4468,7 +4466,6 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (slot + nr != nritems) {
const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
const int data_end = leaf_data_end(leaf);
- struct btrfs_map_token token;
u32 dsize = 0;
int i;
@@ -4478,12 +4475,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
memmove_leaf_data(leaf, data_end + dsize, data_end,
last_off - data_end);
- btrfs_init_map_token(&token, leaf);
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- ioff = btrfs_token_item_offset(&token, i);
- btrfs_set_token_item_offset(&token, i, ioff + dsize);
+ ioff = btrfs_item_offset(leaf, i);
+ btrfs_set_item_offset(leaf, i, ioff + dsize);
}
memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
@@ -4526,7 +4522,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* for possible call to btrfs_del_ptr below
*/
slot = path->slots[1];
- atomic_inc(&leaf->refs);
+ refcount_inc(&leaf->refs);
/*
* We want to be able to at least push one item to the
* left neighbour leaf, and that's the first item.
@@ -4584,16 +4580,13 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
/*
* A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are have a minimum transaction id.
+ * for leaves that have a minimum transaction id.
* This is used by the btree defrag code, and tree logging
*
* This does not cow, but it does stuff the starting key it finds back
* into min_key, so you can call btrfs_search_slot with cow=1 on the
* key and get a writable path.
*
- * This honors path->lowest_level to prevent descent past a given level
- * of the tree.
- *
* min_trans indicates the oldest transaction that you are interested
* in walking through. Any nodes or leaves older than min_trans are
* skipped over (without reading them).
@@ -4606,7 +4599,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
u64 min_trans)
{
struct extent_buffer *cur;
- struct btrfs_key found_key;
int slot;
int sret;
u32 nritems;
@@ -4615,6 +4607,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int keep_locks = path->keep_locks;
ASSERT(!path->nowait);
+ ASSERT(path->lowest_level == 0);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
@@ -4636,13 +4629,14 @@ again:
goto out;
}
- /* at the lowest level, we're done, setup the path and exit */
- if (level == path->lowest_level) {
+ /* At level 0 we're done, setup the path and exit. */
+ if (level == 0) {
if (slot >= nritems)
goto find_next_key;
ret = 0;
path->slots[level] = slot;
- btrfs_item_key_to_cpu(cur, &found_key, slot);
+ /* Save our key for returning back. */
+ btrfs_item_key_to_cpu(cur, min_key, slot);
goto out;
}
if (sret && slot > 0)
@@ -4666,8 +4660,8 @@ find_next_key:
* we didn't find a candidate key in this node, walk forward
* and find another one
*/
+ path->slots[level] = slot;
if (slot >= nritems) {
- path->slots[level] = slot;
sret = btrfs_find_next_key(root, path, min_key, level,
min_trans);
if (sret == 0) {
@@ -4677,13 +4671,6 @@ find_next_key:
goto out;
}
}
- /* save our key for returning back */
- btrfs_node_key_to_cpu(cur, &found_key, slot);
- path->slots[level] = slot;
- if (level == path->lowest_level) {
- ret = 0;
- goto out;
- }
cur = btrfs_read_node_slot(cur, slot);
if (IS_ERR(cur)) {
ret = PTR_ERR(cur);
@@ -4698,10 +4685,8 @@ find_next_key:
}
out:
path->keep_locks = keep_locks;
- if (ret == 0) {
- btrfs_unlock_up_safe(path, path->lowest_level + 1);
- memcpy(min_key, &found_key, sizeof(found_key));
- }
+ if (ret == 0)
+ btrfs_unlock_up_safe(path, 1);
return ret;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1096a80a64e7..fe70b593c7cd 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -6,7 +6,7 @@
#ifndef BTRFS_CTREE_H
#define BTRFS_CTREE_H
-#include "linux/cleanup.h"
+#include <linux/cleanup.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/mutex.h>
@@ -61,7 +61,6 @@ struct btrfs_path {
/* if there is real range locking, this locks field will change */
u8 locks[BTRFS_MAX_LEVEL];
u8 reada;
- /* keep some upper locks as we walk down */
u8 lowest_level;
/*
@@ -69,6 +68,7 @@ struct btrfs_path {
* and to force calls to keep space in the nodes
*/
unsigned int search_for_split:1;
+ /* Keep some upper locks as we walk down. */
unsigned int keep_locks:1;
unsigned int skip_locking:1;
unsigned int search_commit_root:1;
@@ -224,16 +224,10 @@ struct btrfs_root {
struct list_head root_list;
- /*
- * Xarray that keeps track of in-memory inodes, protected by the lock
- * @inode_lock.
- */
+ /* Xarray that keeps track of in-memory inodes. */
struct xarray inodes;
- /*
- * Xarray that keeps track of delayed nodes of every inode, protected
- * by @inode_lock.
- */
+ /* Xarray that keeps track of delayed nodes of every inode. */
struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
@@ -508,7 +502,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
int __init btrfs_ctree_init(void);
void __cold btrfs_ctree_exit(void);
-int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
+int btrfs_bin_search(const struct extent_buffer *eb, int first_slot,
const struct btrfs_key *key, int *slot);
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2);
@@ -576,9 +570,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(const struct btrfs_trans_handle *trans,
+ const struct btrfs_root *root,
+ const struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
@@ -727,13 +721,18 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
}
int btrfs_leaf_free_space(const struct extent_buffer *leaf);
-static inline int is_fstree(u64 rootid)
+static inline bool btrfs_is_fstree(u64 rootid)
{
- if (rootid == BTRFS_FS_TREE_OBJECTID ||
- ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
- !btrfs_qgroup_level(rootid)))
- return 1;
- return 0;
+ if (rootid == BTRFS_FS_TREE_OBJECTID)
+ return true;
+
+ if ((s64)rootid < (s64)BTRFS_FIRST_FREE_OBJECTID)
+ return false;
+
+ if (btrfs_qgroup_level(rootid) != 0)
+ return false;
+
+ return true;
}
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 968dae953948..738179a5e170 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -60,6 +60,14 @@ static int compare_inode_defrag(const struct inode_defrag *defrag1,
return 0;
}
+static int inode_defrag_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct inode_defrag *new_defrag = rb_entry(new, struct inode_defrag, rb_node);
+ const struct inode_defrag *existing_defrag = rb_entry(existing, struct inode_defrag, rb_node);
+
+ return compare_inode_defrag(new_defrag, existing_defrag);
+}
+
/*
* Insert a record for an inode into the defrag tree. The lock must be held
* already.
@@ -71,49 +79,35 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct inode_defrag *entry;
- struct rb_node **p;
- struct rb_node *parent = NULL;
- int ret;
+ struct rb_node *node;
- p = &fs_info->defrag_inodes.rb_node;
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct inode_defrag, rb_node);
+ node = rb_find_add(&defrag->rb_node, &fs_info->defrag_inodes, inode_defrag_cmp);
+ if (node) {
+ struct inode_defrag *entry;
- ret = compare_inode_defrag(defrag, entry);
- if (ret < 0)
- p = &parent->rb_left;
- else if (ret > 0)
- p = &parent->rb_right;
- else {
- /*
- * If we're reinserting an entry for an old defrag run,
- * make sure to lower the transid of our existing
- * record.
- */
- if (defrag->transid < entry->transid)
- entry->transid = defrag->transid;
- entry->extent_thresh = min(defrag->extent_thresh,
- entry->extent_thresh);
- return -EEXIST;
- }
+ entry = rb_entry(node, struct inode_defrag, rb_node);
+ /*
+ * If we're reinserting an entry for an old defrag run, make
+ * sure to lower the transid of our existing record.
+ */
+ if (defrag->transid < entry->transid)
+ entry->transid = defrag->transid;
+ entry->extent_thresh = min(defrag->extent_thresh, entry->extent_thresh);
+ return -EEXIST;
}
set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
- rb_link_node(&defrag->rb_node, parent, p);
- rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
-static inline int need_auto_defrag(struct btrfs_fs_info *fs_info)
+static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
- return 0;
+ return false;
if (btrfs_fs_closing(fs_info))
- return 0;
+ return false;
- return 1;
+ return true;
}
/*
@@ -191,10 +185,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode(
if (parent && compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
- if (parent)
- entry = rb_entry(parent, struct inode_defrag, rb_node);
- else
- entry = NULL;
+ entry = rb_entry_safe(parent, struct inode_defrag, rb_node);
}
out:
if (entry)
@@ -225,7 +216,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct file_ra_state *ra)
{
struct btrfs_root *inode_root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_ioctl_defrag_range_args range;
int ret = 0;
u64 cur = 0;
@@ -250,24 +241,24 @@ again:
goto cleanup;
}
- if (cur >= i_size_read(inode)) {
- iput(inode);
+ if (cur >= i_size_read(&inode->vfs_inode)) {
+ iput(&inode->vfs_inode);
goto cleanup;
}
/* Do a chunk of defrag */
- clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
- file_ra_state_init(ra, inode->i_mapping);
+ file_ra_state_init(ra, inode->vfs_inode.i_mapping);
sb_start_write(fs_info->sb);
ret = btrfs_defrag_file(inode, ra, &range, defrag->transid,
- BTRFS_DEFRAG_BATCH);
+ BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret < 0)
goto cleanup;
@@ -624,7 +615,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
u64 ino = btrfs_ino(inode);
int ret;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto err;
@@ -734,12 +725,12 @@ next:
not_found:
btrfs_release_path(&path);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return NULL;
err:
btrfs_release_path(&path);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
@@ -756,7 +747,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* full extent lock.
*/
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, sectorsize);
+ em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize);
read_unlock(&em_tree->lock);
/*
@@ -769,7 +760,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* file extent items in the inode's subvolume tree).
*/
if (em && (em->flags & EXTENT_FLAG_MERGED)) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
}
@@ -779,10 +770,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* Get the big lock and read metadata off disk. */
if (!locked)
- lock_extent(io_tree, start, end, &cached);
+ btrfs_lock_extent(io_tree, start, end, &cached);
em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
- unlock_extent(io_tree, start, end, &cached);
+ btrfs_unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
@@ -794,7 +785,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -837,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
ret = true;
out:
- free_extent_map(next);
+ btrfs_free_extent_map(next);
return ret;
}
@@ -857,13 +848,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t
{
struct address_space *mapping = inode->vfs_inode.i_mapping;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- u64 page_start = (u64)index << PAGE_SHIFT;
- u64 page_end = page_start + PAGE_SIZE - 1;
+ u64 lock_start;
+ u64 lock_end;
struct extent_state *cached_state = NULL;
struct folio *folio;
int ret;
again:
+ /* TODO: Add order fgp order flags when large folios are fully enabled. */
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio))
@@ -871,13 +863,16 @@ again:
/*
* Since we can defragment files opened read-only, we can encounter
- * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
- * can't do I/O using huge pages yet, so return an error for now.
+ * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS).
+ *
+ * The IO for such large folios is not fully tested, thus return
+ * an error to reject such folios unless it's an experimental build.
+ *
* Filesystem transparent huge pages are typically only used for
* executables that explicitly enable them, so this isn't very
* restrictive.
*/
- if (folio_test_large(folio)) {
+ if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) {
folio_unlock(folio);
folio_put(folio);
return ERR_PTR(-ETXTBSY);
@@ -890,14 +885,15 @@ again:
return ERR_PTR(ret);
}
+ lock_start = folio_pos(folio);
+ lock_end = folio_end(folio) - 1;
/* Wait for any existing ordered extent in the range */
while (1) {
struct btrfs_ordered_extent *ordered;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
- ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, lock_start, folio_size(folio));
+ btrfs_unlock_extent(&inode->io_tree, lock_start, lock_end, &cached_state);
if (!ordered)
break;
@@ -951,7 +947,7 @@ struct defrag_target_range {
* @extent_thresh: file extent size threshold, any extent size >= this value
* will be ignored
* @newer_than: only defrag extents newer than this value
- * @do_compress: whether the defrag is doing compression
+ * @do_compress: whether the defrag is doing compression or no-compression
* if true, @extent_thresh will be ignored and all regular
* file extents meeting @newer_than will be targets.
* @locked: if the range has already held extent lock
@@ -1027,8 +1023,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
* very likely resulting in a larger extent after writeback is
* triggered (except in a case of free space fragmentation).
*/
- if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
- EXTENT_DELALLOC))
+ if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1,
+ EXTENT_DELALLOC))
goto next;
/*
@@ -1066,8 +1062,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
/* Empty target list, no way to merge with last entry */
if (list_empty(target_list))
goto next;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
+ last = list_last_entry(target_list,
+ struct defrag_target_range, list);
/* Not mergeable with last entry */
if (last->start + last->len != cur)
goto next;
@@ -1077,7 +1073,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
add:
last_is_target = true;
- range_len = min(extent_map_end(em), start + len) - cur;
+ range_len = min(btrfs_extent_map_end(em), start + len) - cur;
/*
* This one is a good target, check if it can be merged into
* last range of the target list.
@@ -1085,8 +1081,8 @@ add:
if (!list_empty(target_list)) {
struct defrag_target_range *last;
- last = list_entry(target_list->prev,
- struct defrag_target_range, list);
+ last = list_last_entry(target_list,
+ struct defrag_target_range, list);
ASSERT(last->start + last->len <= cur);
if (last->start + last->len == cur) {
/* Mergeable, enlarge the last entry */
@@ -1099,7 +1095,7 @@ add:
/* Allocate new defrag_target_range */
new = kmalloc(sizeof(*new), GFP_NOFS);
if (!new) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = -ENOMEM;
break;
}
@@ -1108,8 +1104,8 @@ add:
list_add_tail(&new->list, target_list);
next:
- cur = extent_map_end(em);
- free_extent_map(em);
+ cur = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
}
if (ret < 0) {
struct defrag_target_range *entry;
@@ -1162,27 +1158,30 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
struct extent_changeset *data_reserved = NULL;
const u64 start = target->start;
const u64 len = target->len;
- unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
- unsigned long start_index = start >> PAGE_SHIFT;
- unsigned long first_index = folios[0]->index;
int ret = 0;
- int i;
-
- ASSERT(last_index - first_index + 1 <= nr_pages);
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
if (ret < 0)
return ret;
- clear_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, cached_state);
- set_extent_bit(&inode->io_tree, start, start + len - 1,
- EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
-
- /* Update the page status */
- for (i = start_index - first_index; i <= last_index - first_index; i++) {
- folio_clear_checked(folios[i]);
- btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len);
+ btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, cached_state);
+ btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1,
+ EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state);
+
+ /*
+ * Update the page status.
+ * Due to possible large folios, we have to check all folios one by one.
+ */
+ for (int i = 0; i < nr_pages && folios[i]; i++) {
+ struct folio *folio = folios[i];
+
+ if (!folio)
+ break;
+ if (start >= folio_end(folio) || start + len <= folio_pos(folio))
+ continue;
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
@@ -1200,11 +1199,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
LIST_HEAD(target_list);
struct folio **folios;
const u32 sectorsize = inode->root->fs_info->sectorsize;
- u64 last_index = (start + len - 1) >> PAGE_SHIFT;
- u64 start_index = start >> PAGE_SHIFT;
- unsigned int nr_pages = last_index - start_index + 1;
+ u64 cur = start;
+ const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) -
+ (start >> PAGE_SHIFT) + 1;
int ret = 0;
- int i;
ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
@@ -1214,21 +1212,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
return -ENOMEM;
/* Prepare all pages */
- for (i = 0; i < nr_pages; i++) {
- folios[i] = defrag_prepare_one_folio(inode, start_index + i);
+ for (int i = 0; cur < start + len && i < nr_pages; i++) {
+ folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT);
if (IS_ERR(folios[i])) {
ret = PTR_ERR(folios[i]);
- nr_pages = i;
+ folios[i] = NULL;
goto free_folios;
}
+ cur = folio_end(folios[i]);
}
- for (i = 0; i < nr_pages; i++)
+ for (int i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_wait_writeback(folios[i]);
+ }
+ /* We should get at least one folio. */
+ ASSERT(folios[0]);
/* Lock the pages range */
- lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
/*
* Now we have a consistent view about the extent map, re-check
* which range really needs to be defragged.
@@ -1254,11 +1256,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
- unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
- (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state);
free_folios:
- for (i = 0; i < nr_pages; i++) {
+ for (int i = 0; i < nr_pages; i++) {
+ if (!folios[i])
+ break;
folio_unlock(folios[i]);
folio_put(folios[i]);
}
@@ -1352,17 +1354,19 @@ out:
* (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
* defragging all the range).
*/
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned long sectors_defragged = 0;
- u64 isize = i_size_read(inode);
+ u64 isize = i_size_read(&inode->vfs_inode);
u64 cur;
u64 last_byte;
bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS);
+ bool no_compress = (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS);
int compress_type = BTRFS_COMPRESS_ZLIB;
+ int compress_level = 0;
int ret = 0;
u32 extent_thresh = range->extent_thresh;
pgoff_t start_index;
@@ -1376,10 +1380,24 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
return -EINVAL;
if (do_compress) {
- if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
- return -EINVAL;
- if (range->compress_type)
- compress_type = range->compress_type;
+ if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) {
+ if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress.type) {
+ compress_type = range->compress.type;
+ compress_level = range->compress.level;
+ if (!btrfs_compress_level_valid(compress_type, compress_level))
+ return -EINVAL;
+ }
+ } else {
+ if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
+ return -EINVAL;
+ if (range->compress_type)
+ compress_type = range->compress_type;
+ }
+ } else if (range->flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS) {
+ compress_type = BTRFS_DEFRAG_DONT_COMPRESS;
+ compress_level = 1;
}
if (extent_thresh == 0)
@@ -1402,8 +1420,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* defrag range can be written sequentially.
*/
start_index = cur >> PAGE_SHIFT;
- if (start_index < inode->i_mapping->writeback_index)
- inode->i_mapping->writeback_index = start_index;
+ if (start_index < inode->vfs_inode.i_mapping->writeback_index)
+ inode->vfs_inode.i_mapping->writeback_index = start_index;
while (cur < last_byte) {
const unsigned long prev_sectors_defragged = sectors_defragged;
@@ -1420,27 +1438,30 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
(SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
cluster_end = min(cluster_end, last_byte);
- btrfs_inode_lock(BTRFS_I(inode), 0);
- if (IS_SWAPFILE(inode)) {
+ btrfs_inode_lock(inode, 0);
+ if (IS_SWAPFILE(&inode->vfs_inode)) {
ret = -ETXTBSY;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) {
+ btrfs_inode_unlock(inode, 0);
break;
}
- if (do_compress)
- BTRFS_I(inode)->defrag_compress = compress_type;
- ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
+ if (do_compress || no_compress) {
+ inode->defrag_compress = compress_type;
+ inode->defrag_compress_level = compress_level;
+ }
+ ret = defrag_one_cluster(inode, ra, cur,
cluster_end + 1 - cur, extent_thresh,
- newer_than, do_compress, &sectors_defragged,
+ newer_than, do_compress || no_compress,
+ &sectors_defragged,
max_to_defrag, &last_scanned);
if (sectors_defragged > prev_sectors_defragged)
- balance_dirty_pages_ratelimited(inode->i_mapping);
+ balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping);
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ btrfs_inode_unlock(inode, 0);
if (ret < 0)
break;
cur = max(cluster_end + 1, last_scanned);
@@ -1462,10 +1483,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
* need to be written back immediately.
*/
if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
- filemap_flush(inode->i_mapping);
+ filemap_flush(inode->vfs_inode.i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_flush(inode->i_mapping);
+ &inode->runtime_flags))
+ filemap_flush(inode->vfs_inode.i_mapping);
}
if (range->compress_type == BTRFS_COMPRESS_LZO)
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
@@ -1473,10 +1494,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
ret = sectors_defragged;
}
- if (do_compress) {
- btrfs_inode_lock(BTRFS_I(inode), 0);
- BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- btrfs_inode_unlock(BTRFS_I(inode), 0);
+ if (do_compress || no_compress) {
+ btrfs_inode_lock(inode, 0);
+ inode->defrag_compress = BTRFS_COMPRESS_NONE;
+ btrfs_inode_unlock(inode, 0);
}
return ret;
}
diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h
index 6b7596c4f0dc..a7f917a38dbf 100644
--- a/fs/btrfs/defrag.h
+++ b/fs/btrfs/defrag.h
@@ -6,14 +6,14 @@
#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct file_ra_state;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_root;
struct btrfs_trans_handle;
struct btrfs_ioctl_defrag_range_args;
-int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
+int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 88e900e5a43d..288e1776c02d 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -111,6 +111,18 @@
* making error handling and cleanup easier.
*/
+static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) {
+ ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id ==
+ BTRFS_SUB_GROUP_DATA_RELOC);
+ return fs_info->data_sinfo->sub_group[0];
+ }
+ return fs_info->data_sinfo;
+}
+
int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
{
struct btrfs_root *root = inode->root;
@@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes)
if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- return btrfs_reserve_data_bytes(fs_info, bytes, flush);
+ return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush);
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
@@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
else if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
- ret = btrfs_reserve_data_bytes(fs_info, len, flush);
+ ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush);
if (ret < 0)
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
if (ret < 0) {
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
extent_changeset_free(*reserved);
*reserved = NULL;
} else {
@@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
* which we can't sleep and is sure it won't affect qgroup reserved space.
* Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
- u64 len)
+void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len)
{
- struct btrfs_space_info *data_sinfo;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
ASSERT(IS_ALIGNED(len, fs_info->sectorsize));
- data_sinfo = fs_info->data_sinfo;
- btrfs_space_info_free_bytes_may_use(data_sinfo, len);
+ btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len);
}
/*
@@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
}
@@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
btrfs_inode_rsv_release(inode, true);
}
+/* Shrink a previously reserved extent to a new length. */
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len);
+ const u32 new_num_extents = count_max_extents(fs_info, new_len);
+ const int diff_num_extents = new_num_extents - reserved_num_extents;
+
+ ASSERT(new_len <= reserved_len);
+ if (new_num_extents == reserved_num_extents)
+ return;
+
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, diff_num_extents);
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+ if (btrfs_is_testing(fs_info))
+ return;
+
+ btrfs_inode_rsv_release(inode, true);
+}
+
/*
* Reserve data and metadata space for delalloc
*
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index 3f32953c0a80..6119c0d3f883 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,
u64 start, u64 len, bool qgroup_free);
-void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
- u64 len);
+void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
@@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
u64 disk_num_bytes, bool noflush);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
+void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len);
#endif /* BTRFS_DELALLOC_SPACE_H */
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 0b4933c6a889..0f8d8e275143 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -119,7 +119,12 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
return NULL;
}
-/* Will return either the node or PTR_ERR(-ENOMEM) */
+/*
+ * Look up an existing delayed node associated with @btrfs_inode or create a new
+ * one and insert it to the delayed nodes of the root.
+ *
+ * Return the delayed node, or error pointer on failure.
+ */
static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct btrfs_inode *btrfs_inode)
{
@@ -211,17 +216,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
static struct btrfs_delayed_node *btrfs_first_delayed_node(
struct btrfs_delayed_root *delayed_root)
{
- struct list_head *p;
- struct btrfs_delayed_node *node = NULL;
+ struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
- if (list_empty(&delayed_root->node_list))
- goto out;
-
- p = delayed_root->node_list.next;
- node = list_entry(p, struct btrfs_delayed_node, n_list);
- refcount_inc(&node->refs);
-out:
+ node = list_first_entry_or_null(&delayed_root->node_list,
+ struct btrfs_delayed_node, n_list);
+ if (node)
+ refcount_inc(&node->refs);
spin_unlock(&delayed_root->lock);
return node;
@@ -293,18 +294,15 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
struct btrfs_delayed_root *delayed_root)
{
- struct list_head *p;
- struct btrfs_delayed_node *node = NULL;
+ struct btrfs_delayed_node *node;
spin_lock(&delayed_root->lock);
- if (list_empty(&delayed_root->prepare_list))
- goto out;
-
- p = delayed_root->prepare_list.next;
- list_del_init(p);
- node = list_entry(p, struct btrfs_delayed_node, p_list);
- refcount_inc(&node->refs);
-out:
+ node = list_first_entry_or_null(&delayed_root->prepare_list,
+ struct btrfs_delayed_node, p_list);
+ if (node) {
+ list_del_init(&node->p_list);
+ refcount_inc(&node->refs);
+ }
spin_unlock(&delayed_root->lock);
return node;
@@ -336,6 +334,20 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len,
return item;
}
+static int delayed_item_index_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *index = key;
+ const struct btrfs_delayed_item *delayed_item = rb_entry(node,
+ struct btrfs_delayed_item, rb_node);
+
+ if (delayed_item->index < *index)
+ return 1;
+ else if (delayed_item->index > *index)
+ return -1;
+
+ return 0;
+}
+
/*
* Look up the delayed item by key.
*
@@ -349,21 +361,10 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
struct rb_root *root,
u64 index)
{
- struct rb_node *node = root->rb_node;
- struct btrfs_delayed_item *delayed_item = NULL;
-
- while (node) {
- delayed_item = rb_entry(node, struct btrfs_delayed_item,
- rb_node);
- if (delayed_item->index < index)
- node = node->rb_right;
- else if (delayed_item->index > index)
- node = node->rb_left;
- else
- return delayed_item;
- }
+ struct rb_node *node;
- return NULL;
+ node = rb_find(&index, root, delayed_item_index_cmp);
+ return rb_entry_safe(node, struct btrfs_delayed_item, rb_node);
}
static int btrfs_delayed_item_cmp(const struct rb_node *new,
@@ -371,14 +372,8 @@ static int btrfs_delayed_item_cmp(const struct rb_node *new,
{
const struct btrfs_delayed_item *new_item =
rb_entry(new, struct btrfs_delayed_item, rb_node);
- const struct btrfs_delayed_item *exist_item =
- rb_entry(exist, struct btrfs_delayed_item, rb_node);
- if (new_item->index < exist_item->index)
- return -1;
- if (new_item->index > exist_item->index)
- return 1;
- return 0;
+ return delayed_item_index_cmp(&new_item->index, exist);
}
static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
@@ -454,40 +449,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node)
{
- struct rb_node *p;
- struct btrfs_delayed_item *item = NULL;
+ struct rb_node *p = rb_first_cached(&delayed_node->ins_root);
- p = rb_first_cached(&delayed_node->ins_root);
- if (p)
- item = rb_entry(p, struct btrfs_delayed_item, rb_node);
-
- return item;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct btrfs_delayed_node *delayed_node)
{
- struct rb_node *p;
- struct btrfs_delayed_item *item = NULL;
+ struct rb_node *p = rb_first_cached(&delayed_node->del_root);
- p = rb_first_cached(&delayed_node->del_root);
- if (p)
- item = rb_entry(p, struct btrfs_delayed_item, rb_node);
-
- return item;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static struct btrfs_delayed_item *__btrfs_next_delayed_item(
struct btrfs_delayed_item *item)
{
- struct rb_node *p;
- struct btrfs_delayed_item *next = NULL;
+ struct rb_node *p = rb_next(&item->rb_node);
- p = rb_next(&item->rb_node);
- if (p)
- next = rb_entry(p, struct btrfs_delayed_item, rb_node);
-
- return next;
+ return rb_entry_safe(p, struct btrfs_delayed_item, rb_node);
}
static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -1025,8 +1005,16 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
if (ret > 0)
ret = -ENOENT;
- if (ret < 0)
+ if (ret < 0) {
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the
+ * improper counts behind.
+ */
+ if (ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1051,8 +1039,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
goto err_out;
+ }
ASSERT(ret > 0);
ASSERT(path->slots[0] > 0);
ret = 0;
@@ -1074,21 +1064,14 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
* in the same item doesn't exist.
*/
ret = btrfs_del_item(trans, root, path);
+ if (ret < 0)
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_release_delayed_iref(node);
btrfs_release_path(path);
err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
-
- /*
- * If we fail to update the delayed inode we need to abort the
- * transaction, because we could leave the inode with the improper
- * counts behind.
- */
- if (ret && ret != -ENOENT)
- btrfs_abort_transaction(trans, ret);
-
return ret;
}
@@ -1211,7 +1194,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_block_rsv *block_rsv;
int ret;
@@ -1238,7 +1221,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
btrfs_release_delayed_node(delayed_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
@@ -1395,20 +1377,23 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info)
{
- WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root));
+ struct btrfs_delayed_node *node = btrfs_first_delayed_node(fs_info->delayed_root);
+
+ if (WARN_ON(node))
+ refcount_dec(&node->refs);
}
-static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
+static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
int val = atomic_read(&delayed_root->items_seq);
if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
- return 1;
+ return true;
if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
- return 1;
+ return true;
- return 0;
+ return false;
}
void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
@@ -1555,8 +1540,8 @@ release_node:
return ret;
}
-static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
- u64 index)
+static bool btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
+ u64 index)
{
struct btrfs_delayed_item *item;
@@ -1564,7 +1549,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
item = __btrfs_lookup_delayed_item(&node->ins_root.rb_root, index);
if (!item) {
mutex_unlock(&node->mutex);
- return 1;
+ return false;
}
/*
@@ -1599,7 +1584,7 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_delayed_node *node,
}
mutex_unlock(&node->mutex);
- return 0;
+ return true;
}
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
@@ -1613,9 +1598,10 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (IS_ERR(node))
return PTR_ERR(node);
- ret = btrfs_delete_delayed_insertion_item(node, index);
- if (!ret)
+ if (btrfs_delete_delayed_insertion_item(node, index)) {
+ ret = 0;
goto end;
+ }
item = btrfs_alloc_delayed_item(0, node, BTRFS_DELAYED_DELETION_ITEM);
if (!item) {
@@ -1632,7 +1618,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
*/
if (ret < 0) {
btrfs_err(trans->fs_info,
-"metadata reservation failed for delayed dir item deltiona, should have been reserved");
+"metadata reservation failed for delayed dir item deletion, index: %llu, root: %llu, inode: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_release_delayed_item(item);
goto end;
}
@@ -1641,9 +1628,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
ret = __btrfs_add_delayed_item(node, item);
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
- "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, btrfs_root_id(node->root),
- node->inode_id, ret);
+"failed to add delayed dir index item, root: %llu, inode: %llu, index: %llu, error: %d",
+ index, btrfs_root_id(node->root), node->inode_id, ret);
btrfs_delayed_item_release_metadata(dir->root, item);
btrfs_release_delayed_item(item);
}
@@ -1748,17 +1734,16 @@ void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
downgrade_write(&inode->vfs_inode.i_rwsem);
}
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index)
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index)
{
struct btrfs_delayed_item *curr;
- int ret = 0;
+ bool ret = false;
list_for_each_entry(curr, del_list, readdir_list) {
if (curr->index > index)
break;
if (curr->index == index) {
- ret = 1;
+ ret = true;
break;
}
}
@@ -1768,15 +1753,14 @@ int btrfs_should_delete_dir_index(const struct list_head *del_list,
/*
* Read dir info stored in the delayed tree.
*/
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list)
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list)
{
struct btrfs_dir_item *di;
struct btrfs_delayed_item *curr, *next;
struct btrfs_key location;
char *name;
int name_len;
- int over = 0;
unsigned char d_type;
/*
@@ -1785,6 +1769,8 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
* directory, nobody can delete any directory indexes now.
*/
list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
+ bool over;
+
list_del(&curr->readdir_list);
if (curr->index < ctx->pos) {
@@ -1802,68 +1788,67 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
- over = !dir_emit(ctx, name, name_len,
- location.objectid, d_type);
+ over = !dir_emit(ctx, name, name_len, location.objectid, d_type);
if (refcount_dec_and_test(&curr->refs))
kfree(curr);
if (over)
- return 1;
+ return true;
ctx->pos++;
}
- return 0;
+ return false;
}
static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
+ struct inode *vfs_inode = &inode->vfs_inode;
u64 flags;
- btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode));
- btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode));
- btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
- btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
- btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
- btrfs_set_stack_inode_generation(inode_item,
- BTRFS_I(inode)->generation);
+ btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
+ btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
+ btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
+ btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
+ btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
+ btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
+ btrfs_set_stack_inode_generation(inode_item, inode->generation);
btrfs_set_stack_inode_sequence(inode_item,
- inode_peek_iversion(inode));
+ inode_peek_iversion(vfs_inode));
btrfs_set_stack_inode_transid(inode_item, trans->transid);
- btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
- flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
- BTRFS_I(inode)->ro_flags);
+ btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev);
+ flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags);
btrfs_set_stack_inode_flags(inode_item, flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
btrfs_set_stack_timespec_sec(&inode_item->atime,
- inode_get_atime_sec(inode));
+ inode_get_atime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->atime,
- inode_get_atime_nsec(inode));
+ inode_get_atime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->mtime,
- inode_get_mtime_sec(inode));
+ inode_get_mtime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->mtime,
- inode_get_mtime_nsec(inode));
+ inode_get_mtime_nsec(vfs_inode));
btrfs_set_stack_timespec_sec(&inode_item->ctime,
- inode_get_ctime_sec(inode));
+ inode_get_ctime_sec(vfs_inode));
btrfs_set_stack_timespec_nsec(&inode_item->ctime,
- inode_get_ctime_nsec(inode));
+ inode_get_ctime_nsec(vfs_inode));
- btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec);
+ btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec);
}
-int btrfs_fill_inode(struct inode *inode, u32 *rdev)
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev)
{
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
+ struct inode *vfs_inode = &inode->vfs_inode;
- delayed_node = btrfs_get_delayed_node(BTRFS_I(inode));
+ delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
return -ENOENT;
@@ -1876,39 +1861,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
inode_item = &delayed_node->inode_item;
- i_uid_write(inode, btrfs_stack_inode_uid(inode_item));
- i_gid_write(inode, btrfs_stack_inode_gid(inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
- inode->i_mode = btrfs_stack_inode_mode(inode_item);
- set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
- inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
- BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
- BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item);
-
- inode_set_iversion_queried(inode,
- btrfs_stack_inode_sequence(inode_item));
- inode->i_rdev = 0;
+ i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item));
+ i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item));
+ btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+ vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item);
+ set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item));
+ inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item));
+ inode->generation = btrfs_stack_inode_generation(inode_item);
+ inode->last_trans = btrfs_stack_inode_transid(inode_item);
+
+ inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item));
+ vfs_inode->i_rdev = 0;
*rdev = btrfs_stack_inode_rdev(inode_item);
btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
- inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime),
+ inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime),
btrfs_stack_timespec_nsec(&inode_item->atime));
- inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime),
btrfs_stack_timespec_nsec(&inode_item->mtime));
- inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime),
btrfs_stack_timespec_nsec(&inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
+ inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime);
+ inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime);
- inode->i_generation = BTRFS_I(inode)->generation;
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ vfs_inode->i_generation = inode->generation;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
@@ -1928,8 +1912,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
- fill_stack_inode_item(trans, &delayed_node->inode_item,
- &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
goto release_node;
}
@@ -1937,7 +1920,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
if (ret)
goto release_node;
- fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode);
+ fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index f4d9feac0d0e..e6e763ad2d42 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
-int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev);
int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode);
/* Used for drop dead root */
@@ -150,10 +150,9 @@ bool btrfs_readdir_get_delayed_items(struct btrfs_inode *inode,
void btrfs_readdir_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
-int btrfs_should_delete_dir_index(const struct list_head *del_list,
- u64 index);
-int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
- const struct list_head *ins_list);
+bool btrfs_should_delete_dir_index(const struct list_head *del_list, u64 index);
+bool btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
+ const struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 98c5b61dabe8..ca382c5b186f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -331,12 +331,9 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
struct rb_node *node = &ins->ref_node;
- struct rb_node *exist;
+ struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node);
- exist = rb_find_add_cached(node, root, cmp_refs_node);
- if (exist)
- return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node);
- return NULL;
+ return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node);
}
static struct btrfs_delayed_ref_head *find_first_ref_head(
@@ -931,7 +928,7 @@ static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
- if (is_fstree(generic_ref->ref_root))
+ if (btrfs_is_fstree(generic_ref->ref_root))
seq = atomic64_read(&fs_info->tree_mod_seq);
refcount_set(&ref->refs, 1);
@@ -961,8 +958,8 @@ void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, u64 mod_root,
#endif
generic_ref->tree_ref.level = level;
generic_ref->type = BTRFS_REF_METADATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -979,8 +976,8 @@ void btrfs_init_data_ref(struct btrfs_ref *generic_ref, u64 ino, u64 offset,
generic_ref->data_ref.objectid = ino;
generic_ref->data_ref.offset = offset;
generic_ref->type = BTRFS_REF_DATA;
- if (skip_qgroup || !(is_fstree(generic_ref->ref_root) &&
- (!mod_root || is_fstree(mod_root))))
+ if (skip_qgroup || !(btrfs_is_fstree(generic_ref->ref_root) &&
+ (!mod_root || btrfs_is_fstree(mod_root))))
generic_ref->skip_qgroup = true;
else
generic_ref->skip_qgroup = false;
@@ -1339,7 +1336,7 @@ int __init btrfs_delayed_ref_init(void)
{
btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0);
if (!btrfs_delayed_ref_head_cachep)
- goto fail;
+ return -ENOMEM;
btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0);
if (!btrfs_delayed_ref_node_cachep)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a35067cebb97..552ec4fa645d 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -14,6 +14,8 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <uapi/linux/btrfs_tree.h>
+#include "fs.h"
+#include "messages.h"
struct btrfs_trans_handle;
struct btrfs_fs_info;
@@ -260,7 +262,6 @@ enum btrfs_ref_type {
BTRFS_REF_NOT_SET,
BTRFS_REF_DATA,
BTRFS_REF_METADATA,
- BTRFS_REF_LAST,
} __packed;
struct btrfs_ref {
@@ -419,7 +420,7 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
-static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_owner(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -427,7 +428,7 @@ static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
return node->tree_ref.level;
}
-static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
+static inline u64 btrfs_delayed_ref_offset(const struct btrfs_delayed_ref_node *node)
{
if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
node->type == BTRFS_SHARED_DATA_REF_KEY)
@@ -435,7 +436,7 @@ static inline u64 btrfs_delayed_ref_offset(struct btrfs_delayed_ref_node *node)
return 0;
}
-static inline u8 btrfs_ref_type(struct btrfs_ref *ref)
+static inline u8 btrfs_ref_type(const struct btrfs_ref *ref)
{
ASSERT(ref->type == BTRFS_REF_DATA || ref->type == BTRFS_REF_METADATA);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f86fbea0b3de..4675bcd5f92e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct extent_buffer *eb;
int slot;
int ret = 0;
- struct btrfs_path *path = NULL;
+ BTRFS_PATH_AUTO_FREE(path);
int item_size;
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
@@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
return 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
@@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found:
if (btrfs_find_device(fs_info->fs_devices, &args)) {
btrfs_err(fs_info,
"found replace target device without a valid replace item");
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
- ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
@@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found:
dev_replace->tgtdev = NULL;
dev_replace->is_valid = 0;
dev_replace->item_needs_writeback = 0;
- goto out;
+ return 0;
}
slot = path->slots[0];
eb = path->nodes[0];
@@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found:
break;
}
-out:
- btrfs_free_path(path);
return ret;
}
@@ -256,7 +250,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev_file);
@@ -333,7 +327,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
return 0;
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
return ret;
}
@@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_replace_item *ptr;
@@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
key.offset = 0;
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
+
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info,
"error %d while searching for dev_replace item!",
ret);
- goto out;
+ return ret;
}
if (ret == 0 &&
@@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
btrfs_warn(fs_info,
"delete too small dev_replace item failed %d!",
ret);
- goto out;
+ return ret;
}
ret = 1;
}
@@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
if (ret < 0) {
btrfs_warn(fs_info,
"insert dev_replace item failed %d!", ret);
- goto out;
+ return ret;
}
}
@@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
-out:
- btrfs_free_path(path);
return ret;
}
@@ -609,7 +600,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(src_device);
if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot replace device %s (devid %llu) due to active swapfile",
btrfs_dev_name(src_device), src_device->devid);
return -ETXTBSY;
@@ -646,7 +637,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
- ASSERT(0);
+ DEBUG_WARN("unexpected STARTED ot SUSPENDED dev-replace state");
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
@@ -656,7 +647,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
@@ -803,17 +794,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
- while (find_first_extent_bit(&srcdev->alloc_state, start,
- &found_start, &found_end,
- CHUNK_ALLOCATED, &cached_state)) {
- ret = set_extent_bit(&tgtdev->alloc_state, found_start,
- found_end, CHUNK_ALLOCATED, NULL);
+ while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED, NULL);
if (ret)
break;
start = found_end + 1;
}
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
return ret;
}
@@ -952,7 +943,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
tgt_device);
} else {
if (scrub_ret != -ECANCELED)
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
@@ -970,7 +961,7 @@ error:
return scrub_ret;
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
@@ -1118,7 +1109,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
* btrfs_dev_replace_finishing() will handle the
* cleanup part
*/
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1152,7 +1143,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"suspended dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
@@ -1256,7 +1247,7 @@ static int btrfs_dev_replace_kthread(void *data)
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
@@ -1274,16 +1265,16 @@ static int btrfs_dev_replace_kthread(void *data)
return 0;
}
-int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
+bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
- return 0;
+ return false;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
- return 0;
+ return false;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
@@ -1298,7 +1289,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
*/
break;
}
- return 1;
+ return true;
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 23e480efe5e6..b35cecf388f2 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info);
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info);
-int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
+bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index ccf91de29f80..69863e398e22 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
return di;
}
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name)
{
int ret;
@@ -236,13 +236,13 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int data_size;
struct extent_buffer *leaf;
int slot;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- key.objectid = dir;
+ key.objectid = dir_ino;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name->name, name->len);
@@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
- if (ret == -ENOENT) {
- ret = 0;
- goto out;
- }
+ if (ret == -ENOENT)
+ return 0;
if (ret < 0)
- goto out;
+ return ret;
}
/* we found an item, look for our name in the item */
if (di) {
/* our exact name was found */
- ret = -EEXIST;
- goto out;
+ return -EEXIST;
}
/* See if there is room in the item to insert this name. */
@@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) {
- ret = -EOVERFLOW;
- } else {
- /* plenty of insertion room */
- ret = 0;
+ return -EOVERFLOW;
}
-out:
- btrfs_free_path(path);
- return ret;
+
+ /* Plenty of insertion room. */
+ return 0;
}
/*
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 28d69970bc70..e52174a8baf9 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -10,10 +10,11 @@ struct fscrypt_str;
struct btrfs_fs_info;
struct btrfs_key;
struct btrfs_path;
+struct btrfs_inode;
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index 8567af46e16f..fe9a4bd7e6e6 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -42,21 +42,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
/* Direct lock must be taken before the extent lock. */
if (nowait) {
- if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
+ if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
return -EAGAIN;
} else {
- lock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
}
while (1) {
if (nowait) {
- if (!try_lock_extent(io_tree, lockstart, lockend,
- cached_state)) {
+ if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
+ cached_state)) {
ret = -EAGAIN;
break;
}
} else {
- lock_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
}
/*
* We're concerned with the entire range that we're going to be
@@ -78,7 +78,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
lockstart, lockend)))
break;
- unlock_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);
if (ordered) {
if (nowait) {
@@ -131,7 +131,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
}
if (ret)
- unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
+ btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
return ret;
}
@@ -151,11 +151,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
}
ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
- (1 << type) |
- (1 << BTRFS_ORDERED_DIRECT));
+ (1U << type) |
+ (1U << BTRFS_ORDERED_DIRECT));
if (IS_ERR(ordered)) {
if (em) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
btrfs_drop_extent_map_range(inode, start,
start + file_extent->num_bytes - 1, false);
}
@@ -204,8 +204,7 @@ again:
BTRFS_ORDERED_REGULAR);
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
if (IS_ERR(em))
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
- 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
return em;
}
@@ -246,9 +245,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
else
type = BTRFS_ORDERED_NOCOW;
len = min(len, em->len - (start - em->start));
- block_start = extent_map_block_start(em) + (start - em->start);
+ block_start = btrfs_extent_map_block_start(em) + (start - em->start);
- if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) {
+ if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
+ false) == 1) {
bg = btrfs_inc_nocow_writers(fs_info, block_start);
if (bg)
can_nocow = true;
@@ -264,7 +264,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
nowait);
if (ret < 0) {
/* Our caller expects us to free the input extent map. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = NULL;
btrfs_dec_nocow_writers(bg);
if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
@@ -277,7 +277,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
&file_extent, type);
btrfs_dec_nocow_writers(bg);
if (type == BTRFS_ORDERED_PREALLOC) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = em2;
em = em2;
}
@@ -290,7 +290,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
dio_data->nocow_done = true;
} else {
/* Our caller expects us to free the input extent map. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*map = NULL;
if (nowait) {
@@ -439,8 +439,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
start, data_alloc_len, false);
if (!ret)
dio_data->data_space_reserved = true;
- else if (ret && !(BTRFS_I(inode)->flags &
- (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
+ else if (!(BTRFS_I(inode)->flags &
+ (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
goto err;
}
@@ -473,8 +473,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
- free_extent_map(em);
+ if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
+ btrfs_free_extent_map(em);
/*
* If we are in a NOWAIT context, return -EAGAIN in order to
* fallback to buffered IO. This is not only because we can
@@ -515,7 +515,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* after we have submitted bios for all the extents in the range.
*/
if ((flags & IOMAP_NOWAIT) && len < length) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = -EAGAIN;
goto unlock_err;
}
@@ -557,13 +557,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
- iomap->addr = extent_map_block_start(em) + (start - em->start);
+ iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
iomap->type = IOMAP_MAPPED;
}
iomap->offset = start;
iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
iomap->length = len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/*
* Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
@@ -574,13 +574,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
if (write)
unlock_bits |= EXTENT_DIO_LOCKED;
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- unlock_bits, &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_bits, &cached_state);
/* We didn't use everything, unlock the dio extent for the remainder. */
if (!write && (start + len) < lockend)
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
- lockend, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
+ lockend, NULL);
return 0;
@@ -590,8 +590,8 @@ unlock_err:
* to update this, be explicit that we expect EXTENT_LOCKED and
* EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err:
if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
@@ -614,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
return 0;
}
@@ -626,8 +626,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
btrfs_finish_ordered_extent(dio_data->ordered, NULL,
pos, length, false);
else
- unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
- pos + length - 1, NULL);
+ btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
+ pos + length - 1, NULL);
ret = -ENOTBLK;
}
if (write) {
@@ -659,8 +659,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes,
!bio->bi_status);
} else {
- unlock_dio_extent(&inode->io_tree, dip->file_offset,
- dip->file_offset + dip->bytes - 1, NULL);
+ btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
+ dip->file_offset + dip->bytes - 1, NULL);
}
bbio->bio.bi_private = bbio->private;
@@ -691,9 +691,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
* a pre-existing one.
*/
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
- ret = split_extent_map(bbio->inode, bbio->file_offset,
- ordered->num_bytes, len,
- ordered->disk_bytenr);
+ ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
+ ordered->num_bytes, len,
+ ordered->disk_bytenr);
if (ret)
return ret;
}
@@ -855,6 +855,22 @@ relock:
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
+ /*
+ * We can't control the folios being passed in, applications can write
+ * to them while a direct IO write is in progress. This means the
+ * content might change after we calculated the data checksum.
+ * Therefore we can end up storing a checksum that doesn't match the
+ * persisted data.
+ *
+ * To be extra safe and avoid false data checksum mismatch, if the
+ * inode requires data checksum, just fallback to buffered IO.
+ * For buffered IO we have full control of page cache and can ensure
+ * no one is modifying the content during writeback.
+ */
+ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+ btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
+ goto buffered;
+ }
/*
* The iov_iter can be mapped to the same file range we are writing to.
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h
index 3dc3ea926afe..df5d45ee6de7 100644
--- a/fs/btrfs/direct-io.h
+++ b/fs/btrfs/direct-io.h
@@ -5,6 +5,8 @@
#include <linux/types.h>
+struct kiocb;
+
int __init btrfs_init_dio(void);
void __cold btrfs_destroy_dio(void);
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index e815d165cccc..89fe85778115 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
{
lockdep_assert_held(&discard_ctl->lock);
- if (!btrfs_run_discard_work(discard_ctl))
- return;
if (list_empty(&block_group->discard_list) ||
block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
@@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
if (!btrfs_is_block_group_data_only(block_group))
return;
+ if (!btrfs_run_discard_work(discard_ctl))
+ return;
+
spin_lock(&discard_ctl->lock);
__add_to_discard_list(discard_ctl, block_group);
spin_unlock(&discard_ctl->lock);
@@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
block_group->discard_eligible_time = 0;
queued = !list_empty(&block_group->discard_list);
list_del_init(&block_group->discard_list);
- /*
- * If the block group is currently running in the discard workfn, we
- * don't want to deref it, since it's still being used by the workfn.
- * The workfn will notice this case and deref the block group when it is
- * finished.
- */
- if (queued && !running)
+ if (queued)
btrfs_put_block_group(block_group);
spin_unlock(&discard_ctl->lock);
@@ -250,6 +245,20 @@ again:
block_group->used != 0) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
+ /*
+ * The block group must have been moved to other
+ * discard list even if discard was disabled in
+ * the meantime or a transaction abort happened,
+ * otherwise we can end up in an infinite loop,
+ * always jumping into the 'again' label and
+ * keep getting this block group over and over
+ * in case there are no other block groups in
+ * the discard lists.
+ */
+ ASSERT(block_group->discard_index !=
+ BTRFS_DISCARD_INDEX_UNUSED,
+ "discard_index=%d",
+ block_group->discard_index);
} else {
list_del_init(&block_group->discard_list);
btrfs_put_block_group(block_group);
@@ -260,9 +269,10 @@ again:
block_group->discard_cursor = block_group->start;
block_group->discard_state = BTRFS_DISCARD_EXTENTS;
}
- discard_ctl->block_group = block_group;
}
if (block_group) {
+ btrfs_get_block_group(block_group);
+ discard_ctl->block_group = block_group;
*discard_state = block_group->discard_state;
*discard_index = block_group->discard_index;
}
@@ -493,9 +503,20 @@ static void btrfs_discard_workfn(struct work_struct *work)
block_group = peek_discard_list(discard_ctl, &discard_state,
&discard_index, now);
- if (!block_group || !btrfs_run_discard_work(discard_ctl))
+ if (!block_group)
+ return;
+ if (!btrfs_run_discard_work(discard_ctl)) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
return;
+ }
if (now < block_group->discard_eligible_time) {
+ spin_lock(&discard_ctl->lock);
+ btrfs_put_block_group(block_group);
+ discard_ctl->block_group = NULL;
+ spin_unlock(&discard_ctl->lock);
btrfs_discard_schedule_work(discard_ctl, false);
return;
}
@@ -547,15 +568,7 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_lock(&discard_ctl->lock);
discard_ctl->prev_discard = trimmed;
discard_ctl->prev_discard_time = now;
- /*
- * If the block group was removed from the discard list while it was
- * running in this workfn, then we didn't deref it, since this function
- * still owned that reference. But we set the discard_ctl->block_group
- * back to NULL, so we can use that condition to know that now we need
- * to deref the block_group.
- */
- if (discard_ctl->block_group == NULL)
- btrfs_put_block_group(block_group);
+ btrfs_put_block_group(block_group);
discard_ctl->block_group = NULL;
__btrfs_discard_schedule_work(discard_ctl, now, false);
spin_unlock(&discard_ctl->lock);
diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
index dddb0f9101ba..2c5e85394092 100644
--- a/fs/btrfs/discard.h
+++ b/fs/btrfs/discard.h
@@ -3,6 +3,7 @@
#ifndef BTRFS_DISCARD_H
#define BTRFS_DISCARD_H
+#include <linux/types.h>
#include <linux/sizes.h>
struct btrfs_fs_info;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f09db62e61a1..70fc4e7cc5a0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -182,22 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
u64 start = max_t(u64, eb->start, folio_pos(folio));
u64 end = min_t(u64, eb->start + eb->len,
folio_pos(folio) + eb->folio_size);
u32 len = end - start;
+ phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) +
+ offset_in_folio(folio, start);
- ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, folio, offset_in_folio(folio, start),
- mirror_num);
+ ret = btrfs_repair_io_failure(fs_info, 0, start, len, start,
+ paddr, mirror_num);
if (ret)
break;
}
@@ -225,7 +225,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
ASSERT(check);
while (1) {
- clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(eb, mirror_num, check);
if (!ret)
break;
@@ -257,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb,
/*
* Checksum a dirty tree block before IO.
*/
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
+int btree_csum_one_bio(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -268,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
/* Btree blocks are always contiguous on disk. */
if (WARN_ON_ONCE(bbio->file_offset != eb->start))
- return BLK_STS_IOERR;
+ return -EIO;
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
- return BLK_STS_IOERR;
+ return -EIO;
/*
* If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
@@ -279,14 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
*/
if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
memzero_extent_buffer(eb, 0, eb->len);
- return BLK_STS_OK;
+ return 0;
}
if (WARN_ON_ONCE(found_start != eb->start))
- return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
- eb->start, eb->len)))
- return BLK_STS_IOERR;
+ return -EIO;
+ if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb)))
+ return -EIO;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
offsetof(struct btrfs_header, fsid),
@@ -314,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
goto error;
}
write_extent_buffer(eb, result, 0, fs_info->csum_size);
- return BLK_STS_OK;
+ return 0;
error:
btrfs_print_tree(eb, 0);
@@ -328,7 +326,7 @@ error:
*/
WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
- return errno_to_blk_status(ret);
+ return ret;
}
static bool check_tree_block_fsid(struct extent_buffer *eb)
@@ -454,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
goto out;
}
- /*
- * If this is a leaf block and it is corrupt, set the corrupt bit so
- * that we don't try and read the other copies of this block, just
- * return -EIO.
- */
- if (found_level == 0 && btrfs_check_leaf(eb)) {
- set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
+ /* If this is a leaf block and it is corrupt, just return -EIO. */
+ if (found_level == 0 && btrfs_check_leaf(eb))
ret = -EIO;
- }
if (found_level > 0 && btrfs_check_node(eb))
ret = -EIO;
@@ -643,11 +635,16 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
}
-static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
- u64 objectid)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
+ u64 objectid, gfp_t flags)
{
+ struct btrfs_root *root;
bool dummy = btrfs_is_testing(fs_info);
+ root = kzalloc(sizeof(*root), flags);
+ if (!root)
+ return NULL;
+
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -700,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_set_root_last_log_commit(root, 0);
root->anon_dev = 0;
if (!dummy) {
- extent_io_tree_init(fs_info, &root->dirty_log_pages,
- IO_TREE_ROOT_DIRTY_LOG_PAGES);
- extent_io_tree_init(fs_info, &root->log_csum_range,
- IO_TREE_LOG_CSUM_RANGE);
+ btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages,
+ IO_TREE_ROOT_DIRTY_LOG_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &root->log_csum_range,
+ IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@@ -714,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
-}
-static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
- u64 objectid, gfp_t flags)
-{
- struct btrfs_root *root = kzalloc(sizeof(*root), flags);
- if (root)
- __setup_root(root, fs_info, objectid);
return root;
}
@@ -894,7 +884,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, leaf->len);
btrfs_set_root_last_snapshot(&root->root_item, 0);
btrfs_set_root_dirid(&root->root_item, 0);
- if (is_fstree(objectid))
+ if (btrfs_is_fstree(objectid))
generate_random_guid(root->root_item.uuid);
else
export_guid(root->root_item.uuid, &guid_null);
@@ -1089,21 +1079,22 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key)
{
struct btrfs_root *root;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return ERR_PTR(-ENOMEM);
root = read_tree_root_path(tree_root, path, key);
- btrfs_free_path(path);
return root;
}
/*
- * Initialize subvolume root in-memory structure
+ * Initialize subvolume root in-memory structure.
*
* @anon_dev: anonymous device to attach to the root, if zero, allocate new
+ *
+ * In case of failure the caller is responsible to call btrfs_free_fs_root()
*/
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
@@ -1113,7 +1104,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
!btrfs_is_data_reloc_root(root) &&
- is_fstree(btrfs_root_id(root))) {
+ btrfs_is_fstree(btrfs_root_id(root))) {
set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1122,12 +1113,12 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
* Don't assign anonymous block device to roots that are not exposed to
* userspace, the id pool is limited to 1M
*/
- if (is_fstree(btrfs_root_id(root)) &&
+ if (btrfs_is_fstree(btrfs_root_id(root)) &&
btrfs_root_refs(&root->root_item) > 0) {
if (!anon_dev) {
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ return ret;
} else {
root->anon_dev = anon_dev;
}
@@ -1137,7 +1128,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
ret = btrfs_init_root_free_objectid(root);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto fail;
+ return ret;
}
ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1145,9 +1136,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
mutex_unlock(&root->objectid_mutex);
return 0;
-fail:
- /* The caller is responsible to call btrfs_free_fs_root */
- return ret;
}
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -1258,6 +1246,8 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;
+ if (fs_info->fs_devices)
+ btrfs_close_devices(fs_info->fs_devices);
percpu_counter_destroy(&fs_info->stats_read_blocks);
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
@@ -1327,7 +1317,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
* This is namely for free-space-tree and quota tree, which can change
* at runtime and should only be grabbed from fs_info.
*/
- if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (!btrfs_is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
return ERR_PTR(-ENOENT);
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
@@ -1568,7 +1558,7 @@ static int transaction_kthread(void *arg)
do {
cannot_commit = false;
- delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
+ delay = secs_to_jiffies(fs_info->commit_interval);
mutex_lock(&fs_info->transaction_kthread_mutex);
spin_lock(&fs_info->trans_lock);
@@ -1583,9 +1573,9 @@ static int transaction_kthread(void *arg)
cur->state < TRANS_STATE_COMMIT_PREP &&
delta < fs_info->commit_interval) {
spin_unlock(&fs_info->trans_lock);
- delay -= msecs_to_jiffies((delta - 1) * 1000);
+ delay -= secs_to_jiffies(delta - 1);
delay = min(delay,
- msecs_to_jiffies(fs_info->commit_interval * 1000));
+ secs_to_jiffies(fs_info->commit_interval));
goto sleep;
}
transid = cur->transid;
@@ -1847,6 +1837,8 @@ void btrfs_put_root(struct btrfs_root *root)
if (refcount_dec_and_test(&root->refs)) {
if (WARN_ON(!xa_empty(&root->inodes)))
xa_destroy(&root->inodes);
+ if (WARN_ON(!xa_empty(&root->delayed_nodes)))
+ xa_destroy(&root->delayed_nodes);
WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
@@ -1867,8 +1859,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
int i;
while (!list_empty(&fs_info->dead_roots)) {
- gang[0] = list_entry(fs_info->dead_roots.next,
- struct btrfs_root, root_list);
+ gang[0] = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
@@ -1931,9 +1923,9 @@ static int btrfs_init_btree_inode(struct super_block *sb)
inode->i_mapping->a_ops = &btree_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
- IO_TREE_BTREE_INODE_IO);
- extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
+ btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
+ IO_TREE_BTREE_INODE_IO);
+ btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
@@ -1957,7 +1949,6 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
@@ -2006,7 +1997,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
ordered_flags);
fs_info->discard_ctl.discard_workers =
- alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);
+ alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE);
if (!(fs_info->workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
@@ -2038,14 +2029,10 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
- /*
- * Check if the checksum implementation is a fast accelerated one.
- * As-is this is a bit of a hack and should be replaced once the csum
- * implementations provide that information themselves.
- */
+ /* Check if the checksum implementation is a fast accelerated one. */
switch (csum_type) {
case BTRFS_CSUM_TYPE_CRC32:
- if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ if (crc32_optimizations() & CRC32C_OPTIMIZATION)
set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
break;
case BTRFS_CSUM_TYPE_XXHASH:
@@ -2168,8 +2155,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
found = true;
root = read_tree_root_path(tree_root, path, &key);
if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
- ret = PTR_ERR(root);
+ ret = PTR_ERR(root);
break;
}
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
@@ -2200,8 +2186,8 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root,
static int load_global_roots(struct btrfs_root *tree_root)
{
- struct btrfs_path *path;
- int ret = 0;
+ BTRFS_PATH_AUTO_FREE(path);
+ int ret;
path = btrfs_alloc_path();
if (!path)
@@ -2210,18 +2196,17 @@ static int load_global_roots(struct btrfs_root *tree_root)
ret = load_global_roots_objectid(tree_root, path,
BTRFS_EXTENT_TREE_OBJECTID, "extent");
if (ret)
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_CSUM_TREE_OBJECTID, "csum");
if (ret)
- goto out;
+ return ret;
if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
- goto out;
+ return ret;
ret = load_global_roots_objectid(tree_root, path,
BTRFS_FREE_SPACE_TREE_OBJECTID,
"free space");
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -2447,21 +2432,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
* Check sectorsize and nodesize first, other check will need it.
* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
*/
- if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+ if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE ||
sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
ret = -EINVAL;
}
/*
- * We only support at most two sectorsizes: 4K and PAGE_SIZE.
+ * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE.
+ *
+ * For 4K page sized systems with non-debug builds, all 3 matches (4K).
+ * For 4K page sized systems with debug builds, there are two block sizes
+ * supported. (4K and 2K)
*
* We can support 16K sectorsize with 64K page size without problem,
* but such sectorsize/pagesize combination doesn't make much sense.
* 4K will be our future standard, PAGE_SIZE is supported from the very
* beginning.
*/
- if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
+ if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K &&
+ sectorsize != PAGE_SIZE &&
+ sectorsize != BTRFS_MIN_BLOCKSIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
@@ -2561,6 +2552,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ if (ret)
+ return ret;
+
ret = validate_sys_chunk_array(fs_info, sb);
/*
@@ -2765,10 +2759,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
return ret;
}
+/*
+ * Lockdep gets confused between our buffer_tree which requires IRQ locking because
+ * we modify marks in the IRQ context, and our delayed inode xarray which doesn't
+ * have these requirements. Use a class key so lockdep doesn't get them mixed up.
+ */
+static struct lock_class_key buffer_xa_class;
+
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
- INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+
+ /* Use the same flags as mapping->i_pages. */
+ xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
+ lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class);
+
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2780,7 +2785,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
- spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->zone_active_bgs_lock);
@@ -2825,6 +2829,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
BTRFS_BLOCK_RSV_DELOPS);
@@ -2858,8 +2863,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
- extent_io_tree_init(fs_info, &fs_info->excluded_extents,
- IO_TREE_FS_EXCLUDED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents,
+ IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@@ -3311,7 +3316,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
/*
* Read super block and check the signature bytes only
*/
- disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
+ disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false);
if (IS_ERR(disk_super)) {
ret = PTR_ERR(disk_super);
goto fail_alloc;
@@ -3388,9 +3393,9 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
fs_info->nodesize = nodesize;
+ fs_info->nodesize_bits = ilog2(nodesize);
fs_info->sectorsize = sectorsize;
fs_info->sectorsize_bits = ilog2(sectorsize);
- fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits);
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
fs_info->fs_devices->fs_info = fs_info;
@@ -3416,11 +3421,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
*/
fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
- if (sectorsize < PAGE_SIZE)
- btrfs_warn(fs_info,
- "read-write for sector size %u with page size %lu is experimental",
- sectorsize, PAGE_SIZE);
-
ret = btrfs_init_workqueues(fs_info);
if (ret)
goto fail_sb_buffer;
@@ -3559,6 +3559,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_sysfs;
}
+ btrfs_zoned_reserve_data_reloc_bg(fs_info);
btrfs_free_zone_cache(fs_info);
btrfs_check_active_zone_reservation(fs_info);
@@ -3679,7 +3680,6 @@ fail_alloc:
iput(fs_info->btree_inode);
fail:
- btrfs_close_devices(fs_info->fs_devices);
ASSERT(ret < 0);
return ret;
}
@@ -3692,7 +3692,7 @@ static void btrfs_end_super_write(struct bio *bio)
bio_for_each_folio_all(fi, bio) {
if (bio->bi_status) {
- btrfs_warn_rl_in_rcu(device->fs_info,
+ btrfs_warn_rl(device->fs_info,
"lost super block write due to IO error on %s (%d)",
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
@@ -3712,85 +3712,6 @@ static void btrfs_end_super_write(struct bio *bio)
bio_put(bio);
}
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache)
-{
- struct btrfs_super_block *super;
- struct page *page;
- u64 bytenr, bytenr_orig;
- struct address_space *mapping = bdev->bd_mapping;
- int ret;
-
- bytenr_orig = btrfs_sb_offset(copy_num);
- ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
- if (ret == -ENOENT)
- return ERR_PTR(-EINVAL);
- else if (ret)
- return ERR_PTR(ret);
-
- if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
-
- if (drop_cache) {
- /* This should only be called with the primary sb. */
- ASSERT(copy_num == 0);
-
- /*
- * Drop the page of the primary superblock, so later read will
- * always read from the device.
- */
- invalidate_inode_pages2_range(mapping,
- bytenr >> PAGE_SHIFT,
- (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
- }
-
- page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
- if (IS_ERR(page))
- return ERR_CAST(page);
-
- super = page_address(page);
- if (btrfs_super_magic(super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-ENODATA);
- }
-
- if (btrfs_super_bytenr(super) != bytenr_orig) {
- btrfs_release_disk_super(super);
- return ERR_PTR(-EINVAL);
- }
-
- return super;
-}
-
-
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
-{
- struct btrfs_super_block *super, *latest = NULL;
- int i;
- u64 transid = 0;
-
- /* we would like to check all the supers, but that would make
- * a btrfs mount succeed after a mkfs from a different FS.
- * So, we need to add a special mount option to scan for
- * later supers, using BTRFS_SUPER_MIRROR_MAX instead
- */
- for (i = 0; i < 1; i++) {
- super = btrfs_read_dev_one_super(bdev, i, false);
- if (IS_ERR(super))
- continue;
-
- if (!latest || btrfs_super_generation(super) > transid) {
- if (latest)
- btrfs_release_disk_super(super);
-
- latest = super;
- transid = btrfs_super_generation(super);
- }
- }
-
- return super;
-}
-
/*
* Write superblock @sb to the @device. Do not wait for completion, all the
* folios we use for writing are locked.
@@ -3830,8 +3751,8 @@ static int write_dev_supers(struct btrfs_device *device,
continue;
} else if (ret < 0) {
btrfs_err(device->fs_info,
- "couldn't get super block location for mirror %d",
- i);
+ "couldn't get super block location for mirror %d error %d",
+ i, ret);
atomic_inc(&device->sb_write_errors);
continue;
}
@@ -3850,12 +3771,11 @@ static int write_dev_supers(struct btrfs_device *device,
GFP_NOFS);
if (IS_ERR(folio)) {
btrfs_err(device->fs_info,
- "couldn't get super block page for bytenr %llu",
- bytenr);
+ "couldn't get super block page for bytenr %llu error %ld",
+ bytenr, PTR_ERR(folio));
atomic_inc(&device->sb_write_errors);
continue;
}
- ASSERT(folio_order(folio) == 0);
offset = offset_in_folio(folio, bytenr);
disk_super = folio_address(folio) + offset;
@@ -3928,7 +3848,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
/* If the folio has been removed, then we know it completed. */
if (IS_ERR(folio))
continue;
- ASSERT(folio_order(folio) == 0);
/* Folio will be unlocked once the write completes. */
folio_wait_locked(folio);
@@ -4071,7 +3990,7 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
}
if (min_tolerated == INT_MAX) {
- pr_warn("BTRFS: unknown raid flag: %llu", flags);
+ btrfs_warn(NULL, "unknown raid flag: %llu", flags);
min_tolerated = 0;
}
@@ -4248,8 +4167,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
u64 found_end;
found = true;
- while (find_first_extent_bit(&trans->dirty_pages, cur,
- &found_start, &found_end, EXTENT_DIRTY, &cached)) {
+ while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur,
+ &found_start, &found_end,
+ EXTENT_DIRTY, &cached)) {
dirty_bytes += found_end + 1 - found_start;
cur = found_end + 1;
}
@@ -4326,6 +4246,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_cleanup_defrag_inodes(fs_info);
/*
+ * Handle the error fs first, as it will flush and wait for all ordered
+ * extents. This will generate delayed iputs, thus we want to handle
+ * it first.
+ */
+ if (unlikely(BTRFS_FS_ERROR(fs_info)))
+ btrfs_error_commit_super(fs_info);
+
+ /*
* Wait for any fixup workers to complete.
* If we don't wait for them here and they are still running by the time
* we call kthread_stop() against the cleaner kthread further below, we
@@ -4346,6 +4274,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->delalloc_workers);
/*
+ * We can have ordered extents getting their last reference dropped from
+ * the fs_info->workers queue because for async writes for data bios we
+ * queue a work for that queue, at btrfs_wq_submit_bio(), that runs
+ * run_one_async_done() which calls btrfs_bio_end_io() in case the bio
+ * has an error, and that later function can do the final
+ * btrfs_put_ordered_extent() on the ordered extent attached to the bio,
+ * which adds a delayed iput for the inode. So we must flush the queue
+ * so that we don't have delayed iputs after committing the current
+ * transaction below and stopping the cleaner and transaction kthreads.
+ */
+ btrfs_flush_workqueue(fs_info->workers);
+
+ /*
+ * When finishing a compressed write bio we schedule a work queue item
+ * to finish an ordered extent - btrfs_finish_compressed_write_work()
+ * calls btrfs_finish_ordered_extent() which in turns does a call to
+ * btrfs_queue_ordered_fn(), and that queues the ordered extent
+ * completion either in the endio_write_workers work queue or in the
+ * fs_info->endio_freespace_worker work queue. We flush those queues
+ * below, so before we flush them we must flush this queue for the
+ * workers of compressed writes.
+ */
+ flush_workqueue(fs_info->compressed_write_workers);
+
+ /*
* After we parked the cleaner kthread, ordered extents may have
* completed and created new delayed iputs. If one of the async reclaim
* tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
@@ -4356,8 +4309,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
*
* So wait for all ongoing ordered extents to complete and then run
* delayed iputs. This works because once we reach this point no one
- * can either create new ordered extents nor create delayed iputs
- * through some other means.
+ * can create new ordered extents, but delayed iputs can still be added
+ * by a reclaim worker (see comments further below).
*
* Also note that btrfs_wait_ordered_roots() is not safe here, because
* it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
@@ -4368,6 +4321,10 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_flush_workqueue(fs_info->endio_write_workers);
/* Ordered extents for free space inodes. */
btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ /*
+ * Run delayed iputs in case an async reclaim worker is waiting for them
+ * to be run as mentioned above.
+ */
btrfs_run_delayed_iputs(fs_info);
cancel_work_sync(&fs_info->async_reclaim_work);
@@ -4375,6 +4332,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->preempt_reclaim_work);
cancel_work_sync(&fs_info->em_shrinker_work);
+ /*
+ * Run delayed iputs again because an async reclaim worker may have
+ * added new ones if it was flushing delalloc:
+ *
+ * shrink_delalloc() -> btrfs_start_delalloc_roots() ->
+ * start_delalloc_inodes() -> btrfs_add_delayed_iput()
+ */
+ btrfs_run_delayed_iputs(fs_info);
+
+ /* There should be no more workload to generate new delayed iputs. */
+ set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4403,9 +4372,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
btrfs_err(fs_info, "commit super ret %d", ret);
}
- if (BTRFS_FS_ERROR(fs_info))
- btrfs_error_commit_super(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -4413,7 +4379,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
if (btrfs_check_quota_leak(fs_info)) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN("qgroup reserved space leaked");
btrfs_err(fs_info, "qgroup reserved space leaked");
}
@@ -4460,7 +4426,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
btrfs_mapping_tree_free(fs_info);
- btrfs_close_devices(fs_info->fs_devices);
}
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
@@ -4528,10 +4493,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
/* cleanup FS via transaction */
btrfs_cleanup_transaction(fs_info);
- mutex_lock(&fs_info->cleaner_mutex);
- btrfs_run_delayed_iputs(fs_info);
- mutex_unlock(&fs_info->cleaner_mutex);
-
down_write(&fs_info->cleanup_work_sem);
up_write(&fs_info->cleanup_work_sem);
}
@@ -4674,9 +4635,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, NULL)) {
- clear_extent_bits(dirty_pages, start, end, mark);
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, NULL)) {
+ btrfs_clear_extent_bit(dirty_pages, start, end, mark, NULL);
while (start <= end) {
eb = find_extent_buffer(fs_info, start);
start += fs_info->nodesize;
@@ -4709,14 +4670,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
* the same extent range.
*/
mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
+ if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state)) {
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
}
- clear_extent_dirty(unpin, start, end, &cached_state);
- free_extent_state(cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
+ btrfs_free_extent_state(cached_state);
btrfs_error_unpin_extent_range(fs_info, start, end);
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
@@ -4902,7 +4863,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *l;
struct btrfs_key search_key;
@@ -4918,14 +4879,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
search_key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
if (ret < 0)
- goto error;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto error;
+ return -EUCLEAN;
}
if (path->slots[0] > 0) {
slot = path->slots[0] - 1;
@@ -4936,10 +4896,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root)
} else {
root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
}
- ret = 0;
-error:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 587842991b24..864a55a96226 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -58,9 +58,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
-struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
-struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
- int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
const struct btrfs_key *key);
@@ -114,7 +111,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int btrfs_read_extent_buffer(struct extent_buffer *buf,
const struct btrfs_tree_parent_check *check);
-blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio);
+int btree_csum_one_bio(struct btrfs_bio *bbio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index e2b22bea348a..7fc8a3200b40 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
- struct inode *inode;
+ struct btrfs_inode *inode;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
@@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation != 0 && generation != inode->i_generation) {
- iput(inode);
+ if (generation != 0 && generation != inode->vfs_inode.i_generation) {
+ iput(&inode->vfs_inode);
return ERR_PTR(-ESTALE);
}
- return d_obtain_alias(inode);
+ return d_obtain_alias(&inode->vfs_inode);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
@@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
struct dentry *btrfs_get_parent(struct dentry *child)
{
- struct inode *dir = d_inode(child);
- struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *dir = BTRFS_I(d_inode(child));
+ struct btrfs_inode *inode;
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_root_ref *ref;
@@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (!path)
return ERR_PTR(-ENOMEM);
- if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = btrfs_root_id(root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
- key.objectid = btrfs_ino(BTRFS_I(dir));
+ key.objectid = btrfs_ino(dir);
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
@@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child)
found_key.offset, 0);
}
- return d_obtain_alias(btrfs_iget(key.objectid, root));
+ inode = btrfs_iget(key.objectid, root);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return d_obtain_alias(&inode->vfs_inode);
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
@@ -219,11 +224,11 @@ fail:
static int btrfs_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
- struct inode *inode = d_inode(child);
- struct inode *dir = d_inode(parent);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path;
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(child));
+ struct btrfs_inode *dir = BTRFS_I(d_inode(parent));
+ struct btrfs_root *root = dir->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
@@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name,
int ret;
u64 ino;
- if (!S_ISDIR(dir->i_mode))
+ if (!S_ISDIR(dir->vfs_inode.i_mode))
return -EINVAL;
- ino = btrfs_ino(BTRFS_I(inode));
+ ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (ino == BTRFS_FIRST_FREE_OBJECTID) {
- key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
+ key.objectid = btrfs_root_id(inode->root);
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = fs_info->tree_root;
} else {
key.objectid = ino;
- key.offset = btrfs_ino(BTRFS_I(dir));
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = btrfs_ino(dir);
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- btrfs_free_path(path);
return ret;
} else if (ret > 0) {
- if (ino == BTRFS_FIRST_FREE_OBJECTID) {
+ if (ino == BTRFS_FIRST_FREE_OBJECTID)
path->slots[0]--;
- } else {
- btrfs_free_path(path);
+ else
return -ENOENT;
- }
}
leaf = path->nodes[0];
@@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name,
}
read_extent_buffer(leaf, name, name_ptr, name_len);
- btrfs_free_path(path);
/*
* have to add the null termination to make sure that reconnect_path
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index 6d08c100b01d..66361325f6dc 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -42,8 +42,9 @@ static inline void btrfs_extent_state_leak_debug_check(void)
struct extent_state *state;
while (!list_empty(&states)) {
- state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
+ state = list_first_entry(&states, struct extent_state, leak_list);
+ btrfs_err(NULL,
+ "state leak: start %llu end %llu state %u in tree %d refs %d",
state->start, state->end, state->state,
extent_state_in_tree(state),
refcount_read(&state->refs));
@@ -59,13 +60,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- const struct btrfs_inode *inode;
+ const struct btrfs_inode *inode = tree->inode;
u64 isize;
if (tree->owner != IO_TREE_INODE_IO)
return;
- inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -80,25 +80,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
-
-/*
- * The only tree allowed to set the inode is IO_TREE_INODE_IO.
- */
-static bool is_inode_io_tree(const struct extent_io_tree *tree)
-{
- return tree->owner == IO_TREE_INODE_IO;
-}
-
-/* Return the inode if it's valid for the given tree, otherwise NULL. */
-struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
-{
- if (tree->owner == IO_TREE_INODE_IO)
- return tree->inode;
- return NULL;
-}
-
/* Read-only access to the inode. */
-const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree)
{
if (tree->owner == IO_TREE_INODE_IO)
return tree->inode;
@@ -106,15 +89,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t
}
/* For read-only access to fs_info. */
-const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
{
if (tree->owner == IO_TREE_INODE_IO)
return tree->inode->root->fs_info;
return tree->fs_info;
}
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner)
+void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner)
{
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
@@ -129,7 +112,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info,
* aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never
* set on any extent state when calling this function).
*/
-void extent_io_tree_release(struct extent_io_tree *tree)
+void btrfs_extent_io_tree_release(struct extent_io_tree *tree)
{
struct rb_root root;
struct extent_state *state;
@@ -148,7 +131,7 @@ void extent_io_tree_release(struct extent_io_tree *tree)
* (see wait_extent_bit()).
*/
ASSERT(!waitqueue_active(&state->wq));
- free_extent_state(state);
+ btrfs_free_extent_state(state);
cond_resched_lock(&tree->lock);
}
/*
@@ -176,7 +159,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
btrfs_leak_debug_add_state(state);
refcount_set(&state->refs, 1);
init_waitqueue_head(&state->wq);
- trace_alloc_extent_state(state, mask, _RET_IP_);
+ trace_btrfs_alloc_extent_state(state, mask, _RET_IP_);
return state;
}
@@ -188,14 +171,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal
return prealloc;
}
-void free_extent_state(struct extent_state *state)
+void btrfs_free_extent_state(struct extent_state *state)
{
if (!state)
return;
if (refcount_dec_and_test(&state->refs)) {
WARN_ON(extent_state_in_tree(state));
btrfs_leak_debug_del_state(state);
- trace_free_extent_state(state, _RET_IP_);
+ trace_btrfs_free_extent_state(state, _RET_IP_);
kmem_cache_free(extent_state_cache, state);
}
}
@@ -222,38 +205,34 @@ static inline struct extent_state *next_state(struct extent_state *state)
{
struct rb_node *next = rb_next(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
+ return rb_entry_safe(next, struct extent_state, rb_node);
}
static inline struct extent_state *prev_state(struct extent_state *state)
{
struct rb_node *next = rb_prev(&state->rb_node);
- if (next)
- return rb_entry(next, struct extent_state, rb_node);
- else
- return NULL;
+ return rb_entry_safe(next, struct extent_state, rb_node);
}
/*
- * Search @tree for an entry that contains @offset. Such entry would have
- * entry->start <= offset && entry->end >= offset.
+ * Search @tree for an entry that contains @offset or if none exists for the
+ * first entry that starts and ends after that offset.
*
* @tree: the tree to search
- * @offset: offset that should fall within an entry in @tree
+ * @offset: search offset
* @node_ret: pointer where new node should be anchored (used when inserting an
* entry in the tree)
* @parent_ret: points to entry which would have been the parent of the entry,
* containing @offset
*
- * Return a pointer to the entry that contains @offset byte address and don't change
- * @node_ret and @parent_ret.
+ * Return a pointer to the entry that contains @offset byte address.
+ *
+ * If no such entry exists, return the first entry that starts and ends after
+ * @offset if one exists, otherwise NULL.
*
- * If no such entry exists, return pointer to entry that ends before @offset
- * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
+ * If the returned entry starts at @offset, then @node_ret and @parent_ret
+ * aren't changed.
*/
static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree,
u64 offset,
@@ -282,7 +261,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree
if (parent_ret)
*parent_ret = prev;
- /* Search neighbors until we find the first one past the end */
+ /*
+ * Return either the current entry if it contains offset (it ends after
+ * or at offset) or the first entry that starts and ends after offset if
+ * one exists, or NULL.
+ */
while (entry && offset > entry->end)
entry = next_state(entry);
@@ -346,12 +329,12 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(const struct extent_io_tree *tree,
- const struct extent_state *state,
- const char *opname,
- int err)
+static void __cold extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err,
"extent io tree error on %s state start %llu end %llu",
opname, state->start, state->end);
}
@@ -362,13 +345,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
- state, prev);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode, state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
- free_extent_state(prev);
+ btrfs_free_extent_state(prev);
}
}
@@ -378,13 +360,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
- state, next);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode, state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
- free_extent_state(next);
+ btrfs_free_extent_state(next);
}
}
@@ -413,8 +394,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (is_inode_io_tree(tree))
- btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_set_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -459,10 +440,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(
- extent_io_tree_to_inode(tree),
- state, entry);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -472,10 +452,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (is_inode_io_tree(tree))
- btrfs_merge_delalloc_extent(
- extent_io_tree_to_inode(tree),
- state, entry);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_merge_delalloc_extent(tree->inode,
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -527,9 +506,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (is_inode_io_tree(tree))
- btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
- split);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_split_delalloc_extent(tree->inode, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -549,7 +527,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
} else if (prealloc->end > entry->end) {
node = &(*node)->rb_right;
} else {
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return -EEXIST;
}
}
@@ -561,6 +539,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
}
/*
+ * Use this during tree iteration to avoid doing next node searches when it's
+ * not needed (the current record ends at or after the target range's end).
+ */
+static inline struct extent_state *next_search_state(struct extent_state *state, u64 end)
+{
+ if (state->end < end)
+ return next_state(state);
+
+ return NULL;
+}
+
+/*
* Utility function to clear some bits in an extent state struct. It will
* optionally wake up anyone waiting on this state (wake == 1).
*
@@ -569,16 +559,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- u32 bits, int wake,
+ u32 bits, int wake, u64 end,
struct extent_changeset *changeset)
{
struct extent_state *next;
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (is_inode_io_tree(tree))
- btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
- bits);
+ if (tree->owner == IO_TREE_INODE_IO)
+ btrfs_clear_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -586,17 +575,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
if (wake)
wake_up(&state->wq);
if (state->state == 0) {
- next = next_state(state);
+ next = next_search_state(state, end);
if (extent_state_in_tree(state)) {
rb_erase(&state->rb_node, &tree->state);
RB_CLEAR_NODE(&state->rb_node);
- free_extent_state(state);
+ btrfs_free_extent_state(state);
} else {
WARN_ON(1);
}
} else {
merge_state(tree, state);
- next = next_state(state);
+ next = next_search_state(state, end);
}
return next;
}
@@ -620,18 +609,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state,
- struct extent_changeset *changeset)
+int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
u64 last_end;
- int err;
- int clear = 0;
- int wake;
- int delete = (bits & EXTENT_CLEAR_ALL_BITS);
+ int ret = 0;
+ bool clear;
+ bool wake;
+ const bool delete = (bits & EXTENT_CLEAR_ALL_BITS);
gfp_t mask;
set_gfp_mask_from_bits(&bits, &mask);
@@ -644,9 +633,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
- wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0);
- if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY))
- clear = 1;
+ wake = (bits & EXTENT_LOCK_BITS);
+ clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY));
again:
if (!prealloc) {
/*
@@ -676,7 +664,7 @@ again:
goto hit_next;
}
if (clear)
- free_extent_state(cached);
+ btrfs_free_extent_state(cached);
}
/* This search will find the extents that end after our range starts. */
@@ -691,7 +679,7 @@ hit_next:
/* The state doesn't have the wanted bits, go ahead. */
if (!(state->state & bits)) {
- state = next_state(state);
+ state = next_search_state(state, end);
goto next;
}
@@ -714,18 +702,24 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, start);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
-
+ ret = split_state(tree, state, prealloc, start);
prealloc = NULL;
- if (err)
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
goto out;
+ }
if (state->end <= end) {
- state = clear_state_bit(tree, state, bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, end,
+ changeset);
goto next;
}
- goto search_again;
+ if (need_resched())
+ goto search_again;
+ /*
+ * Fallthrough and try atomic extent state allocation if needed.
+ * If it fails we'll jump to 'search_again' retry the allocation
+ * in non-atomic mode and start the search again.
+ */
}
/*
* | ---- desired range ---- |
@@ -736,30 +730,31 @@ hit_next:
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
- err = split_state(tree, state, prealloc, end + 1);
- if (err)
- extent_io_tree_panic(tree, state, "split", err);
+ ret = split_state(tree, state, prealloc, end + 1);
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, bits, wake, changeset);
+ clear_state_bit(tree, prealloc, bits, wake, end, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, bits, wake, changeset);
+ state = clear_state_bit(tree, state, bits, wake, end, changeset);
next:
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start <= end && state && !need_resched())
+ if (state && !need_resched())
goto hit_next;
search_again:
- if (start > end)
- goto out;
spin_unlock(&tree->lock);
if (gfpflags_allow_blocking(mask))
cond_resched();
@@ -767,10 +762,9 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
- return 0;
+ return ret;
}
@@ -820,7 +814,7 @@ process_node:
schedule();
spin_lock(&tree->lock);
finish_wait(&state->wq, &wait);
- free_extent_state(state);
+ btrfs_free_extent_state(state);
goto again;
}
start = state->end + 1;
@@ -838,7 +832,7 @@ out:
if (cached_state && *cached_state) {
state = *cached_state;
*cached_state = NULL;
- free_extent_state(state);
+ btrfs_free_extent_state(state);
}
spin_unlock(&tree->lock);
}
@@ -877,7 +871,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
*/
state = tree_search(tree, start);
while (state) {
- if (state->end >= start && (state->state & bits))
+ if (state->state & bits)
return state;
state = next_state(state);
}
@@ -892,9 +886,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t
* Return true if we find something, and update @start_ret and @end_ret.
* Return false if we found nothing.
*/
-bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state)
+bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
bool ret = false;
@@ -914,13 +908,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
* again. If we haven't found any, clear as well since
* it's now useless.
*/
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = NULL;
if (state)
goto got_it;
goto out;
}
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = NULL;
}
@@ -952,14 +946,17 @@ out:
* contiguous area for given bits. We will search to the first bit we find, and
* then walk down the tree until we find a non-contiguous area. The area
* returned will be the full contiguous area with the bits set.
+ *
+ * Returns true if we found a range with the given bits set, in which case
+ * @start_ret and @end_ret are updated, or false if no range was found.
*/
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
+bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
- int ret = 1;
+ bool ret = false;
- ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+ ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES));
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
@@ -971,7 +968,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
break;
*end_ret = state->end;
}
- ret = 0;
+ ret = true;
}
spin_unlock(&tree->lock);
return ret;
@@ -1046,11 +1043,11 @@ out:
*
* [start, end] is inclusive This takes the tree lock.
*/
-static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u64 *failed_start,
- struct extent_state **failed_state,
- struct extent_state **cached_state,
- struct extent_changeset *changeset)
+static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u64 *failed_start,
+ struct extent_state **failed_state,
+ struct extent_state **cached_state,
+ struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1129,12 +1126,11 @@ hit_next:
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
goto search_again;
}
@@ -1186,12 +1182,11 @@ hit_next:
set_state_bits(tree, state, bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
- if (last_end == (u64)-1)
+ if (last_end >= end)
goto out;
start = last_end + 1;
state = next_state(state);
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
}
goto search_again;
@@ -1204,14 +1199,8 @@ hit_next:
* extent we found.
*/
if (state->start > start) {
- u64 this_end;
struct extent_state *inserted_state;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc)
goto search_again;
@@ -1221,17 +1210,38 @@ hit_next:
* extent.
*/
prealloc->start = start;
- prealloc->end = this_end;
+ if (end < last_start)
+ prealloc->end = end;
+ else
+ prealloc->end = last_start - 1;
+
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
prealloc = NULL;
- start = this_end + 1;
+ start = inserted_state->end + 1;
+
+ /* Beyond target range, stop. */
+ if (start > end)
+ goto out;
+
+ if (need_resched())
+ goto search_again;
+
+ state = next_search_state(inserted_state, end);
+ /*
+ * If there's a next state, whether contiguous or not, we don't
+ * need to unlock and start search agian. If it's not contiguous
+ * we will end up here and try to allocate a prealloc state and insert.
+ */
+ if (state)
+ goto hit_next;
goto search_again;
}
/*
@@ -1252,8 +1262,11 @@ hit_next:
if (!prealloc)
goto search_again;
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1272,18 +1285,16 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return ret;
}
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state)
+int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state)
{
- return __set_extent_bit(tree, start, end, bits, NULL, NULL,
- cached_state, NULL);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL);
}
/*
@@ -1304,9 +1315,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
*
* All allocations are done with GFP_NOFS.
*/
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state)
+int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -1374,12 +1385,11 @@ hit_next:
if (state->start == start && state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
+ state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
goto search_again;
}
@@ -1406,20 +1416,19 @@ hit_next:
goto out;
}
ret = split_state(tree, state, prealloc, start);
- if (ret)
- extent_io_tree_panic(tree, state, "split", ret);
prealloc = NULL;
- if (ret)
+ if (ret) {
+ extent_io_tree_panic(tree, state, "split", ret);
goto out;
+ }
if (state->end <= end) {
set_state_bits(tree, state, bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, clear_bits, 0, NULL);
- if (last_end == (u64)-1)
+ state = clear_state_bit(tree, state, clear_bits, 0, end, NULL);
+ if (last_end >= end)
goto out;
start = last_end + 1;
- if (start < end && state && state->start == start &&
- !need_resched())
+ if (state && state->start == start && !need_resched())
goto hit_next;
}
goto search_again;
@@ -1432,14 +1441,8 @@ hit_next:
* extent we found.
*/
if (state->start > start) {
- u64 this_end;
struct extent_state *inserted_state;
- if (end < last_start)
- this_end = end;
- else
- this_end = last_start - 1;
-
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
ret = -ENOMEM;
@@ -1451,16 +1454,37 @@ hit_next:
* extent.
*/
prealloc->start = start;
- prealloc->end = this_end;
+ if (end < last_start)
+ prealloc->end = end;
+ else
+ prealloc->end = last_start - 1;
+
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
ret = PTR_ERR(inserted_state);
extent_io_tree_panic(tree, prealloc, "insert", ret);
+ goto out;
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
prealloc = NULL;
- start = this_end + 1;
+ start = inserted_state->end + 1;
+
+ /* Beyond target range, stop. */
+ if (start > end)
+ goto out;
+
+ if (need_resched())
+ goto search_again;
+
+ state = next_search_state(inserted_state, end);
+ /*
+ * If there's a next state, whether contiguous or not, we don't
+ * need to unlock and start search again. If it's not contiguous
+ * we will end up here and try to allocate a prealloc state and insert.
+ */
+ if (state)
+ goto hit_next;
goto search_again;
}
/*
@@ -1477,12 +1501,15 @@ hit_next:
}
ret = split_state(tree, state, prealloc, end + 1);
- if (ret)
+ if (ret) {
extent_io_tree_panic(tree, state, "split", ret);
+ prealloc = NULL;
+ goto out;
+ }
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
+ clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL);
prealloc = NULL;
goto out;
}
@@ -1497,8 +1524,7 @@ search_again:
out:
spin_unlock(&tree->lock);
- if (prealloc)
- free_extent_state(prealloc);
+ btrfs_free_extent_state(prealloc);
return ret;
}
@@ -1518,8 +1544,8 @@ out:
* spans (last_range_end, end of device]. In this case it's up to the caller to
* trim @end_ret to the appropriate size.
*/
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits)
+void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct extent_state *prev = NULL, *next = NULL;
@@ -1636,10 +1662,10 @@ out:
* all given bits set. If the returned number of bytes is greater than zero
* then @start is updated with the offset of the first byte with the bits set.
*/
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end, u64 max_bytes,
- u32 bits, int contig,
- struct extent_state **cached_state)
+u64 btrfs_count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end, u64 max_bytes,
+ u32 bits, int contig,
+ struct extent_state **cached_state)
{
struct extent_state *state = NULL;
struct extent_state *cached;
@@ -1710,7 +1736,7 @@ search:
}
if (cached_state) {
- free_extent_state(*cached_state);
+ btrfs_free_extent_state(*cached_state);
*cached_state = state;
if (state)
refcount_inc(&state->refs);
@@ -1724,16 +1750,16 @@ search:
/*
* Check if the single @bit exists in the given range.
*/
-bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
+bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit)
{
- struct extent_state *state = NULL;
+ struct extent_state *state;
bool bitset = false;
ASSERT(is_power_of_2(bit));
spin_lock(&tree->lock);
state = tree_search(tree, start);
- while (state && start <= end) {
+ while (state) {
if (state->start > end)
break;
@@ -1742,9 +1768,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32
break;
}
- /* If state->end is (u64)-1, start will overflow to 0 */
- start = state->end + 1;
- if (start > end || start == 0)
+ if (state->end >= end)
break;
state = next_state(state);
}
@@ -1752,16 +1776,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32
return bitset;
}
+void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
+ struct extent_state **cached_state)
+{
+ struct extent_state *state;
+
+ /*
+ * The cached state is currently mandatory and not used to start the
+ * search, only to cache the first state record found in the range.
+ */
+ ASSERT(cached_state != NULL);
+ ASSERT(*cached_state == NULL);
+
+ *bits = 0;
+
+ spin_lock(&tree->lock);
+ state = tree_search(tree, start);
+ if (state && state->start < end) {
+ *cached_state = state;
+ refcount_inc(&state->refs);
+ }
+ while (state) {
+ if (state->start > end)
+ break;
+
+ *bits |= state->state;
+
+ if (state->end >= end)
+ break;
+
+ state = next_state(state);
+ }
+ spin_unlock(&tree->lock);
+}
+
/*
* Check if the whole range [@start,@end) contains the single @bit set.
*/
-bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
- struct extent_state *cached)
+bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached)
{
- struct extent_state *state = NULL;
+ struct extent_state *state;
bool bitset = true;
ASSERT(is_power_of_2(bit));
+ ASSERT(start < end);
spin_lock(&tree->lock);
if (cached && extent_state_in_tree(cached) && cached->start <= start &&
@@ -1769,30 +1828,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
state = cached;
else
state = tree_search(tree, start);
- while (state && start <= end) {
+ while (state) {
if (state->start > start) {
bitset = false;
break;
}
- if (state->start > end)
- break;
-
if ((state->state & bit) == 0) {
bitset = false;
break;
}
- if (state->end == (u64)-1)
+ if (state->end >= end)
break;
- /*
- * Last entry (if state->end is (u64)-1 and overflow happens),
- * or next entry starts after the range.
- */
+ /* Next state must start where this one ends. */
start = state->end + 1;
- if (start > end || start == 0)
- break;
state = next_state(state);
}
@@ -1804,8 +1855,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
}
/* Wrappers around set/clear extent bit */
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
+int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
{
/*
* We don't support EXTENT_LOCK_BITS yet, as current changeset will
@@ -1814,11 +1865,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCK_BITS));
- return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
+ return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset);
}
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset)
+int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset)
{
/*
* Don't support EXTENT_LOCK_BITS case, same reason as
@@ -1826,20 +1877,20 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCK_BITS));
- return __clear_extent_bit(tree, start, end, bits, NULL, changeset);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset);
}
-bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached)
+bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached)
{
- int err;
+ int ret;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, bits, &failed_start,
- NULL, cached, NULL);
- if (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start, NULL, cached, NULL);
+ if (ret == -EEXIST) {
if (failed_start > start)
- clear_extent_bit(tree, start, failed_start - 1, bits, cached);
+ btrfs_clear_extent_bit(tree, start, failed_start - 1,
+ bits, cached);
return 0;
}
return 1;
@@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits
* Either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
-int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached_state)
+int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
- int err;
+ int ret;
u64 failed_start;
- err = __set_extent_bit(tree, start, end, bits, &failed_start,
- &failed_state, cached_state, NULL);
- while (err == -EEXIST) {
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
+ &failed_state, cached_state, NULL);
+ while (ret == -EEXIST) {
if (failed_start != start)
- clear_extent_bit(tree, start, failed_start - 1,
- bits, cached_state);
+ btrfs_clear_extent_bit(tree, start, failed_start - 1,
+ bits, cached_state);
wait_extent_bit(tree, failed_start, end, bits, &failed_state);
- err = __set_extent_bit(tree, start, end, bits,
- &failed_start, &failed_state,
- cached_state, NULL);
+ ret = set_extent_bit(tree, start, end, bits, &failed_start,
+ &failed_state, cached_state, NULL);
}
- return err;
+ return ret;
+}
+
+/*
+ * Get the extent state that follows the given extent state.
+ * This is meant to be used in a context where we know no other tasks can
+ * concurrently modify the tree.
+ */
+struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
+ struct extent_state *state)
+{
+ struct extent_state *next;
+
+ spin_lock(&tree->lock);
+ ASSERT(extent_state_in_tree(state));
+ next = next_state(state);
+ if (next)
+ refcount_inc(&next->refs);
+ spin_unlock(&tree->lock);
+
+ return next;
}
-void __cold extent_state_free_cachep(void)
+void __cold btrfs_extent_state_free_cachep(void)
{
btrfs_extent_state_leak_debug_check();
kmem_cache_destroy(extent_state_cache);
}
-int __init extent_state_init_cachep(void)
+int __init btrfs_extent_state_init_cachep(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0, 0,
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 6ffef1cd37c1..36facca37973 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -17,10 +17,10 @@ struct btrfs_inode;
/* Bits for the extent state */
enum {
ENUM_BIT(EXTENT_DIRTY),
- ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
ENUM_BIT(EXTENT_DIO_LOCKED),
- ENUM_BIT(EXTENT_NEW),
+ ENUM_BIT(EXTENT_DIRTY_LOG1),
+ ENUM_BIT(EXTENT_DIRTY_LOG2),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
ENUM_BIT(EXTENT_BOUNDARY),
@@ -39,6 +39,11 @@ enum {
*/
ENUM_BIT(EXTENT_DELALLOC_NEW),
/*
+ * Mark that a range is being locked for finishing an ordered extent.
+ * Used together with EXTENT_LOCKED.
+ */
+ ENUM_BIT(EXTENT_FINISHING_ORDERED),
+ /*
* When an ordered extent successfully completes for a region marked as
* a new delalloc range, use this flag when clearing a new delalloc
* range to indicate that the VFS' inode number of bytes should be
@@ -130,117 +135,110 @@ struct extent_state {
#endif
};
-struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
-const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
-const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
-void extent_io_tree_init(struct btrfs_fs_info *fs_info,
- struct extent_io_tree *tree, unsigned int owner);
-void extent_io_tree_release(struct extent_io_tree *tree);
-int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached);
-bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
- struct extent_state **cached);
+void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info,
+ struct extent_io_tree *tree, unsigned int owner);
+void btrfs_extent_io_tree_release(struct extent_io_tree *tree);
+int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
+ struct extent_state **cached);
+bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached);
-static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
+static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+ return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
}
-static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __try_lock_extent(tree, start, end, EXTENT_LOCKED, cached);
+ return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached);
}
-int __init extent_state_init_cachep(void);
-void __cold extent_state_free_cachep(void);
-
-u64 count_range_bits(struct extent_io_tree *tree,
- u64 *start, u64 search_end,
- u64 max_bytes, u32 bits, int contig,
- struct extent_state **cached_state);
-
-void free_extent_state(struct extent_state *state);
-bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
- struct extent_state *cached_state);
-bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
-int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset);
-int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached,
- struct extent_changeset *changeset);
-
-static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, bits, cached, NULL);
-}
+int __init btrfs_extent_state_init_cachep(void);
+void __cold btrfs_extent_state_free_cachep(void);
-static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
- struct extent_state **cached)
-{
- return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL);
-}
+u64 btrfs_count_range_bits(struct extent_io_tree *tree,
+ u64 *start, u64 search_end,
+ u64 max_bytes, u32 bits, int contig,
+ struct extent_state **cached_state);
-static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
- u64 end, u32 bits)
+void btrfs_free_extent_state(struct extent_state *state);
+bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit,
+ struct extent_state *cached_state);
+bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit);
+void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits,
+ struct extent_state **cached_state);
+int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached,
+ struct extent_changeset *changeset);
+
+static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 end, u32 bits,
+ struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end, bits, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL);
}
-int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_changeset *changeset);
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, struct extent_state **cached_state);
-
-static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached_state)
+static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
+ struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
- cached_state, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED,
+ cached, NULL);
}
-static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_changeset *changeset);
+int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, struct extent_state **cached_state);
+
+static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return clear_extent_bit(tree, start, end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING, cached);
+ return btrfs_clear_extent_bit(tree, start, end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING, cached);
}
-int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- u32 bits, u32 clear_bits,
- struct extent_state **cached_state);
-
-bool find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits,
- struct extent_state **cached_state);
-void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits);
-int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, u32 bits);
+int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ u32 bits, u32 clear_bits,
+ struct extent_state **cached_state);
+
+bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits,
+ struct extent_state **cached_state);
+void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
+bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
+ u64 *start_ret, u64 *end_ret, u32 bits);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
-static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+ return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
-static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached);
+ return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached);
}
-static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start,
- u64 end, struct extent_state **cached)
+static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start,
+ u64 end, struct extent_state **cached)
{
- return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL);
+ return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED,
+ cached, NULL);
}
+struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree,
+ struct extent_state *state);
+
#endif /* BTRFS_EXTENT_IO_TREE_H */
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3014a1a23efd..97d517cdf2df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -46,7 +46,7 @@
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
@@ -56,12 +56,12 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
u64 flags, u64 owner, u64 offset,
struct btrfs_key *ins, int ref_mod, u64 oref_root);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op);
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key);
-static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
+static int block_group_bits(const struct btrfs_block_group *cache, u64 bits)
{
return (cache->flags & bits) == bits;
}
@@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
struct btrfs_root *root = btrfs_extent_root(fs_info, start);
- int ret;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = start;
- key.offset = len;
key.type = BTRFS_EXTENT_ITEM_KEY;
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- btrfs_free_path(path);
- return ret;
+ key.offset = len;
+ return btrfs_search_slot(NULL, root, &key, path, 0, 0);
}
/*
@@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 num_refs;
u64 extent_flags;
@@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
search_again:
key.objectid = bytenr;
- key.offset = offset;
if (metadata)
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = offset;
extent_root = btrfs_extent_root(fs_info, bytenr);
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
- goto out_free;
+ return ret;
if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) {
if (path->slots[0]) {
@@ -159,7 +156,7 @@ search_again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -170,7 +167,7 @@ search_again:
"unexpected zero reference count for extent item (%llu %u %llu)",
key.objectid, key.type, key.offset);
btrfs_abort_transaction(trans, ret);
- goto out_free;
+ return ret;
}
extent_flags = btrfs_extent_flags(leaf, ei);
owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]);
@@ -216,8 +213,7 @@ search_again:
*flags = extent_flags;
if (owning_root)
*owning_root = owner;
-out_free:
- btrfs_free_path(path);
+
return ret;
}
@@ -333,7 +329,7 @@ out_free:
* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
*/
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
@@ -405,23 +401,23 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
-static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref)
+static u64 hash_extent_data_ref_item(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref)
{
return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
btrfs_extent_data_ref_objectid(leaf, ref),
btrfs_extent_data_ref_offset(leaf, ref));
}
-static int match_extent_data_ref(struct extent_buffer *leaf,
- struct btrfs_extent_data_ref *ref,
- u64 root_objectid, u64 owner, u64 offset)
+static bool match_extent_data_ref(const struct extent_buffer *leaf,
+ const struct btrfs_extent_data_ref *ref,
+ u64 root_objectid, u64 owner, u64 offset)
{
if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
btrfs_extent_data_ref_offset(leaf, ref) != offset)
- return 0;
- return 1;
+ return false;
+ return true;
}
static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
@@ -501,7 +497,7 @@ fail:
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -621,13 +617,13 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
return ret;
}
-static noinline u32 extent_data_ref_count(struct btrfs_path *path,
- struct btrfs_extent_inline_ref *iref)
+static noinline u32 extent_data_ref_count(const struct btrfs_path *path,
+ const struct btrfs_extent_inline_ref *iref)
{
struct btrfs_key key;
struct extent_buffer *leaf;
- struct btrfs_extent_data_ref *ref1;
- struct btrfs_shared_data_ref *ref2;
+ const struct btrfs_extent_data_ref *ref1;
+ const struct btrfs_shared_data_ref *ref2;
u32 num_refs = 0;
int type;
@@ -642,10 +638,10 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path,
type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
ASSERT(type != BTRFS_REF_TYPE_INVALID);
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
- ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+ ref1 = (const struct btrfs_extent_data_ref *)(&iref->offset);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else {
- ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+ ref2 = (const struct btrfs_shared_data_ref *)(iref + 1);
num_refs = btrfs_shared_data_ref_count(leaf, ref2);
}
} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
@@ -688,7 +684,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
u64 bytenr)
{
struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
@@ -726,7 +722,7 @@ static inline int extent_ref_type(u64 parent, u64 owner)
return type;
}
-static int find_next_key(struct btrfs_path *path, int level,
+static int find_next_key(const struct btrfs_path *path, int level,
struct btrfs_key *key)
{
@@ -1484,10 +1480,10 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
*
*/
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
struct btrfs_key key;
@@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
node->parent, node->ref_root, owner,
offset, refs_to_add, extent_op);
if ((ret < 0 && ret != -EAGAIN) || !ret)
- goto out;
+ return ret;
/*
* Ok we had -EAGAIN which means we didn't have space to insert and
@@ -1526,20 +1522,21 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
/* now insert the actual backref */
- if (owner < BTRFS_FIRST_FREE_OBJECTID)
+ if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = insert_tree_block_ref(trans, path, node, bytenr);
- else
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else {
ret = insert_extent_data_ref(trans, path, node, bytenr);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
- if (ret)
- btrfs_abort_transaction(trans, ret);
-out:
- btrfs_free_path(path);
return ret;
}
static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_head *href)
+ const struct btrfs_delayed_ref_head *href)
{
u64 root = href->owning_root;
@@ -1548,7 +1545,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
* where it has already been unset.
*/
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE ||
- !href->is_data || !is_fstree(root))
+ !href->is_data || !btrfs_is_fstree(root))
return;
btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes,
@@ -1557,7 +1554,7 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1625,13 +1622,13 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
}
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_head *head,
+ const struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_item *ei;
struct extent_buffer *leaf;
u32 item_size;
@@ -1662,7 +1659,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
again:
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- goto out;
+ return ret;
} else if (ret > 0) {
if (metadata) {
if (path->slots[0] > 0) {
@@ -1679,8 +1676,8 @@ again:
metadata = 0;
key.objectid = head->bytenr;
- key.offset = head->num_bytes;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = head->num_bytes;
goto again;
}
} else {
@@ -1688,7 +1685,7 @@ again:
btrfs_err(fs_info,
"missing extent item for extent %llu num_bytes %llu level %d",
head->bytenr, head->num_bytes, head->level);
- goto out;
+ return ret;
}
}
@@ -1701,19 +1698,18 @@ again:
"unexpected extent item size, has %u expect >= %zu",
item_size, sizeof(*ei));
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
__run_delayed_extent_op(extent_op, leaf, ei);
-out:
- btrfs_free_path(path);
+
return ret;
}
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -1760,7 +1756,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op,
bool insert_reserved)
{
@@ -2012,7 +2008,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
if (min_bytes == 0) {
- max_count = delayed_refs->num_heads_ready;
+ /*
+ * We may be subject to a harmless race if some task is
+ * concurrently adding or removing a delayed ref, so silence
+ * KCSAN and similar tools.
+ */
+ max_count = data_race(delayed_refs->num_heads_ready);
min_bytes = U64_MAX;
}
@@ -2348,8 +2349,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode,
int ret;
key.objectid = bytenr;
- key.offset = (u64)-1;
key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
@@ -2604,8 +2605,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
- set_extent_bit(&trans->transaction->pinned_extents, bytenr,
- bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
+ bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
return 0;
}
@@ -2824,34 +2825,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_group *block_group, *tmp;
struct list_head *deleted_bgs;
- struct extent_io_tree *unpin;
+ struct extent_io_tree *unpin = &trans->transaction->pinned_extents;
+ struct extent_state *cached_state = NULL;
u64 start;
u64 end;
+ int unpin_error = 0;
int ret;
- unpin = &trans->transaction->pinned_extents;
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state);
- while (!TRANS_ABORTED(trans)) {
- struct extent_state *cached_state = NULL;
-
- mutex_lock(&fs_info->unused_bg_unpin_mutex);
- if (!find_first_extent_bit(unpin, 0, &start, &end,
- EXTENT_DIRTY, &cached_state)) {
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- break;
- }
+ while (!TRANS_ABORTED(trans) && cached_state) {
+ struct extent_state *next_state;
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
end + 1 - start, NULL);
- clear_extent_dirty(unpin, start, end, &cached_state);
+ next_state = btrfs_next_extent_state(unpin, cached_state);
+ btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
ret = unpin_extent_range(fs_info, start, end, true);
- BUG_ON(ret);
- mutex_unlock(&fs_info->unused_bg_unpin_mutex);
- free_extent_state(cached_state);
- cond_resched();
+ /*
+ * If we get an error unpinning an extent range, store the first
+ * error to return later after trying to unpin all ranges and do
+ * the sync discards. Our caller will abort the transaction
+ * (which already wrote new superblocks) and on the next mount
+ * the space will be available as it was pinned by in-memory
+ * only structures in this phase.
+ */
+ if (ret) {
+ btrfs_err_rl(fs_info,
+"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)",
+ start, end, trans->transid,
+ btrfs_decode_error(ret), ret);
+ if (!unpin_error)
+ unpin_error = ret;
+ }
+
+ btrfs_free_extent_state(cached_state);
+
+ if (need_resched()) {
+ btrfs_free_extent_state(next_state);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ cond_resched();
+ cached_state = NULL;
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_find_first_extent_bit(unpin, 0, &start, &end,
+ EXTENT_DIRTY, &cached_state);
+ } else {
+ cached_state = next_state;
+ if (cached_state) {
+ start = cached_state->start;
+ end = cached_state->end;
+ }
+ }
}
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ btrfs_free_extent_state(cached_state);
if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
btrfs_discard_calc_delay(&fs_info->discard_ctl);
@@ -2865,16 +2895,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
*/
deleted_bgs = &trans->transaction->deleted_bgs;
list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
- u64 trimmed = 0;
-
ret = -EROFS;
if (!TRANS_ABORTED(trans))
- ret = btrfs_discard_extent(fs_info,
- block_group->start,
- block_group->length,
- &trimmed);
+ ret = btrfs_discard_extent(fs_info, block_group->start,
+ block_group->length, NULL);
+ /*
+ * Not strictly necessary to lock, as the block_group should be
+ * read-only from btrfs_delete_unused_bgs().
+ */
+ ASSERT(block_group->ro);
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
btrfs_unfreeze_block_group(block_group);
btrfs_put_block_group(block_group);
@@ -2886,7 +2920,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
}
}
- return 0;
+ return unpin_error;
}
/*
@@ -2966,7 +3000,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
return ret;
}
- ret = add_to_free_space_tree(trans, bytenr, num_bytes);
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3047,7 +3081,7 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
*/
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *info = trans->fs_info;
@@ -3481,17 +3515,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(bg, buf->start, buf->len);
- btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_free_reserved_bytes(bg, buf->len, false);
btrfs_put_block_group(bg);
trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
out:
-
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
return 0;
}
@@ -3623,6 +3651,21 @@ btrfs_release_block_group(struct btrfs_block_group *cache,
btrfs_put_block_group(cache);
}
+static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
+ const struct btrfs_block_group *bg)
+{
+ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+ return true;
+ if (!btrfs_block_group_should_use_size_class(bg))
+ return true;
+ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+ return true;
+ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+ bg->size_class == BTRFS_BG_SZ_NONE)
+ return true;
+ return ffe_ctl->size_class == bg->size_class;
+}
+
/*
* Helper function for find_free_extent().
*
@@ -3644,7 +3687,8 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg,
if (!cluster_bg)
goto refill_cluster;
if (cluster_bg != bg && (cluster_bg->ro ||
- !block_group_bits(cluster_bg, ffe_ctl->flags)))
+ !block_group_bits(cluster_bg, ffe_ctl->flags) ||
+ !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
goto release_cluster;
offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
@@ -4109,6 +4153,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
struct btrfs_key *ins,
struct find_free_extent_ctl *ffe_ctl,
+ struct btrfs_space_info *space_info,
bool full_search)
{
struct btrfs_root *root = fs_info->chunk_root;
@@ -4163,7 +4208,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return ret;
}
- ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
+ ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags,
CHUNK_ALLOC_FORCE_FOR_EXTENT);
/* Do not bail out on ENOSPC since we can do more. */
@@ -4200,21 +4245,6 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
return -ENOSPC;
}
-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
- struct btrfs_block_group *bg)
-{
- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
- return true;
- if (!btrfs_block_group_should_use_size_class(bg))
- return true;
- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
- return true;
- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
- bg->size_class == BTRFS_BG_SZ_NONE)
- return true;
- return ffe_ctl->size_class == bg->size_class;
-}
-
static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4380,11 +4410,22 @@ static noinline int find_free_extent(struct btrfs_root *root,
ins->objectid = 0;
ins->offset = 0;
- trace_find_free_extent(root, ffe_ctl);
+ trace_btrfs_find_free_extent(root, ffe_ctl);
space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
+ if (btrfs_is_zoned(fs_info) && space_info) {
+ /* Use dedicated sub-space_info for dedicated block group users. */
+ if (ffe_ctl->for_data_reloc) {
+ space_info = space_info->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ } else if (ffe_ctl->for_treelog) {
+ space_info = space_info->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG);
+ }
+ }
if (!space_info) {
- btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
+ btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d",
+ ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc);
return -ENOSPC;
}
@@ -4406,6 +4447,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
* picked out then we don't care that the block group is cached.
*/
if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
+ block_group->space_info == space_info &&
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
@@ -4431,7 +4473,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
}
}
search:
- trace_find_free_extent_search_loop(root, ffe_ctl);
+ trace_btrfs_find_free_extent_search_loop(root, ffe_ctl);
ffe_ctl->have_caching_bg = false;
if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
ffe_ctl->index == 0)
@@ -4483,7 +4525,7 @@ search:
}
have_block_group:
- trace_find_free_extent_have_block_group(root, ffe_ctl, block_group);
+ trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group);
ffe_ctl->cached = btrfs_block_group_done(block_group);
if (unlikely(!ffe_ctl->cached)) {
ffe_ctl->have_caching_bg = true;
@@ -4576,7 +4618,8 @@ loop:
}
up_read(&space_info->groups_sem);
- ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
+ ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info,
+ full_search);
if (ret > 0)
goto search;
@@ -4698,8 +4741,8 @@ again:
return ret;
}
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc)
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ bool is_delalloc)
{
struct btrfs_block_group *cache;
@@ -4711,7 +4754,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
}
btrfs_add_free_space(cache, start, len);
- btrfs_free_reserved_bytes(cache, len, delalloc);
+ btrfs_free_reserved_bytes(cache, len, is_delalloc);
trace_btrfs_reserved_extent_free(fs_info, start, len);
btrfs_put_block_group(cache);
@@ -4742,7 +4785,7 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr,
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
- ret = remove_from_free_space_tree(trans, bytenr, num_bytes);
+ ret = btrfs_remove_from_free_space_tree(trans, bytenr, num_bytes);
if (ret)
return ret;
@@ -4833,7 +4876,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
}
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
+ const struct btrfs_delayed_ref_node *node,
struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -4921,7 +4964,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ASSERT(generic_ref.ref_root != BTRFS_TREE_LOG_OBJECTID);
- if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root))
+ if (btrfs_is_data_reloc_root(root) && btrfs_is_fstree(root->relocation_src_root))
generic_ref.owning_root = root->relocation_src_root;
btrfs_init_data_ref(&generic_ref, owner, offset, 0, false);
@@ -4943,7 +4986,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
- struct btrfs_squota_delta delta = {
+ const struct btrfs_squota_delta delta = {
.root = root_objectid,
.num_bytes = ins->offset,
.generation = trans->transid,
@@ -5069,17 +5112,17 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* EXTENT bit to differentiate dirty pages.
*/
if (buf->log_index == 0)
- set_extent_bit(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1,
- EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY_LOG1, NULL);
else
- set_extent_bit(&root->dirty_log_pages, buf->start,
- buf->start + buf->len - 1,
- EXTENT_NEW, NULL);
+ btrfs_set_extent_bit(&root->dirty_log_pages, buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY_LOG2, NULL);
} else {
buf->log_index = -1;
- set_extent_bit(&trans->transaction->dirty_pages, buf->start,
- buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start,
+ buf->start + buf->len - 1, EXTENT_DIRTY, NULL);
}
/* this returns a buffer locked for blocking */
return buf;
@@ -5185,7 +5228,7 @@ out_free_buf:
btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false);
out_unuse:
btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -5465,7 +5508,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_extent_inline_ref *iref;
int ret;
bool exists = false;
@@ -5482,7 +5525,6 @@ again:
* If we get 0 then we found our reference, return 1, else
* return the error if it's not -ENOENT;
*/
- btrfs_free_path(path);
return (ret < 0 ) ? ret : 1;
}
@@ -5513,11 +5555,10 @@ again:
goto again;
}
- exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent);
+ exists = btrfs_find_delayed_tree_ref(head, btrfs_root_id(root), parent);
mutex_unlock(&head->mutex);
out:
spin_unlock(&delayed_refs->lock);
- btrfs_free_path(path);
return exists ? 1 : 0;
}
@@ -5834,15 +5875,20 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (wc->refs[level] == 1) {
if (level == 0) {
- if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+ if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
ret = btrfs_dec_ref(trans, root, eb, 1);
- else
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+ } else {
ret = btrfs_dec_ref(trans, root, eb, 0);
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- return ret;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
- if (is_fstree(btrfs_root_id(root))) {
+ if (btrfs_is_fstree(btrfs_root_id(root))) {
ret = btrfs_qgroup_trace_leaf_items(trans, eb);
if (ret) {
btrfs_err_rl(fs_info,
@@ -6285,7 +6331,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct walk_control *wc;
int level;
int parent_level;
@@ -6298,14 +6344,12 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return -ENOMEM;
wc = kzalloc(sizeof(*wc), GFP_NOFS);
- if (!wc) {
- btrfs_free_path(path);
+ if (!wc)
return -ENOMEM;
- }
btrfs_assert_tree_write_locked(parent);
parent_level = btrfs_header_level(parent);
- atomic_inc(&parent->refs);
+ refcount_inc(&parent->refs);
path->nodes[parent_level] = parent;
path->slots[parent_level] = btrfs_header_nritems(parent);
@@ -6338,7 +6382,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
}
kfree(wc);
- btrfs_free_path(path);
return ret;
}
@@ -6400,14 +6443,14 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
if (ret)
break;
- find_first_clear_extent_bit(&device->alloc_state, start,
- &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&device->alloc_state, start,
+ &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
/* Check if there are any CHUNK_* bits left */
if (start > device->total_bytes) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- btrfs_warn_in_rcu(fs_info,
+ DEBUG_WARN();
+ btrfs_warn(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
btrfs_dev_name(device),
@@ -6439,8 +6482,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
ret = btrfs_issue_discard(device->bdev, start, len,
&bytes);
if (!ret)
- set_extent_bit(&device->alloc_state, start,
- start + bytes - 1, CHUNK_TRIMMED, NULL);
+ btrfs_set_extent_bit(&device->alloc_state, start,
+ start + bytes - 1, CHUNK_TRIMMED, NULL);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index cfa52264f678..82d3a82dc712 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -4,7 +4,6 @@
#define BTRFS_EXTENT_TREE_H
#include <linux/types.h>
-#include "misc.h"
#include "block-group.h"
#include "locking.h"
@@ -98,7 +97,7 @@ enum btrfs_inline_ref_type {
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
- struct btrfs_extent_inline_ref *iref,
+ const struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
@@ -150,8 +149,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *leaf, int slot);
-int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len, int delalloc);
+int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ bool is_delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans,
const struct extent_buffer *eb);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a2cac9d0a1a9..c953297aa89a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -75,9 +75,9 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->allocated_ebs)) {
eb = list_first_entry(&fs_info->allocated_ebs,
struct extent_buffer, leak_list);
- pr_err(
- "BTRFS: buffer leak start %llu len %u refs %d bflags %lu owner %llu\n",
- eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
+ btrfs_err(fs_info,
+ "buffer leak start %llu len %u refs %d bflags %lu owner %llu",
+ eb->start, eb->len, refcount_read(&eb->refs), eb->bflags,
btrfs_header_owner(eb));
list_del(&eb->leak_list);
WARN_ON_ONCE(1);
@@ -96,6 +96,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
*/
struct btrfs_bio_ctrl {
struct btrfs_bio *bbio;
+ /* Last byte contained in bbio + 1 . */
+ loff_t next_file_offset;
enum btrfs_compression_type compress_type;
u32 len_to_oe_boundary;
blk_opf_t opf;
@@ -108,6 +110,7 @@ struct btrfs_bio_ctrl {
* This is to avoid touching ranges covered by compression/inline.
*/
unsigned long submit_bitmap;
+ struct readahead_control *ractl;
};
static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
@@ -221,22 +224,17 @@ static void __process_folios_contig(struct address_space *mapping,
}
static noinline void unlock_delalloc_folio(const struct inode *inode,
- const struct folio *locked_folio,
+ struct folio *locked_folio,
u64 start, u64 end)
{
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
-
ASSERT(locked_folio);
- if (index == locked_folio->index && end_index == index)
- return;
__process_folios_contig(inode->i_mapping, locked_folio, start, end,
PAGE_UNLOCK);
}
static noinline int lock_delalloc_folios(struct inode *inode,
- const struct folio *locked_folio,
+ struct folio *locked_folio,
u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
@@ -246,9 +244,6 @@ static noinline int lock_delalloc_folios(struct inode *inode,
u64 processed_end = start;
struct folio_batch fbatch;
- if (index == locked_folio->index && index == end_index)
- return 0;
-
folio_batch_init(&fbatch);
while (index <= end_index) {
unsigned int found_folios, i;
@@ -272,8 +267,7 @@ static noinline int lock_delalloc_folios(struct inode *inode,
goto out;
}
range_start = max_t(u64, folio_pos(folio), start);
- range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
- end + 1) - range_start;
+ range_len = min_t(u64, folio_end(folio), end + 1) - range_start;
btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
processed_end = range_start + range_len - 1;
@@ -327,7 +321,7 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
ASSERT(orig_end > orig_start);
/* The range should at least cover part of the folio */
- ASSERT(!(orig_start >= folio_pos(locked_folio) + folio_size(locked_folio) ||
+ ASSERT(!(orig_start >= folio_end(locked_folio) ||
orig_end <= folio_pos(locked_folio)));
again:
/* step one, find a bunch of delalloc bytes starting at start */
@@ -340,7 +334,7 @@ again:
/* @delalloc_end can be -1, never go beyond @orig_end */
*end = min(delalloc_end, orig_end);
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
return false;
}
@@ -366,7 +360,7 @@ again:
/* some of the folios are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
- free_extent_state(cached_state);
+ btrfs_free_extent_state(cached_state);
cached_state = NULL;
if (!loops) {
max_bytes = PAGE_SIZE;
@@ -379,13 +373,13 @@ again:
}
/* step three, lock the state bits for the whole range */
- lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+ btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state);
/* then test to make sure it is all still delalloc */
- ret = test_range_bit(tree, delalloc_start, delalloc_end,
- EXTENT_DELALLOC, cached_state);
+ ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end,
+ EXTENT_DELALLOC, cached_state);
- unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
+ btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state);
if (!ret) {
unlock_delalloc_folio(inode, locked_folio, delalloc_start,
delalloc_end);
@@ -403,7 +397,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached,
u32 clear_bits, unsigned long page_ops)
{
- clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached);
__process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start,
end, page_ops);
@@ -425,14 +419,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ start + len <= folio_end(folio));
if (uptodate && btrfs_verify_folio(folio, start, len))
btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
folio_unlock(folio);
else
btrfs_folio_end_lock(fs_info, folio, start, len);
@@ -462,9 +456,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
u64 start = folio_pos(folio) + fi.offset;
u32 len = fi.length;
- /* Only order 0 (single page) folios are allowed for data. */
- ASSERT(folio_order(folio) == 0);
-
/* Our read/write should always be sector aligned. */
if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
@@ -488,11 +479,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio)
static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
{
ASSERT(folio_test_locked(folio));
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio));
- btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio));
}
/*
@@ -512,47 +503,23 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
struct btrfs_fs_info *fs_info = bbio->fs_info;
struct bio *bio = &bbio->bio;
struct folio_iter fi;
- const u32 sectorsize = fs_info->sectorsize;
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
struct folio *folio = fi.folio;
struct inode *inode = folio->mapping->host;
- u64 start;
- u64 end;
- u32 len;
+ u64 start = folio_pos(folio) + fi.offset;
- /* For now only order 0 folios are supported for data. */
- ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
"%s: bi_sector=%llu, err=%d, mirror=%u",
__func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
- /*
- * We always issue full-sector reads, but if some block in a
- * folio fails to read, blk_update_request() will advance
- * bv_offset and adjust bv_len to compensate. Print a warning
- * for unaligned offsets, and an error if they don't add up to
- * a full sector.
- */
- if (!IS_ALIGNED(fi.offset, sectorsize))
- btrfs_err(fs_info,
- "partial page read in btrfs with offset %zu and length %zu",
- fi.offset, fi.length);
- else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
- btrfs_info(fs_info,
- "incomplete page read with offset %zu and length %zu",
- fi.offset, fi.length);
-
- start = folio_pos(folio) + fi.offset;
- end = start + fi.length - 1;
- len = fi.length;
if (likely(uptodate)) {
+ u64 end = start + fi.length - 1;
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
@@ -561,9 +528,11 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
* Here we should only zero the range inside the folio,
* not touch anything else.
*
- * NOTE: i_size is exclusive while end is inclusive.
+ * NOTE: i_size is exclusive while end is inclusive and
+ * folio_contains() takes PAGE_SIZE units.
*/
- if (folio_index(folio) == end_index && i_size <= end) {
+ if (folio_contains(folio, i_size >> PAGE_SHIFT) &&
+ i_size <= end) {
u32 zero_start = max(offset_in_folio(folio, i_size),
offset_in_folio(folio, start));
u32 zero_len = offset_in_folio(folio, end) + 1 -
@@ -574,7 +543,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio)
}
/* Update page status and unlock. */
- end_folio_read(folio, uptodate, start, len);
+ end_folio_read(folio, uptodate, start, fi.length);
}
bio_put(bio);
}
@@ -665,13 +634,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail)
}
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
- struct folio *folio, u64 disk_bytenr,
- unsigned int pg_offset)
+ u64 disk_bytenr, loff_t file_offset)
{
struct bio *bio = &bio_ctrl->bbio->bio;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
- struct folio *bv_folio = page_folio(bvec->bv_page);
if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
/*
@@ -682,19 +648,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
}
/*
- * The contig check requires the following conditions to be met:
- *
- * 1) The folios are belonging to the same inode
- * This is implied by the call chain.
- *
- * 2) The range has adjacent logical bytenr
- *
- * 3) The range has adjacent file offset
- * This is required for the usage of btrfs_bio->file_offset.
+ * To merge into a bio both the disk sector and the logical offset in
+ * the file need to be contiguous.
*/
- return bio_end_sector(bio) == sector &&
- folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len ==
- folio_pos(folio) + pg_offset;
+ return bio_ctrl->next_file_offset == file_offset &&
+ bio_end_sector(bio) == sector;
}
static void alloc_new_bio(struct btrfs_inode *inode,
@@ -712,6 +670,7 @@ static void alloc_new_bio(struct btrfs_inode *inode,
bbio->file_offset = file_offset;
bio_ctrl->bbio = bbio;
bio_ctrl->len_to_oe_boundary = U32_MAX;
+ bio_ctrl->next_file_offset = file_offset;
/* Limit data write bios to the ordered boundary. */
if (bio_ctrl->wbc) {
@@ -753,22 +712,21 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
size_t size, unsigned long pg_offset)
{
struct btrfs_inode *inode = folio_to_inode(folio);
+ loff_t file_offset = folio_pos(folio) + pg_offset;
- ASSERT(pg_offset + size <= PAGE_SIZE);
+ ASSERT(pg_offset + size <= folio_size(folio));
ASSERT(bio_ctrl->end_io_func);
if (bio_ctrl->bbio &&
- !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset))
+ !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset))
submit_one_bio(bio_ctrl);
do {
u32 len = size;
/* Allocate new bio if needed */
- if (!bio_ctrl->bbio) {
- alloc_new_bio(inode, bio_ctrl, disk_bytenr,
- folio_pos(folio) + pg_offset);
- }
+ if (!bio_ctrl->bbio)
+ alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset);
/* Cap to the current ordered extent boundary if there is one. */
if (len > bio_ctrl->len_to_oe_boundary) {
@@ -782,14 +740,15 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
submit_one_bio(bio_ctrl);
continue;
}
+ bio_ctrl->next_file_offset += len;
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
- len);
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len);
size -= len;
pg_offset += len;
disk_bytenr += len;
+ file_offset += len;
/*
* len_to_oe_boundary defaults to U32_MAX, which isn't folio or
@@ -823,7 +782,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
static int attach_extent_buffer_folio(struct extent_buffer *eb,
struct folio *folio,
- struct btrfs_subpage *prealloc)
+ struct btrfs_folio_state *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -837,7 +796,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
if (folio->mapping)
lockdep_assert_held(&folio->mapping->i_private_lock);
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
if (!folio_test_private(folio))
folio_attach_private(folio, eb);
else
@@ -847,7 +806,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
/* Already mapped, just free prealloc */
if (folio_test_private(folio)) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
return 0;
}
@@ -856,7 +815,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb,
folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
@@ -871,8 +830,8 @@ int set_folio_extent_mapped(struct folio *folio)
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_attach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
@@ -888,8 +847,8 @@ void clear_folio_extent_mapped(struct folio *folio)
return;
fs_info = folio_to_fs_info(folio);
- if (btrfs_is_subpage(fs_info, folio->mapping))
- return btrfs_detach_subpage(fs_info, folio);
+ if (btrfs_is_subpage(fs_info, folio))
+ return btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_DATA);
folio_detach_private(folio);
}
@@ -899,33 +858,49 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode,
u64 len, struct extent_map **em_cached)
{
struct extent_map *em;
- struct extent_state *cached_state = NULL;
ASSERT(em_cached);
if (*em_cached) {
em = *em_cached;
- if (extent_map_in_tree(em) && start >= em->start &&
- start < extent_map_end(em)) {
+ if (btrfs_extent_map_in_tree(em) && start >= em->start &&
+ start < btrfs_extent_map_end(em)) {
refcount_inc(&em->refs);
return em;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*em_cached = NULL;
}
- btrfs_lock_and_flush_ordered_range(inode, start, start + len - 1, &cached_state);
em = btrfs_get_extent(inode, folio, start, len);
if (!IS_ERR(em)) {
BUG_ON(*em_cached);
refcount_inc(&em->refs);
*em_cached = em;
}
- unlock_extent(&inode->io_tree, start, start + len - 1, &cached_state);
return em;
}
+
+static void btrfs_readahead_expand(struct readahead_control *ractl,
+ const struct extent_map *em)
+{
+ const u64 ra_pos = readahead_pos(ractl);
+ const u64 ra_end = ra_pos + readahead_length(ractl);
+ const u64 em_end = em->start + em->ram_bytes;
+
+ /* No expansion for holes and inline extents. */
+ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE)
+ return;
+
+ ASSERT(em_end >= ra_pos,
+ "extent_map %llu %llu ends before current readahead position %llu",
+ em->start, em->len, ra_pos);
+ if (em_end > ra_end)
+ readahead_expand(ractl, ra_pos, em_end - ra_pos);
+}
+
/*
* basic readpage implementation. Locked extent state structs are inserted
* into the tree that are removed when the IO is done (by the end_io
@@ -939,16 +914,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
u64 start = folio_pos(folio);
- const u64 end = start + PAGE_SIZE - 1;
- u64 cur = start;
+ const u64 end = start + folio_size(folio) - 1;
u64 extent_offset;
u64 last_byte = i_size_read(inode);
- u64 block_start;
struct extent_map *em;
int ret = 0;
- size_t pg_offset = 0;
- size_t iosize;
- size_t blocksize = fs_info->sectorsize;
+ const size_t blocksize = fs_info->sectorsize;
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
@@ -956,48 +927,62 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
return ret;
}
- if (folio->index == last_byte >> folio_shift(folio)) {
+ if (folio_contains(folio, last_byte >> PAGE_SHIFT)) {
size_t zero_offset = offset_in_folio(folio, last_byte);
- if (zero_offset) {
- iosize = folio_size(folio) - zero_offset;
- folio_zero_range(folio, zero_offset, iosize);
- }
+ if (zero_offset)
+ folio_zero_range(folio, zero_offset,
+ folio_size(folio) - zero_offset);
}
bio_ctrl->end_io_func = end_bbio_data_read;
begin_folio_read(fs_info, folio);
- while (cur <= end) {
+ for (u64 cur = start; cur <= end; cur += blocksize) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
+ unsigned long pg_offset = offset_in_folio(folio, cur);
bool force_bio_submit = false;
u64 disk_bytenr;
+ u64 block_start;
ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
if (cur >= last_byte) {
- iosize = folio_size(folio) - pg_offset;
- folio_zero_range(folio, pg_offset, iosize);
- end_folio_read(folio, true, cur, iosize);
+ folio_zero_range(folio, pg_offset, end - cur + 1);
+ end_folio_read(folio, true, cur, end - cur + 1);
break;
}
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ end_folio_read(folio, true, cur, blocksize);
+ continue;
+ }
em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached);
if (IS_ERR(em)) {
end_folio_read(folio, false, cur, end + 1 - cur);
return PTR_ERR(em);
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ BUG_ON(btrfs_extent_map_end(em) <= cur);
BUG_ON(end < cur);
- compress_type = extent_map_compression(em);
+ compress_type = btrfs_extent_map_compression(em);
+
+ /*
+ * Only expand readahead for extents which are already creating
+ * the pages anyway in add_ra_bio_pages, which is compressed
+ * extents in the non subpage case.
+ */
+ if (bio_ctrl->ractl &&
+ !btrfs_is_subpage(fs_info, folio) &&
+ compress_type != BTRFS_COMPRESS_NONE)
+ btrfs_readahead_expand(bio_ctrl->ractl, em);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
- iosize = ALIGN(iosize, blocksize);
if (compress_type != BTRFS_COMPRESS_NONE)
disk_bytenr = em->disk_bytenr;
else
- disk_bytenr = extent_map_block_start(em) + extent_offset;
- block_start = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
+
if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
+ else
+ block_start = btrfs_extent_map_block_start(em);
/*
* If we have a file range that points to a compressed extent
@@ -1041,23 +1026,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (prev_em_start)
*prev_em_start = em->start;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- folio_zero_range(folio, pg_offset, iosize);
-
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ folio_zero_range(folio, pg_offset, blocksize);
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
/* the get_extent function already copied into the folio */
if (block_start == EXTENT_MAP_INLINE) {
- end_folio_read(folio, true, cur, iosize);
- cur = cur + iosize;
- pg_offset += iosize;
+ end_folio_read(folio, true, cur, blocksize);
continue;
}
@@ -1068,23 +1048,205 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached,
if (force_bio_submit)
submit_one_bio(bio_ctrl);
- submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize,
+ submit_extent_folio(bio_ctrl, disk_bytenr, folio, blocksize,
pg_offset);
- cur = cur + iosize;
- pg_offset += iosize;
}
-
return 0;
}
+/*
+ * Check if we can skip waiting the @ordered extent covering the block at @fileoff.
+ *
+ * @fileoff: Both input and output.
+ * Input as the file offset where the check should start at.
+ * Output as where the next check should start at,
+ * if the function returns true.
+ *
+ * Return true if we can skip to @fileoff. The caller needs to check the new
+ * @fileoff value to make sure it covers the full range, before skipping the
+ * full OE.
+ *
+ * Return false if we must wait for the ordered extent.
+ */
+static bool can_skip_one_ordered_range(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 *fileoff)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct folio *folio;
+ const u32 blocksize = fs_info->sectorsize;
+ u64 cur = *fileoff;
+ bool ret;
+
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT);
+
+ /*
+ * We should have locked the folio(s) for range [start, end], thus
+ * there must be a folio and it must be locked.
+ */
+ ASSERT(!IS_ERR(folio));
+ ASSERT(folio_test_locked(folio));
+
+ /*
+ * There are several cases for the folio and OE combination:
+ *
+ * 1) Folio has no private flag
+ * The OE has all its IO done but not yet finished, and folio got
+ * invalidated.
+ *
+ * Have we have to wait for the OE to finish, as it may contain the
+ * to-be-inserted data checksum.
+ * Without the data checksum inserted into the csum tree, read will
+ * just fail with missing csum.
+ */
+ if (!folio_test_private(folio)) {
+ ret = false;
+ goto out;
+ }
+
+ /*
+ * 2) The first block is DIRTY.
+ *
+ * This means the OE is created by some other folios whose file pos is
+ * before this one. And since we are holding the folio lock, the writeback
+ * of this folio cannot start.
+ *
+ * We must skip the whole OE, because it will never start until we
+ * finished our folio read and unlocked the folio.
+ */
+ if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_end(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ ret = true;
+ /*
+ * At least inside the folio, all the remaining blocks should
+ * also be dirty.
+ */
+ ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len));
+ *fileoff = ordered->file_offset + ordered->num_bytes;
+ goto out;
+ }
+
+ /*
+ * 3) The first block is uptodate.
+ *
+ * At least the first block can be skipped, but we are still not fully
+ * sure. E.g. if the OE has some other folios in the range that cannot
+ * be skipped.
+ * So we return true and update @next_ret to the OE/folio boundary.
+ */
+ if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) {
+ u64 range_len = min(folio_end(folio),
+ ordered->file_offset + ordered->num_bytes) - cur;
+
+ /*
+ * The whole range to the OE end or folio boundary should also
+ * be uptodate.
+ */
+ ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len));
+ ret = true;
+ *fileoff = cur + range_len;
+ goto out;
+ }
+
+ /*
+ * 4) The first block is not uptodate.
+ *
+ * This means the folio is invalidated after the writeback was finished,
+ * but by some other operations (e.g. block aligned buffered write) the
+ * folio is inserted into filemap.
+ * Very much the same as case 1).
+ */
+ ret = false;
+out:
+ folio_put(folio);
+ return ret;
+}
+
+static bool can_skip_ordered_extent(struct btrfs_inode *inode,
+ struct btrfs_ordered_extent *ordered,
+ u64 start, u64 end)
+{
+ const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1);
+ u64 cur = max(start, ordered->file_offset);
+
+ while (cur < range_end) {
+ bool can_skip;
+
+ can_skip = can_skip_one_ordered_range(inode, ordered, &cur);
+ if (!can_skip)
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Locking helper to make sure we get a stable view of extent maps for the
+ * involved range.
+ *
+ * This is for folio read paths (read and readahead), thus the involved range
+ * should have all the folios locked.
+ */
+static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end,
+ struct extent_state **cached_state)
+{
+ u64 cur_pos;
+
+ /* Caller must provide a valid @cached_state. */
+ ASSERT(cached_state);
+
+ /* The range must at least be page aligned, as all read paths are folio based. */
+ ASSERT(IS_ALIGNED(start, PAGE_SIZE));
+ ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE));
+
+again:
+ btrfs_lock_extent(&inode->io_tree, start, end, cached_state);
+ cur_pos = start;
+ while (cur_pos < end) {
+ struct btrfs_ordered_extent *ordered;
+
+ ordered = btrfs_lookup_ordered_range(inode, cur_pos,
+ end - cur_pos + 1);
+ /*
+ * No ordered extents in the range, and we hold the extent lock,
+ * no one can modify the extent maps in the range, we're safe to return.
+ */
+ if (!ordered)
+ break;
+
+ /* Check if we can skip waiting for the whole OE. */
+ if (can_skip_ordered_extent(inode, ordered, start, end)) {
+ cur_pos = min(ordered->file_offset + ordered->num_bytes,
+ end + 1);
+ btrfs_put_ordered_extent(ordered);
+ continue;
+ }
+
+ /* Now wait for the OE to finish. */
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start);
+ btrfs_put_ordered_extent(ordered);
+ /* We have unlocked the whole range, restart from the beginning. */
+ goto again;
+ }
+}
+
int btrfs_read_folio(struct file *file, struct folio *folio)
{
+ struct btrfs_inode *inode = folio_to_inode(folio);
+ const u64 start = folio_pos(folio);
+ const u64 end = start + folio_size(folio) - 1;
+ struct extent_state *cached_state = NULL;
struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ };
struct extent_map *em_cached = NULL;
int ret;
+ lock_extents_for_read(inode, start, end, &cached_state);
ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL);
- free_extent_map(em_cached);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+
+ btrfs_free_extent_map(em_cached);
/*
* If btrfs_do_readpage() failed we will want to submit the assembled
@@ -1102,7 +1264,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit
unsigned int start_bit;
unsigned int nbits;
- ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
nbits = len >> fs_info->sectorsize_bits;
ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
@@ -1115,12 +1277,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio,
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
const u64 folio_start = folio_pos(folio);
- const unsigned int bitmap_size = fs_info->sectors_per_page;
+ const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio);
unsigned int start_bit;
unsigned int first_zero;
unsigned int first_set;
- ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+ ASSERT(start >= folio_start && start < folio_start + folio_size(folio));
start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
@@ -1154,9 +1316,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode);
struct writeback_control *wbc = bio_ctrl->wbc;
- const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
+ const bool is_subpage = btrfs_is_subpage(fs_info, folio);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
@@ -1181,14 +1344,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
- if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
- ASSERT(fs_info->sectors_per_page > 1);
+ if (btrfs_is_subpage(fs_info, folio)) {
+ ASSERT(blocks_per_folio > 1);
btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap);
} else {
bio_ctrl->submit_bitmap = 1;
}
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
u64 start = page_start + (bit << fs_info->sectorsize_bits);
btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
@@ -1261,7 +1424,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
btrfs_root_id(inode->root),
btrfs_ino(inode),
folio_pos(folio),
- fs_info->sectors_per_page,
+ blocks_per_folio,
&bio_ctrl->submit_bitmap,
found_start, found_len, ret);
} else {
@@ -1269,8 +1432,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
* We've hit an error during previous delalloc range,
* have to cleanup the remaining locked ranges.
*/
- unlock_extent(&inode->io_tree, found_start,
- found_start + found_len - 1, NULL);
+ btrfs_unlock_extent(&inode->io_tree, found_start,
+ found_start + found_len - 1, NULL);
unlock_delalloc_folio(&inode->vfs_inode, folio,
found_start,
found_start + found_len - 1);
@@ -1306,7 +1469,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
unsigned int bitmap_size = min(
(last_finished_delalloc_end - page_start) >>
fs_info->sectorsize_bits,
- fs_info->sectors_per_page);
+ blocks_per_folio);
for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size)
btrfs_mark_ordered_io_finished(inode, folio,
@@ -1321,7 +1484,7 @@ out:
delalloc_end = page_end;
/*
* delalloc_end is already one less than the total length, so
- * we don't subtract one from PAGE_SIZE
+ * we don't subtract one from PAGE_SIZE.
*/
delalloc_to_write +=
DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE);
@@ -1330,7 +1493,7 @@ out:
* If all ranges are submitted asynchronously, we just need to account
* for them here.
*/
- if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) {
+ if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) {
wbc->nr_to_write -= delalloc_to_write;
return 1;
}
@@ -1349,7 +1512,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors.
+ * Return <0 for critical errors, and the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@@ -1372,23 +1535,32 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
- if (IS_ERR(em))
+ if (IS_ERR(em)) {
+ /*
+ * When submission failed, we should still clear the folio dirty.
+ * Or the folio will be written back again but without any
+ * ordered extent.
+ */
+ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
+ btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
return PTR_ERR(em);
+ }
extent_offset = filepos - em->start;
- em_end = extent_map_end(em);
+ em_end = btrfs_extent_map_end(em);
ASSERT(filepos <= em_end);
ASSERT(IS_ALIGNED(em->start, sectorsize));
ASSERT(IS_ALIGNED(em->len, sectorsize));
- block_start = extent_map_block_start(em);
- disk_bytenr = extent_map_block_start(em) + extent_offset;
+ block_start = btrfs_extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
- ASSERT(!extent_map_is_compressed(em));
+ ASSERT(!btrfs_extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
/*
@@ -1431,6 +1603,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
bool submitted_io = false;
bool error = false;
const u64 folio_start = folio_pos(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
u64 cur;
int bit;
int ret = 0;
@@ -1439,21 +1612,27 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
start + len <= folio_start + folio_size(folio));
ret = btrfs_writepage_cow_fixup(folio);
- if (ret) {
+ if (ret == -EAGAIN) {
/* Fixup worker will requeue */
folio_redirty_for_writepage(bio_ctrl->wbc, folio);
folio_unlock(folio);
return 1;
}
+ if (ret < 0) {
+ btrfs_folio_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_set_writeback(fs_info, folio, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
+ return ret;
+ }
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap,
- fs_info->sectors_per_page);
+ blocks_per_folio);
bio_ctrl->end_io_func = end_bbio_data_write;
- for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
if (cur >= i_size) {
@@ -1500,8 +1679,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
- * If we hit any error, the corresponding sector will still be dirty
- * thus no need to clear PAGECACHE_TAG_DIRTY.
+ * If we hit any error, the corresponding sector will have its dirty
+ * flag cleared and writeback finished, thus no need to handle the error case.
*/
if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
@@ -1526,7 +1705,8 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
int ret;
size_t pg_offset;
loff_t i_size = i_size_read(&inode->vfs_inode);
- unsigned long end_index = i_size >> PAGE_SHIFT;
+ const pgoff_t end_index = i_size >> PAGE_SHIFT;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc);
@@ -1540,7 +1720,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
return 0;
}
- if (folio->index == end_index)
+ if (folio_contains(folio, end_index))
folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset);
/*
@@ -1548,6 +1728,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
* The proper bitmap can only be initialized until writepage_delalloc().
*/
bio_ctrl->submit_bitmap = (unsigned long)-1;
+
+ /*
+ * If the page is dirty but without private set, it's marked dirty
+ * without informing the fs.
+ * Nowadays that is a bug, since the introduction of
+ * pin_user_pages*().
+ *
+ * So here we check if the page has private set to rule out such
+ * case.
+ * But we also have a long history of relying on the COW fixup,
+ * so here we only enable this check for experimental builds until
+ * we're sure it's safe.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) &&
+ unlikely(!folio_test_private(folio))) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ btrfs_root_id(inode->root),
+ btrfs_ino(inode), folio_pos(folio));
+ ret = -EUCLEAN;
+ goto done;
+ }
+
ret = set_folio_extent_mapped(folio);
if (ret < 0)
goto done;
@@ -1559,14 +1763,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl
goto done;
ret = extent_writepage_io(inode, folio, folio_pos(folio),
- PAGE_SIZE, bio_ctrl, i_size);
+ folio_size(folio), bio_ctrl, i_size);
if (ret == 1)
return 0;
if (ret < 0)
btrfs_err_rl(fs_info,
"failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d",
btrfs_root_id(inode->root), btrfs_ino(inode),
- folio_pos(folio), fs_info->sectors_per_page,
+ folio_pos(folio), blocks_per_folio,
&bio_ctrl->submit_bitmap, ret);
bio_ctrl->wbc->nr_to_write--;
@@ -1612,8 +1816,19 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
*/
spin_lock(&eb->refs_lock);
if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
spin_unlock(&eb->refs_lock);
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
-eb->len,
@@ -1699,49 +1914,167 @@ static void set_btree_ioerr(struct extent_buffer *eb)
}
}
+static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_set_mark(&xas, mark);
+ xas_unlock_irqrestore(&xas, flags);
+}
+
+static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, mark);
+ xas_unlock_irqrestore(&xas, flags);
+}
+
+static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info,
+ unsigned long start, unsigned long end)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, start);
+ unsigned int tagged = 0;
+ void *eb;
+
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
+ }
+ xas_unlock_irq(&xas);
+}
+
+struct eb_batch {
+ unsigned int nr;
+ unsigned int cur;
+ struct extent_buffer *ebs[PAGEVEC_SIZE];
+};
+
+static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb)
+{
+ batch->ebs[batch->nr++] = eb;
+ return (batch->nr < PAGEVEC_SIZE);
+}
+
+static inline void eb_batch_init(struct eb_batch *batch)
+{
+ batch->nr = 0;
+ batch->cur = 0;
+}
+
+static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch)
+{
+ if (batch->cur >= batch->nr)
+ return NULL;
+ return batch->ebs[batch->cur++];
+}
+
+static inline void eb_batch_release(struct eb_batch *batch)
+{
+ for (unsigned int i = 0; i < batch->nr; i++)
+ free_extent_buffer(batch->ebs[i]);
+ eb_batch_init(batch);
+}
+
+static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max,
+ xa_mark_t mark)
+{
+ struct extent_buffer *eb;
+
+retry:
+ eb = xas_find_marked(xas, max, mark);
+
+ if (xas_retry(xas, eb))
+ goto retry;
+
+ if (!eb)
+ return NULL;
+
+ if (!refcount_inc_not_zero(&eb->refs)) {
+ xas_reset(xas);
+ goto retry;
+ }
+
+ if (unlikely(eb != xas_reload(xas))) {
+ free_extent_buffer(eb);
+ xas_reset(xas);
+ goto retry;
+ }
+
+ return eb;
+}
+
+static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info,
+ unsigned long *start,
+ unsigned long end, xa_mark_t tag,
+ struct eb_batch *batch)
+{
+ XA_STATE(xas, &fs_info->buffer_tree, *start);
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ while ((eb = find_get_eb(&xas, end, tag)) != NULL) {
+ if (!eb_batch_add(batch, eb)) {
+ *start = ((eb->start + eb->len) >> fs_info->nodesize_bits);
+ goto out;
+ }
+ }
+ if (end == ULONG_MAX)
+ *start = ULONG_MAX;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return batch->nr;
+}
+
/*
* The endio specific version which won't touch any unsafe spinlock in endio
* context.
*/
static struct extent_buffer *find_extent_buffer_nolock(
- const struct btrfs_fs_info *fs_info, u64 start)
+ struct btrfs_fs_info *fs_info, u64 start)
{
struct extent_buffer *eb;
+ unsigned long index = (start >> fs_info->nodesize_bits);
rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- return eb;
- }
+ eb = xa_load(&fs_info->buffer_tree, index);
+ if (eb && !refcount_inc_not_zero(&eb->refs))
+ eb = NULL;
rcu_read_unlock();
- return NULL;
+ return eb;
}
static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
struct folio_iter fi;
- u32 bio_offset = 0;
if (bbio->bio.bi_status != BLK_STS_OK)
set_btree_ioerr(eb);
bio_for_each_folio_all(fi, &bbio->bio) {
- u64 start = eb->start + bio_offset;
- struct folio *folio = fi.folio;
- u32 len = fi.length;
-
- btrfs_folio_clear_writeback(fs_info, folio, start, len);
- bio_offset += len;
+ btrfs_meta_folio_clear_writeback(fi.folio, eb);
}
- clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
-
+ buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK);
+ clear_and_wake_up_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
bio_put(&bbio->bio);
}
@@ -1789,199 +2122,56 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
- if (fs_info->nodesize < PAGE_SIZE) {
- struct folio *folio = eb->folios[0];
- bool ret;
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_end(folio),
+ eb->start + eb->len) - range_start;
folio_lock(folio);
- btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
- eb->len)) {
- folio_clear_dirty_for_io(folio);
- wbc->nr_to_write--;
- }
- ret = bio_add_folio(&bbio->bio, folio, eb->len,
- eb->start - folio_pos(folio));
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->len);
- folio_unlock(folio);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
- bool ret;
-
- folio_lock(folio);
- folio_clear_dirty_for_io(folio);
- folio_start_writeback(folio);
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
+ btrfs_meta_folio_clear_dirty(folio, eb);
+ btrfs_meta_folio_set_writeback(folio, eb);
+ if (!folio_test_dirty(folio))
wbc->nr_to_write -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
+ wbc_account_cgroup_owner(wbc, folio, range_len);
+ folio_unlock(folio);
}
btrfs_submit_bbio(bbio, 0);
}
/*
- * Submit one subpage btree page.
- *
- * The main difference to submit_eb_page() is:
- * - Page locking
- * For subpage, we don't rely on page locking at all.
- *
- * - Flush write bio
- * We only flush bio if we may be unable to fit current extent buffers into
- * current bio.
+ * Wait for all eb writeback in the given range to finish.
*
- * Return >=0 for the number of submitted extent buffers.
- * Return <0 for fatal error.
+ * @fs_info: The fs_info for this file system.
+ * @start: The offset of the range to start waiting on writeback.
+ * @end: The end of the range, inclusive. This is meant to be used in
+ * conjuction with wait_marked_extents, so this will usually be
+ * the_next_eb->start - 1.
*/
-static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc)
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start,
+ u64 end)
{
- struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- int submitted = 0;
- u64 folio_start = folio_pos(folio);
- int bit_start = 0;
- int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
-
- /* Lock and write each dirty extent buffers in the range */
- while (bit_start < fs_info->sectors_per_page) {
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct eb_batch batch;
+ unsigned long start_index = (start >> fs_info->nodesize_bits);
+ unsigned long end_index = (end >> fs_info->nodesize_bits);
+
+ eb_batch_init(&batch);
+ while (start_index <= end_index) {
struct extent_buffer *eb;
- unsigned long flags;
- u64 start;
+ unsigned int nr_ebs;
- /*
- * Take private lock to ensure the subpage won't be detached
- * in the meantime.
- */
- spin_lock(&folio->mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&folio->mapping->i_private_lock);
+ nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index,
+ PAGECACHE_TAG_WRITEBACK, &batch);
+ if (!nr_ebs)
break;
- }
- spin_lock_irqsave(&subpage->lock, flags);
- if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page,
- subpage->bitmaps)) {
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
- bit_start++;
- continue;
- }
-
- start = folio_start + bit_start * fs_info->sectorsize;
- bit_start += sectors_per_node;
-
- /*
- * Here we just want to grab the eb without touching extra
- * spin locks, so call find_extent_buffer_nolock().
- */
- eb = find_extent_buffer_nolock(fs_info, start);
- spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&folio->mapping->i_private_lock);
-
- /*
- * The eb has already reached 0 refs thus find_extent_buffer()
- * doesn't return it. We don't need to write back such eb
- * anyway.
- */
- if (!eb)
- continue;
-
- if (lock_extent_buffer_for_io(eb, wbc)) {
- write_one_eb(eb, wbc);
- submitted++;
- }
- free_extent_buffer(eb);
- }
- return submitted;
-}
-
-/*
- * Submit all page(s) of one extent buffer.
- *
- * @page: the page of one extent buffer
- * @eb_context: to determine if we need to submit this page, if current page
- * belongs to this eb, we don't need to submit
- *
- * The caller should pass each page in their bytenr order, and here we use
- * @eb_context to determine if we have submitted pages of one extent buffer.
- *
- * If we have, we just skip until we hit a new page that doesn't belong to
- * current @eb_context.
- *
- * If not, we submit all the page(s) of the extent buffer.
- *
- * Return >0 if we have submitted the extent buffer successfully.
- * Return 0 if we don't need to submit the page, as it's already submitted by
- * previous call.
- * Return <0 for fatal error.
- */
-static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx)
-{
- struct writeback_control *wbc = ctx->wbc;
- struct address_space *mapping = folio->mapping;
- struct extent_buffer *eb;
- int ret;
-
- if (!folio_test_private(folio))
- return 0;
-
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
- return submit_eb_subpage(folio, wbc);
-
- spin_lock(&mapping->i_private_lock);
- if (!folio_test_private(folio)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- eb = folio_get_private(folio);
-
- /*
- * Shouldn't happen and normally this would be a BUG_ON but no point
- * crashing the machine for something we can survive anyway.
- */
- if (WARN_ON(!eb)) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
-
- if (eb == ctx->eb) {
- spin_unlock(&mapping->i_private_lock);
- return 0;
- }
- ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->i_private_lock);
- if (!ret)
- return 0;
-
- ctx->eb = eb;
-
- ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx);
- if (ret) {
- if (ret == -EBUSY)
- ret = 0;
- free_extent_buffer(eb);
- return ret;
- }
- if (!lock_extent_buffer_for_io(eb, wbc)) {
- free_extent_buffer(eb);
- return 0;
- }
- /* Implies write in zoned mode. */
- if (ctx->zoned_bg) {
- /* Mark the last eb in the block group. */
- btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb);
- ctx->zoned_bg->meta_write_pointer += eb->len;
+ while ((eb = eb_batch_next(&batch)) != NULL)
+ wait_on_extent_buffer_writeback(eb);
+ eb_batch_release(&batch);
+ cond_resched();
}
- write_one_eb(eb, wbc);
- free_extent_buffer(eb);
- return 1;
}
int btree_write_cache_pages(struct address_space *mapping,
@@ -1992,25 +2182,27 @@ int btree_write_cache_pages(struct address_space *mapping,
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
- struct folio_batch fbatch;
- unsigned int nr_folios;
- pgoff_t index;
- pgoff_t end; /* Inclusive */
+ struct eb_batch batch;
+ unsigned int nr_ebs;
+ unsigned long index;
+ unsigned long end;
int scanned = 0;
xa_mark_t tag;
- folio_batch_init(&fbatch);
+ eb_batch_init(&batch);
if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
+ index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->nodesize_bits);
end = -1;
+
/*
* Start from the beginning does not need to cycle over the
* range, mark it as scanned.
*/
scanned = (index == 0);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ index = (wbc->range_start >> fs_info->nodesize_bits);
+ end = (wbc->range_end >> fs_info->nodesize_bits);
+
scanned = 1;
}
if (wbc->sync_mode == WB_SYNC_ALL)
@@ -2020,31 +2212,39 @@ int btree_write_cache_pages(struct address_space *mapping,
btrfs_zoned_meta_io_lock(fs_info);
retry:
if (wbc->sync_mode == WB_SYNC_ALL)
- tag_pages_for_writeback(mapping, index, end);
+ buffer_tree_tag_for_writeback(fs_info, index, end);
while (!done && !nr_to_write_done && (index <= end) &&
- (nr_folios = filemap_get_folios_tag(mapping, &index, end,
- tag, &fbatch))) {
- unsigned i;
+ (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) {
+ struct extent_buffer *eb;
- for (i = 0; i < nr_folios; i++) {
- struct folio *folio = fbatch.folios[i];
+ while ((eb = eb_batch_next(&batch)) != NULL) {
+ ctx.eb = eb;
- ret = submit_eb_page(folio, &ctx);
- if (ret == 0)
+ ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx);
+ if (ret) {
+ if (ret == -EBUSY)
+ ret = 0;
+
+ if (ret) {
+ done = 1;
+ break;
+ }
continue;
- if (ret < 0) {
- done = 1;
- break;
}
- /*
- * the filesystem may choose to bump up nr_to_write.
- * We have to make sure to honor the new nr_to_write
- * at any time
- */
- nr_to_write_done = wbc->nr_to_write <= 0;
+ if (!lock_extent_buffer_for_io(eb, wbc))
+ continue;
+
+ /* Implies write in zoned mode. */
+ if (ctx.zoned_bg) {
+ /* Mark the last eb in the block group. */
+ btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb);
+ ctx.zoned_bg->meta_write_pointer += eb->len;
+ }
+ write_one_eb(eb, wbc);
}
- folio_batch_release(&fbatch);
+ nr_to_write_done = (wbc->nr_to_write <= 0);
+ eb_batch_release(&batch);
cond_resched();
}
if (!scanned && !done) {
@@ -2189,10 +2389,8 @@ retry:
done_index = folio_next_index(folio);
/*
* At this point we hold neither the i_pages lock nor
- * the page lock: the page may be truncated or
- * invalidated (changing page->mapping to NULL),
- * or even swizzled back from swapper_space to
- * tmpfs file mapping
+ * the folio lock: the folio may be truncated or
+ * invalidated (changing folio->mapping to NULL).
*/
if (!folio_trylock(folio)) {
submit_write_bio(bio_ctrl, 0);
@@ -2230,7 +2428,7 @@ retry:
* regular submission.
*/
if (wbc->sync_mode != WB_SYNC_NONE ||
- btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
+ btrfs_is_subpage(inode_to_fs_info(inode), folio)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2311,8 +2509,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
while (cur <= end) {
- u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
- u32 cur_len = cur_end + 1 - cur;
+ u64 cur_end;
+ u32 cur_len;
struct folio *folio;
folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
@@ -2322,13 +2520,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
* code is just in case, but shouldn't actually be run.
*/
if (IS_ERR(folio)) {
+ cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
+ cur_len = cur_end + 1 - cur;
btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
cur, cur_len, false);
mapping_set_error(mapping, PTR_ERR(folio));
- cur = cur_end + 1;
+ cur = cur_end;
continue;
}
+ cur_end = min_t(u64, folio_end(folio) - 1, end);
+ cur_len = cur_end + 1 - cur;
+
ASSERT(folio_test_locked(folio));
if (pages_dirty && folio != locked_folio)
ASSERT(folio_test_dirty(folio));
@@ -2378,16 +2581,27 @@ int btrfs_writepages(struct address_space *mapping, struct writeback_control *wb
void btrfs_readahead(struct readahead_control *rac)
{
- struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD };
+ struct btrfs_bio_ctrl bio_ctrl = {
+ .opf = REQ_OP_READ | REQ_RAHEAD,
+ .ractl = rac
+ };
struct folio *folio;
+ struct btrfs_inode *inode = BTRFS_I(rac->mapping->host);
+ const u64 start = readahead_pos(rac);
+ const u64 end = start + readahead_length(rac) - 1;
+ struct extent_state *cached_state = NULL;
struct extent_map *em_cached = NULL;
u64 prev_em_start = (u64)-1;
+ lock_extents_for_read(inode, start, end, &cached_state);
+
while ((folio = readahead_folio(rac)) != NULL)
btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+
if (em_cached)
- free_extent_map(em_cached);
+ btrfs_free_extent_map(em_cached);
submit_one_bio(&bio_ctrl);
}
@@ -2411,7 +2625,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
if (start > end)
return 0;
- lock_extent(tree, start, end, &cached_state);
+ btrfs_lock_extent(tree, start, end, &cached_state);
folio_wait_writeback(folio);
/*
@@ -2419,46 +2633,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* so here we only need to unlock the extent range to free any
* existing extent state.
*/
- unlock_extent(tree, start, end, &cached_state);
+ btrfs_unlock_extent(tree, start, end, &cached_state);
return 0;
}
/*
- * a helper for release_folio, this tests for areas of the page that
- * are locked or under IO and drops the related state bits if it is safe
- * to drop the page.
+ * A helper for struct address_space_operations::release_folio, this tests for
+ * areas of the folio that are locked or under IO and drops the related state
+ * bits if it is safe to drop the folio.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
struct folio *folio)
{
+ struct extent_state *cached_state = NULL;
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
- bool ret;
+ u64 end = start + folio_size(folio) - 1;
+ u32 range_bits;
+ u32 clear_bits;
+ bool ret = false;
+ int ret2;
- if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) {
- ret = false;
- } else {
- u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM |
- EXTENT_DELALLOC_NEW | EXTENT_CTLBITS |
- EXTENT_QGROUP_RESERVED);
- int ret2;
+ btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state);
- /*
- * At this point we can safely clear everything except the
- * locked bit, the nodatasum bit and the delalloc new bit.
- * The delalloc new bit will be cleared by ordered extent
- * completion.
- */
- ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL);
+ /*
+ * We can release the folio if it's locked only for ordered extent
+ * completion, since that doesn't require using the folio.
+ */
+ if ((range_bits & EXTENT_LOCKED) &&
+ !(range_bits & EXTENT_FINISHING_ORDERED))
+ goto out;
+
+ clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW |
+ EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED |
+ EXTENT_FINISHING_ORDERED);
+ /*
+ * At this point we can safely clear everything except the locked,
+ * nodatasum, delalloc new and finishing ordered bits. The delalloc new
+ * bit will be cleared by ordered extent completion.
+ */
+ ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state);
+ /*
+ * If clear_extent_bit failed for enomem reasons, we can't allow the
+ * release to continue.
+ */
+ if (ret2 == 0)
+ ret = true;
+out:
+ btrfs_free_extent_state(cached_state);
- /* if clear_extent_bit failed for enomem reasons,
- * we can't allow the release to continue.
- */
- if (ret2 < 0)
- ret = false;
- else
- ret = true;
- }
return ret;
}
@@ -2470,7 +2692,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree,
bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
{
u64 start = folio_pos(folio);
- u64 end = start + PAGE_SIZE - 1;
+ u64 end = start + folio_size(folio) - 1;
struct btrfs_inode *inode = folio_to_inode(folio);
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -2481,18 +2703,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask)
struct extent_map *em;
write_lock(&extent_tree->lock);
- em = lookup_extent_mapping(extent_tree, start, len);
+ em = btrfs_lookup_extent_mapping(extent_tree, start, len);
if (!em) {
write_unlock(&extent_tree->lock);
break;
}
if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) {
write_unlock(&extent_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
- if (test_range_bit_exists(io_tree, em->start,
- extent_map_end(em) - 1, EXTENT_LOCKED))
+ if (btrfs_test_range_bit_exists(io_tree, em->start,
+ btrfs_extent_map_end(em) - 1,
+ EXTENT_LOCKED))
goto next;
/*
* If it's not in the list of modified extents, used by a fast
@@ -2519,15 +2742,15 @@ remove_em:
* fsync performance for workloads with a data size that exceeds
* or is close to the system's memory).
*/
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
/* Once for the inode's extent map tree. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
- start = extent_map_end(em);
+ start = btrfs_extent_map_end(em);
write_unlock(&extent_tree->lock);
/* Once for us, for the lookup_extent_mapping() reference. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (need_resched()) {
/*
@@ -2551,13 +2774,13 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
static bool folio_range_has_eb(struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
lockdep_assert_held(&folio->mapping->i_private_lock);
if (folio_test_private(folio)) {
- subpage = folio_get_private(folio);
- if (atomic_read(&subpage->eb_refs))
+ bfs = folio_get_private(folio);
+ if (atomic_read(&bfs->eb_refs))
return true;
}
return false;
@@ -2566,6 +2789,7 @@ static bool folio_range_has_eb(struct folio *folio)
static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = folio->mapping;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
@@ -2573,21 +2797,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&folio->mapping->i_private_lock);
+ spin_lock(&mapping->i_private_lock);
if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
return;
}
- if (fs_info->nodesize >= PAGE_SIZE) {
+ if (!btrfs_meta_is_subpage(fs_info)) {
/*
- * We do this since we'll remove the pages after we've
- * removed the eb from the radix tree, so we could race
- * and have this page now attached to the new eb. So
- * only clear folio if it's still connected to
- * this eb.
+ * We do this since we'll remove the pages after we've removed
+ * the eb from the xarray, so we could race and have this page
+ * now attached to the new eb. So only clear folio if it's
+ * still connected to this eb.
*/
if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
@@ -2597,7 +2820,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
return;
}
@@ -2607,7 +2830,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return;
}
@@ -2618,9 +2841,9 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* page range and no unfinished IO.
*/
if (!folio_range_has_eb(folio))
- btrfs_detach_subpage(fs_info, folio);
+ btrfs_detach_folio_state(fs_info, folio, BTRFS_SUBPAGE_METADATA);
- spin_unlock(&folio->mapping->i_private_lock);
+ spin_unlock(&mapping->i_private_lock);
}
/* Release all folios attached to the extent buffer */
@@ -2635,9 +2858,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb)
continue;
detach_extent_buffer_folio(eb, folio);
-
- /* One for when we allocated the folio. */
- folio_put(folio);
}
}
@@ -2651,35 +2871,52 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
-static struct extent_buffer *
-__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len)
+static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb = NULL;
eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
eb->start = start;
- eb->len = len;
+ eb->len = fs_info->nodesize;
eb->fs_info = fs_info;
init_rwsem(&eb->lock);
btrfs_leak_debug_add_eb(eb);
spin_lock_init(&eb->refs_lock);
- atomic_set(&eb->refs, 1);
+ refcount_set(&eb->refs, 1);
- ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
+ ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE);
return eb;
}
+/*
+ * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer()
+ * does not call folio_put(), and we need to set the folios to NULL so that
+ * btrfs_release_extent_buffer() will not detach them a second time.
+ */
+static void cleanup_extent_buffer_folios(struct extent_buffer *eb)
+{
+ const int num_folios = num_extent_folios(eb);
+
+ /* We canont use num_extent_folios() as loop bound as eb->folios changes. */
+ for (int i = 0; i < num_folios; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
+ }
+}
+
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
struct extent_buffer *new;
- int num_folios = num_extent_folios(src);
+ int num_folios;
int ret;
- new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
+ new = __alloc_extent_buffer(src->fs_info, src->start);
if (new == NULL)
return NULL;
@@ -2691,78 +2928,78 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
ret = alloc_eb_folio_array(new, false);
- if (ret) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
+ if (ret)
+ goto release_eb;
+ ASSERT(num_extent_folios(src) == num_extent_folios(new),
+ "%d != %d", num_extent_folios(src), num_extent_folios(new));
+ /* Explicitly use the cached num_extent value from now on. */
+ num_folios = num_extent_folios(src);
for (int i = 0; i < num_folios; i++) {
struct folio *folio = new->folios[i];
ret = attach_extent_buffer_folio(new, folio, NULL);
- if (ret < 0) {
- btrfs_release_extent_buffer(new);
- return NULL;
- }
+ if (ret < 0)
+ goto cleanup_folios;
WARN_ON(folio_test_dirty(folio));
}
+ for (int i = 0; i < num_folios; i++)
+ folio_put(new->folios[i]);
+
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
return new;
+
+cleanup_folios:
+ cleanup_extent_buffer_folios(new);
+release_eb:
+ btrfs_release_extent_buffer(new);
+ return NULL;
}
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- int num_folios = 0;
int ret;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return NULL;
ret = alloc_eb_folio_array(eb, false);
if (ret)
- goto err;
+ goto release_eb;
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
- goto err;
+ goto cleanup_folios;
}
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ folio_put(eb->folios[i]);
set_extent_buffer_uptodate(eb);
btrfs_set_header_nritems(eb, 0);
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
return eb;
-err:
- for (int i = 0; i < num_folios; i++) {
- if (eb->folios[i]) {
- detach_extent_buffer_folio(eb, eb->folios[i]);
- folio_put(eb->folios[i]);
- }
- }
- kmem_cache_free(extent_buffer_cache, eb);
- return NULL;
-}
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start)
-{
- return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
+cleanup_folios:
+ cleanup_extent_buffer_folios(eb);
+release_eb:
+ btrfs_release_extent_buffer(eb);
+ return NULL;
}
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
/*
- * The TREE_REF bit is first set when the extent_buffer is added
- * to the radix tree. It is also reset, if unset, when a new reference
- * is created by find_extent_buffer.
+ * The TREE_REF bit is first set when the extent_buffer is added to the
+ * xarray. It is also reset, if unset, when a new reference is created
+ * by find_extent_buffer.
*
* It is only cleared in two cases: freeing the last non-tree
* reference to the extent_buffer when its STALE bit is set or
@@ -2774,31 +3011,28 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
* conditions between the calls to check_buffer_tree_ref in those
* codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * The actual lifetime of the extent_buffer in the radix tree is
- * adequately protected by the refcount, but the TREE_REF bit and
- * its corresponding reference are not. To protect against this
- * class of races, we call check_buffer_tree_ref from the codepaths
- * which trigger io. Note that once io is initiated, TREE_REF can no
- * longer be cleared, so that is the moment at which any such race is
- * best fixed.
+ * The actual lifetime of the extent_buffer in the xarray is adequately
+ * protected by the refcount, but the TREE_REF bit and its corresponding
+ * reference are not. To protect against this class of races, we call
+ * check_buffer_tree_ref() from the code paths which trigger io. Note that
+ * once io is initiated, TREE_REF can no longer be cleared, so that is
+ * the moment at which any such race is best fixed.
*/
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
return;
spin_lock(&eb->refs_lock);
if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
spin_unlock(&eb->refs_lock);
}
static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_folios= num_extent_folios(eb);
-
check_buffer_tree_ref(eb);
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
folio_mark_accessed(eb->folios[i]);
}
@@ -2831,10 +3065,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
}
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -2846,32 +3080,34 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
+ xa_lock_irq(&fs_info->buffer_tree);
+ exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->nodesize_bits,
+ NULL, eb, GFP_NOFS);
+ if (xa_is_err(exists)) {
+ ret = xa_err(exists);
+ xa_unlock_irq(&fs_info->buffer_tree);
+ btrfs_release_extent_buffer(eb);
+ return ERR_PTR(ret);
}
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
- else
+ if (exists) {
+ if (!refcount_inc_not_zero(&exists->refs)) {
+ /* The extent buffer is being freed, retry. */
+ xa_unlock_irq(&fs_info->buffer_tree);
goto again;
+ }
+ xa_unlock_irq(&fs_info->buffer_tree);
+ btrfs_release_extent_buffer(eb);
+ return exists;
}
+ xa_unlock_irq(&fs_info->buffer_tree);
check_buffer_tree_ref(eb);
- set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
return eb;
-free_eb:
- btrfs_release_extent_buffer(eb);
- return exists;
-}
+#else
+ /* Stub to avoid linker error when compiled with optimizations turned off. */
+ return NULL;
#endif
+}
static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
struct folio *folio)
@@ -2881,11 +3117,11 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
lockdep_assert_held(&folio->mapping->i_private_lock);
/*
- * For subpage case, we completely rely on radix tree to ensure we
- * don't try to insert two ebs for the same bytenr. So here we always
- * return NULL and just continue.
+ * For subpage case, we completely rely on xarray to ensure we don't try
+ * to insert two ebs for the same bytenr. So here we always return NULL
+ * and just continue.
*/
- if (fs_info->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(fs_info))
return NULL;
/* Page not yet attached to an extent buffer */
@@ -2899,7 +3135,7 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info,
* just overwrite folio private.
*/
exists = folio_get_private(folio);
- if (atomic_inc_not_zero(&exists->refs))
+ if (refcount_inc_not_zero(&exists->refs))
return exists;
WARN_ON(folio_test_dirty(folio));
@@ -2917,10 +3153,9 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return true;
}
- if (fs_info->nodesize < PAGE_SIZE &&
- offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
+ if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) {
btrfs_err(fs_info,
- "tree block crosses page boundary, start %llu nodesize %u",
+ "tree block is not nodesize aligned, start %llu nodesize %u",
start, fs_info->nodesize);
return true;
}
@@ -2949,14 +3184,14 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
* The caller needs to free the existing folios and retry using the same order.
*/
static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
- struct btrfs_subpage *prealloc,
+ struct btrfs_folio_state *prealloc,
struct extent_buffer **found_eb_ret)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
- const unsigned long index = eb->start >> PAGE_SHIFT;
- struct folio *existing_folio = NULL;
+ const pgoff_t index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
int ret;
ASSERT(found_eb_ret);
@@ -2965,6 +3200,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
ASSERT(eb->folios[i]);
retry:
+ existing_folio = NULL;
ret = filemap_add_folio(mapping, eb->folios[i], index + i,
GFP_NOFS | __GFP_NOFAIL);
if (!ret)
@@ -2972,10 +3208,8 @@ retry:
existing_folio = filemap_lock_folio(mapping, index + i);
/* The page cache only exists for a very short time, just retry. */
- if (IS_ERR(existing_folio)) {
- existing_folio = NULL;
+ if (IS_ERR(existing_folio))
goto retry;
- }
/* For now, we should only have single-page folios for btree inode. */
ASSERT(folio_nr_pages(existing_folio) == 1);
@@ -2988,7 +3222,7 @@ retry:
finish:
spin_lock(&mapping->i_private_lock);
- if (existing_folio && fs_info->nodesize < PAGE_SIZE) {
+ if (existing_folio && btrfs_meta_is_subpage(fs_info)) {
/* We're going to reuse the existing page, can drop our folio now. */
__free_page(folio_page(eb->folios[i], 0));
eb->folios[i] = existing_folio;
@@ -3016,7 +3250,7 @@ finish:
/*
* To inform we have an extra eb under allocation, so that
* detach_extent_buffer_page() won't release the folio private when the
- * eb hasn't been inserted into radix tree yet.
+ * eb hasn't been inserted into the xarray yet.
*
* The ref will be decreased when the eb releases the page, in
* detach_extent_buffer_page(). Thus needs no special handling in the
@@ -3030,12 +3264,10 @@ finish:
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
- unsigned long len = fs_info->nodesize;
- int num_folios;
int attached = 0;
struct extent_buffer *eb;
struct extent_buffer *existing_eb = NULL;
- struct btrfs_subpage *prealloc = NULL;
+ struct btrfs_folio_state *prealloc = NULL;
u64 lockdep_owner = owner_root;
bool page_contig = true;
int uptodate = 1;
@@ -3059,7 +3291,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len);
+ eb = __alloc_extent_buffer(fs_info, start);
if (!eb)
return ERR_PTR(-ENOMEM);
@@ -3079,8 +3311,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
*/
- if (fs_info->nodesize < PAGE_SIZE) {
- prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
+ if (btrfs_meta_is_subpage(fs_info)) {
+ prealloc = btrfs_alloc_folio_state(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
ret = PTR_ERR(prealloc);
goto out;
@@ -3091,13 +3323,12 @@ reallocate:
/* Allocate all pages first. */
ret = alloc_eb_folio_array(eb, true);
if (ret < 0) {
- btrfs_free_subpage(prealloc);
+ btrfs_free_folio_state(prealloc);
goto out;
}
- num_folios = num_extent_folios(eb);
/* Attach all pages to the filemap. */
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio;
ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb);
@@ -3126,7 +3357,7 @@ reallocate:
* using 0-order folios.
*/
if (unlikely(ret == -EAGAIN)) {
- ASSERT(0);
+ DEBUG_WARN("folio order mismatch between new eb and filemap");
goto reallocate;
}
attached++;
@@ -3137,7 +3368,7 @@ reallocate:
* and free the allocated page.
*/
folio = eb->folios[i];
- WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
+ WARN_ON(btrfs_meta_folio_test_dirty(folio, eb));
/*
* Check if the current page is physically contiguous with previous eb
@@ -3148,15 +3379,14 @@ reallocate:
if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
page_contig = false;
- if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
+ if (!btrfs_meta_folio_test_uptodate(folio, eb))
uptodate = 0;
/*
* We can't unlock the pages just yet since the extent buffer
- * hasn't been properly inserted in the radix tree, this
- * opens a race with btree_release_folio which can free a page
- * while we are still filling in all pages for the buffer and
- * we could crash.
+ * hasn't been properly inserted into the xarray, this opens a
+ * race with btree_release_folio() which can free a page while we
+ * are still filling in all pages for the buffer and we could crash.
*/
}
if (uptodate)
@@ -3165,38 +3395,46 @@ reallocate:
if (page_contig)
eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
- ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ xa_lock_irq(&fs_info->buffer_tree);
+ existing_eb = __xa_cmpxchg(&fs_info->buffer_tree,
+ start >> fs_info->nodesize_bits, NULL, eb,
+ GFP_NOFS);
+ if (xa_is_err(existing_eb)) {
+ ret = xa_err(existing_eb);
+ xa_unlock_irq(&fs_info->buffer_tree);
goto out;
-
- spin_lock(&fs_info->buffer_lock);
- ret = radix_tree_insert(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits, eb);
- spin_unlock(&fs_info->buffer_lock);
- radix_tree_preload_end();
- if (ret == -EEXIST) {
- ret = 0;
- existing_eb = find_extent_buffer(fs_info, start);
- if (existing_eb)
- goto out;
- else
+ }
+ if (existing_eb) {
+ if (!refcount_inc_not_zero(&existing_eb->refs)) {
+ xa_unlock_irq(&fs_info->buffer_tree);
goto again;
+ }
+ xa_unlock_irq(&fs_info->buffer_tree);
+ goto out;
}
+ xa_unlock_irq(&fs_info->buffer_tree);
+
/* add one reference for the tree */
check_buffer_tree_ref(eb);
- set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* Now it's safe to unlock the pages because any calls to
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++) {
folio_unlock(eb->folios[i]);
+ /*
+ * A folio that has been added to an address_space mapping
+ * should not continue holding the refcount from its original
+ * allocation indefinitely.
+ */
+ folio_put(eb->folios[i]);
+ }
return eb;
out:
- WARN_ON(!atomic_dec_and_test(&eb->refs));
+ WARN_ON(!refcount_dec_and_test(&eb->refs));
/*
* Any attached folios need to be detached before we unlock them. This
@@ -3206,26 +3444,22 @@ out:
* want that to grab this eb, as we're getting ready to free it. So we
* have to detach it first and then unlock it.
*
- * We have to drop our reference and NULL it out here because in the
- * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
- * Below when we call btrfs_release_extent_buffer() we will call
- * detach_extent_buffer_folio() on our remaining pages in the !subpage
- * case. If we left eb->folios[i] populated in the subpage case we'd
- * double put our reference and be super sad.
+ * Note: the bounds is num_extent_pages() as we need to go through all slots.
*/
- for (int i = 0; i < attached; i++) {
- ASSERT(eb->folios[i]);
- detach_extent_buffer_folio(eb, eb->folios[i]);
- folio_unlock(eb->folios[i]);
- folio_put(eb->folios[i]);
+ for (int i = 0; i < num_extent_pages(eb); i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (i < attached) {
+ ASSERT(folio);
+ detach_extent_buffer_folio(eb, folio);
+ folio_unlock(folio);
+ } else if (!folio) {
+ continue;
+ }
+
+ folio_put(folio);
eb->folios[i] = NULL;
}
- /*
- * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utilizing page->mapping.
- */
- set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
-
btrfs_release_extent_buffer(eb);
if (ret < 0)
return ERR_PTR(ret);
@@ -3246,20 +3480,28 @@ static int release_extent_buffer(struct extent_buffer *eb)
{
lockdep_assert_held(&eb->refs_lock);
- WARN_ON(atomic_read(&eb->refs) == 0);
- if (atomic_dec_and_test(&eb->refs)) {
- if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
- struct btrfs_fs_info *fs_info = eb->fs_info;
+ if (refcount_dec_and_test(&eb->refs)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
- spin_unlock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
- spin_lock(&fs_info->buffer_lock);
- radix_tree_delete(&fs_info->buffer_radix,
- eb->start >> fs_info->sectorsize_bits);
- spin_unlock(&fs_info->buffer_lock);
- } else {
- spin_unlock(&eb->refs_lock);
- }
+ /*
+ * We're erasing, theoretically there will be no allocations, so
+ * just use GFP_ATOMIC.
+ *
+ * We use cmpxchg instead of erase because we do not know if
+ * this eb is actually in the tree or not, we could be cleaning
+ * up an eb that we allocated but never inserted into the tree.
+ * Thus use cmpxchg to remove it from the tree if it is there,
+ * or leave the other entry if this isn't in the tree.
+ *
+ * The documentation says that putting a NULL value is the same
+ * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't
+ * in this case.
+ */
+ xa_cmpxchg_irq(&fs_info->buffer_tree,
+ eb->start >> fs_info->nodesize_bits, eb, NULL,
+ GFP_ATOMIC);
btrfs_leak_debug_del_eb(eb);
/* Should be safe to release folios at this point. */
@@ -3284,22 +3526,26 @@ void free_extent_buffer(struct extent_buffer *eb)
if (!eb)
return;
- refs = atomic_read(&eb->refs);
+ refs = refcount_read(&eb->refs);
while (1) {
- if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
- || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
- refs == 1))
+ if (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)) {
+ if (refs == 1)
+ break;
+ } else if (refs <= 3) {
break;
- if (atomic_try_cmpxchg(&eb->refs, &refs, refs - 1))
+ }
+
+ /* Optimization to avoid locking eb->refs_lock. */
+ if (atomic_try_cmpxchg(&eb->refs.refs, &refs, refs - 1))
return;
}
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) == 2 &&
+ if (refcount_read(&eb->refs) == 2 &&
test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
!extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
/*
* I know this is terrible, but it's temporary until we stop tracking
@@ -3316,44 +3562,27 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
spin_lock(&eb->refs_lock);
set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
- if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+ if (refcount_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
- atomic_dec(&eb->refs);
+ refcount_dec(&eb->refs);
release_extent_buffer(eb);
}
-static void btree_clear_folio_dirty(struct folio *folio)
+static void btree_clear_folio_dirty_tag(struct folio *folio)
{
- ASSERT(folio_test_dirty(folio));
+ ASSERT(!folio_test_dirty(folio));
ASSERT(folio_test_locked(folio));
- folio_clear_dirty_for_io(folio);
xa_lock_irq(&folio->mapping->i_pages);
if (!folio_test_dirty(folio))
- __xa_clear_mark(&folio->mapping->i_pages,
- folio_index(folio), PAGECACHE_TAG_DIRTY);
+ __xa_clear_mark(&folio->mapping->i_pages, folio->index,
+ PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&folio->mapping->i_pages);
}
-static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
-{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- struct folio *folio = eb->folios[0];
- bool last;
-
- /* btree_clear_folio_dirty() needs page locked. */
- folio_lock(folio);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
- if (last)
- btree_clear_folio_dirty(folio);
- folio_unlock(folio);
- WARN_ON(atomic_read(&eb->refs) == 0);
-}
-
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios;
btrfs_assert_tree_write_locked(eb);
@@ -3377,58 +3606,56 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
+ buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY);
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len,
fs_info->dirty_metadata_batch);
- if (eb->fs_info->nodesize < PAGE_SIZE)
- return clear_subpage_extent_buffer_dirty(eb);
-
- num_folios = num_extent_folios(eb);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
+ bool last;
if (!folio_test_dirty(folio))
continue;
folio_lock(folio);
- btree_clear_folio_dirty(folio);
+ last = btrfs_meta_folio_clear_and_test_dirty(folio, eb);
+ if (last)
+ btree_clear_folio_dirty_tag(folio);
folio_unlock(folio);
}
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_folios = num_extent_folios(eb);
- WARN_ON(atomic_read(&eb->refs) == 0);
+ WARN_ON(refcount_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags));
if (!was_dirty) {
- bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
+ bool subpage = btrfs_meta_is_subpage(eb->fs_info);
/*
* For subpage case, we can have other extent buffers in the
- * same page, and in clear_subpage_extent_buffer_dirty() we
+ * same page, and in clear_extent_buffer_dirty() we
* have to clear page dirty without subpage lock held.
* This can cause race where our page gets dirty cleared after
* we just set it.
*
- * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * Thankfully, clear_extent_buffer_dirty() has locked
* its page for other reasons, we can use page lock to prevent
* the above race.
*/
if (subpage)
folio_lock(eb->folios[0]);
- for (int i = 0; i < num_folios; i++)
- btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
- eb->start, eb->len);
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_dirty(eb->folios[i], eb);
+ buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY);
if (subpage)
folio_unlock(eb->folios[0]);
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
@@ -3436,70 +3663,42 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (int i = 0; i < num_folios; i++)
+ for (int i = 0; i < num_extent_folios(eb); i++)
ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
+ for (int i = 0; i < num_extent_folios(eb); i++) {
struct folio *folio = eb->folios[i];
if (!folio)
continue;
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_clear_uptodate(folio);
- else
- btrfs_subpage_clear_uptodate(fs_info, folio,
- eb->start, eb->len);
+ btrfs_meta_folio_clear_uptodate(folio, eb);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
- struct btrfs_fs_info *fs_info = eb->fs_info;
- int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
-
- /*
- * This is special handling for metadata subpage, as regular
- * btrfs_is_subpage() can not handle cloned/dummy metadata.
- */
- if (fs_info->nodesize >= PAGE_SIZE)
- folio_mark_uptodate(folio);
- else
- btrfs_subpage_set_uptodate(fs_info, folio,
- eb->start, eb->len);
- }
+ for (int i = 0; i < num_extent_folios(eb); i++)
+ btrfs_meta_folio_set_uptodate(eb->folios[i], eb);
}
static void clear_extent_buffer_reading(struct extent_buffer *eb)
{
- clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
- smp_mb__after_atomic();
- wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
+ clear_and_wake_up_bit(EXTENT_BUFFER_READING, &eb->bflags);
}
static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
- struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct folio_iter fi;
- u32 bio_offset = 0;
/*
* If the extent buffer is marked UPTODATE before the read operation
@@ -3514,25 +3713,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio)
btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0)
uptodate = false;
- if (uptodate) {
+ if (uptodate)
set_extent_buffer_uptodate(eb);
- } else {
+ else
clear_extent_buffer_uptodate(eb);
- set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
- }
-
- bio_for_each_folio_all(fi, &bbio->bio) {
- struct folio *folio = fi.folio;
- u64 start = eb->start + bio_offset;
- u32 len = fi.length;
-
- if (uptodate)
- btrfs_folio_set_uptodate(fs_info, folio, start, len);
- else
- btrfs_folio_clear_uptodate(fs_info, folio, start, len);
-
- bio_offset += len;
- }
clear_extent_buffer_reading(eb);
free_extent_buffer(eb);
@@ -3544,7 +3728,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
const struct btrfs_tree_parent_check *check)
{
struct btrfs_bio *bbio;
- bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3572,10 +3755,9 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
return 0;
}
- clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
check_buffer_tree_ref(eb);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
@@ -3584,19 +3766,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num,
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
- if (eb->fs_info->nodesize < PAGE_SIZE) {
- ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
- eb->start - folio_pos(eb->folios[0]));
- ASSERT(ret);
- } else {
- int num_folios = num_extent_folios(eb);
-
- for (int i = 0; i < num_folios; i++) {
- struct folio *folio = eb->folios[i];
+ for (int i = 0; i < num_extent_folios(eb); i++) {
+ struct folio *folio = eb->folios[i];
+ u64 range_start = max_t(u64, eb->start, folio_pos(folio));
+ u32 range_len = min_t(u64, folio_end(folio),
+ eb->start + eb->len) - range_start;
- ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
- ASSERT(ret);
- }
+ bio_add_folio_nofail(&bbio->bio, folio, range_len,
+ offset_in_folio(folio, range_start));
}
btrfs_submit_bbio(bbio, mirror_num);
return 0;
@@ -3623,7 +3800,7 @@ static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
btrfs_warn(eb->fs_info,
"access to eb bytenr %llu len %u out of range start %lu len %lu",
eb->start, eb->len, start, len);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
return true;
}
@@ -3785,7 +3962,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
- if (fs_info->nodesize < PAGE_SIZE) {
+ if (btrfs_meta_is_subpage(fs_info)) {
folio = eb->folios[0];
ASSERT(i == 0);
if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
@@ -3971,8 +4148,8 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number to test
*/
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long nr)
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long nr)
{
unsigned long i;
size_t offset;
@@ -4159,82 +4336,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
}
}
-#define GANG_LOOKUP_SIZE 16
-static struct extent_buffer *get_next_extent_buffer(
- const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr)
-{
- struct extent_buffer *gang[GANG_LOOKUP_SIZE];
- struct extent_buffer *found = NULL;
- u64 folio_start = folio_pos(folio);
- u64 cur = folio_start;
-
- ASSERT(in_range(bytenr, folio_start, PAGE_SIZE));
- lockdep_assert_held(&fs_info->buffer_lock);
-
- while (cur < folio_start + PAGE_SIZE) {
- int ret;
- int i;
-
- ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
- (void **)gang, cur >> fs_info->sectorsize_bits,
- min_t(unsigned int, GANG_LOOKUP_SIZE,
- PAGE_SIZE / fs_info->nodesize));
- if (ret == 0)
- goto out;
- for (i = 0; i < ret; i++) {
- /* Already beyond page end */
- if (gang[i]->start >= folio_start + PAGE_SIZE)
- goto out;
- /* Found one */
- if (gang[i]->start >= bytenr) {
- found = gang[i];
- goto out;
- }
- }
- cur = gang[ret - 1]->start + gang[ret - 1]->len;
- }
-out:
- return found;
-}
-
static int try_release_subpage_extent_buffer(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- u64 cur = folio_pos(folio);
- const u64 end = cur + PAGE_SIZE;
+ struct extent_buffer *eb;
+ unsigned long start = (folio_pos(folio) >> fs_info->nodesize_bits);
+ unsigned long index = start;
+ unsigned long end = index + (PAGE_SIZE >> fs_info->nodesize_bits) - 1;
int ret;
- while (cur < end) {
- struct extent_buffer *eb = NULL;
-
- /*
- * Unlike try_release_extent_buffer() which uses folio private
- * to grab buffer, for subpage case we rely on radix tree, thus
- * we need to ensure radix tree consistency.
- *
- * We also want an atomic snapshot of the radix tree, thus go
- * with spinlock rather than RCU.
- */
- spin_lock(&fs_info->buffer_lock);
- eb = get_next_extent_buffer(fs_info, folio, cur);
- if (!eb) {
- /* No more eb in the page range after or at cur */
- spin_unlock(&fs_info->buffer_lock);
- break;
- }
- cur = eb->start + eb->len;
-
+ rcu_read_lock();
+ xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) {
/*
* The same as try_release_extent_buffer(), to ensure the eb
* won't disappear out from under us.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ rcu_read_unlock();
+
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&fs_info->buffer_lock);
- break;
+ rcu_read_lock();
+ continue;
}
- spin_unlock(&fs_info->buffer_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a
@@ -4252,7 +4376,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
+ rcu_read_lock();
}
+ rcu_read_unlock();
+
/*
* Finally to check if we have cleared folio private, as if we have
* released all ebs in the page, the folio private should be cleared now.
@@ -4264,14 +4391,13 @@ static int try_release_subpage_extent_buffer(struct folio *folio)
ret = 0;
spin_unlock(&folio->mapping->i_private_lock);
return ret;
-
}
int try_release_extent_buffer(struct folio *folio)
{
struct extent_buffer *eb;
- if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE)
+ if (btrfs_meta_is_subpage(folio_to_fs_info(folio)))
return try_release_subpage_extent_buffer(folio);
/*
@@ -4293,7 +4419,7 @@ int try_release_extent_buffer(struct folio *folio)
* this page.
*/
spin_lock(&eb->refs_lock);
- if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+ if (refcount_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
spin_unlock(&folio->mapping->i_private_lock);
return 0;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6c5328bfabc2..61130786b9a3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -38,16 +38,10 @@ struct btrfs_tree_parent_check;
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
- EXTENT_BUFFER_CORRUPT,
- /* this got triggered by readahead */
- EXTENT_BUFFER_READAHEAD,
EXTENT_BUFFER_TREE_REF,
EXTENT_BUFFER_STALE,
EXTENT_BUFFER_WRITEBACK,
- /* read IO error */
- EXTENT_BUFFER_READ_ERR,
EXTENT_BUFFER_UNMAPPED,
- EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
/* Indicate the extent buffer is written zeroed out (for zoned) */
@@ -79,7 +73,7 @@ enum {
* single word in a bitmap may straddle two pages in the extent buffer.
*/
#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)
-#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)
+#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1)
#define BITMAP_FIRST_BYTE_MASK(start) \
((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)
#define BITMAP_LAST_BYTE_MASK(nbits) \
@@ -104,7 +98,7 @@ struct extent_buffer {
void *addr;
spinlock_t refs_lock;
- atomic_t refs;
+ refcount_t refs;
int read_mirror;
/* >= 0 if eb belongs to a log tree, -1 otherwise */
s8 log_index;
@@ -246,14 +240,13 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc);
int btree_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc);
+void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
void btrfs_readahead(struct readahead_control *rac);
int set_folio_extent_mapped(struct folio *folio);
void clear_folio_extent_mapped(struct folio *folio);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level);
-struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src);
@@ -276,7 +269,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
void btrfs_readahead_node_child(struct extent_buffer *node, int slot);
-static inline int num_extent_pages(const struct extent_buffer *eb)
+/* Note: this can be used in for loops without caching the value in a variable. */
+static inline int __pure num_extent_pages(const struct extent_buffer *eb)
{
/*
* For sectorsize == PAGE_SIZE case, since nodesize is always aligned to
@@ -294,9 +288,13 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
* As we can have either one large folio covering the whole eb
* (either nodesize <= PAGE_SIZE, or high order folio), or multiple
* single-paged folios.
+ *
+ * Note: this can be used in for loops without caching the value in a variable.
*/
-static inline int num_extent_folios(const struct extent_buffer *eb)
+static inline int __pure num_extent_folios(const struct extent_buffer *eb)
{
+ if (!eb->folios[0])
+ return 0;
if (folio_order(eb->folios[0]))
return 1;
return num_extent_pages(eb);
@@ -347,8 +345,8 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
unsigned long len);
void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
unsigned long len);
-int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
- unsigned long pos);
+bool extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
+ unsigned long pos);
void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 67ce85ff0ae2..57f52585a6dd 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -13,7 +13,7 @@
static struct kmem_cache *extent_map_cache;
-int __init extent_map_init(void)
+int __init btrfs_extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0, 0, NULL);
@@ -22,7 +22,7 @@ int __init extent_map_init(void)
return 0;
}
-void __cold extent_map_exit(void)
+void __cold btrfs_extent_map_exit(void)
{
kmem_cache_destroy(extent_map_cache);
}
@@ -31,7 +31,7 @@ void __cold extent_map_exit(void)
* Initialize the extent tree @tree. Should be called for each new inode or
* other user of the extent_map interface.
*/
-void extent_map_tree_init(struct extent_map_tree *tree)
+void btrfs_extent_map_tree_init(struct extent_map_tree *tree)
{
tree->root = RB_ROOT;
INIT_LIST_HEAD(&tree->modified_extents);
@@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree)
* Allocate a new extent_map structure. The new structure is returned with a
* reference count of one and needs to be freed using free_extent_map()
*/
-struct extent_map *alloc_extent_map(void)
+struct extent_map *btrfs_alloc_extent_map(void)
{
struct extent_map *em;
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
@@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void)
* Drop the reference out on @em by one and free the structure if the reference
* count hits zero.
*/
-void free_extent_map(struct extent_map *em)
+void btrfs_free_extent_map(struct extent_map *em)
{
if (!em)
return;
if (refcount_dec_and_test(&em->refs)) {
- WARN_ON(extent_map_in_tree(em));
+ WARN_ON(btrfs_extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
kmem_cache_free(extent_map_cache, em);
}
@@ -84,7 +84,7 @@ static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
rb_erase(&em->rb_node, &inode->extent_tree.root);
RB_CLEAR_NODE(&em->rb_node);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
if (em->start < entry->start)
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ else if (em->start >= btrfs_extent_map_end(entry))
p = &(*p)->rb_right;
else
return -EEXIST;
}
orig_parent = parent;
- while (parent && em->start >= extent_map_end(entry)) {
+ while (parent && em->start >= btrfs_extent_map_end(entry)) {
parent = rb_next(parent);
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
- if (end > entry->start && em->start < extent_map_end(entry))
+ if (end > entry->start && em->start < btrfs_extent_map_end(entry))
return -EEXIST;
parent = orig_parent;
@@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
entry = rb_entry(parent, struct extent_map, rb_node);
}
if (parent)
- if (end > entry->start && em->start < extent_map_end(entry))
+ if (end > entry->start && em->start < btrfs_extent_map_end(entry))
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
@@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
* Search through the tree for an extent_map with a given offset. If it can't
* be found, try to find some neighboring extents
*/
-static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
- struct rb_node **prev_or_next_ret)
+static struct rb_node *tree_search(struct rb_root *root, u64 offset,
+ struct rb_node **prev_or_next_ret)
{
struct rb_node *n = root->rb_node;
struct rb_node *prev = NULL;
@@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
if (offset < entry->start)
n = n->rb_left;
- else if (offset >= extent_map_end(entry))
+ else if (offset >= btrfs_extent_map_end(entry))
n = n->rb_right;
else
return n;
}
orig_prev = prev;
- while (prev && offset >= extent_map_end(prev_entry)) {
+ while (prev && offset >= btrfs_extent_map_end(prev_entry)) {
prev = rb_next(prev);
prev_entry = rb_entry(prev, struct extent_map, rb_node);
}
@@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
static inline u64 extent_map_block_len(const struct extent_map *em)
{
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return em->disk_num_bytes;
return em->len;
}
static inline u64 extent_map_block_end(const struct extent_map *em)
{
- const u64 block_start = extent_map_block_start(em);
+ const u64 block_start = btrfs_extent_map_block_start(em);
const u64 block_end = block_start + extent_map_block_len(em);
if (block_end < block_start)
@@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em)
return false;
/* Don't merge compressed extents, we need to know their actual size. */
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return false;
if (em->flags & EXTENT_FLAG_LOGGING)
@@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em)
/* Check to see if two extent_map structs are adjacent and safe to merge. */
static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
{
- if (extent_map_end(prev) != next->start)
+ if (btrfs_extent_map_end(prev) != next->start)
return false;
/*
@@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
- return extent_map_block_start(next) == extent_map_block_end(prev);
+ return btrfs_extent_map_block_start(next) == extent_map_block_end(prev);
/* HOLES and INLINE extents. */
return next->disk_bytenr == prev->disk_bytenr;
@@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext
u64 new_offset;
/* @prev and @next should not be compressed. */
- ASSERT(!extent_map_is_compressed(prev));
- ASSERT(!extent_map_is_compressed(next));
+ ASSERT(!btrfs_extent_map_is_compressed(prev));
+ ASSERT(!btrfs_extent_map_is_compressed(next));
/*
* There are two different cases where @prev and @next can be merged.
@@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
if (em->offset + em->len > em->ram_bytes)
dump_extent_map(fs_info, "ram_bytes too small", em);
if (em->offset + em->len > em->disk_num_bytes &&
- !extent_map_is_compressed(em))
+ !btrfs_extent_map_is_compressed(em))
dump_extent_map(fs_info, "disk_num_bytes too small", em);
- if (!extent_map_is_compressed(em) &&
+ if (!btrfs_extent_map_is_compressed(em) &&
em->ram_bytes != em->disk_num_bytes)
dump_extent_map(fs_info,
"ram_bytes mismatch with disk_num_bytes for non-compressed em",
@@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
+ merge = rb_entry_safe(rb, struct extent_map, rb_node);
+
if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->len += merge->len;
@@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
validate_extent_map(fs_info, em);
remove_em(inode, merge);
- free_extent_map(merge);
+ btrfs_free_extent_map(merge);
}
}
rb = rb_next(&em->rb_node);
- if (rb)
- merge = rb_entry(rb, struct extent_map, rb_node);
+ merge = rb_entry_safe(rb, struct extent_map, rb_node);
+
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
@@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
remove_em(inode, merge);
- free_extent_map(merge);
+ btrfs_free_extent_map(merge);
}
}
@@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
* -ENOENT when the extent is not found in the tree
* -EUCLEAN if the found extent does not match the expected start
*/
-int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
+int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map_tree *tree = &inode->extent_tree;
@@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
struct extent_map *em;
write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, start, len);
+ em = btrfs_lookup_extent_mapping(tree, start, len);
if (WARN_ON(!em)) {
btrfs_warn(fs_info,
@@ -444,17 +444,17 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
out:
write_unlock(&tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
-void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
+void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em)
{
lockdep_assert_held_write(&inode->extent_tree.lock);
em->flags &= ~EXTENT_FLAG_LOGGING;
- if (extent_map_in_tree(em))
+ if (btrfs_extent_map_in_tree(em))
try_merge_map(inode, em);
}
@@ -502,22 +502,21 @@ static int add_extent_mapping(struct btrfs_inode *inode,
setup_extent_mapping(inode, em, modified);
- if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(root)))
+ if (!btrfs_is_testing(fs_info) && btrfs_is_fstree(btrfs_root_id(root)))
percpu_counter_inc(&fs_info->evictable_extent_maps);
return 0;
}
-static struct extent_map *
-__lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len, int strict)
+static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len, int strict)
{
struct extent_map *em;
struct rb_node *rb_node;
struct rb_node *prev_or_next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->root, start, &prev_or_next);
+ rb_node = tree_search(&tree->root, start, &prev_or_next);
if (!rb_node) {
if (prev_or_next)
rb_node = prev_or_next;
@@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
em = rb_entry(rb_node, struct extent_map, rb_node);
- if (strict && !(end > em->start && start < extent_map_end(em)))
+ if (strict && !(end > em->start && start < btrfs_extent_map_end(em)))
return NULL;
refcount_inc(&em->refs);
@@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
* intersect, so check the object returned carefully to make sure that no
* additional lookups are needed.
*/
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
{
- return __lookup_extent_mapping(tree, start, len, 1);
+ return lookup_extent_mapping(tree, start, len, 1);
}
/*
@@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
*
* If one can't be found, any nearby extent may be returned
*/
-struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len)
+struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len)
{
- return __lookup_extent_mapping(tree, start, len, 0);
+ return lookup_extent_mapping(tree, start, len, 0);
}
/*
@@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Remove @em from the extent tree of @inode. No reference counts are dropped,
* and no checks are done to see if the range is in use.
*/
-void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
+void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
{
struct extent_map_tree *tree = &inode->extent_tree;
@@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode,
validate_extent_map(fs_info, new);
WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
- ASSERT(extent_map_in_tree(cur));
+ ASSERT(btrfs_extent_map_in_tree(cur));
if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root);
@@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
u64 end;
u64 start_diff;
- if (map_start < em->start || map_start >= extent_map_end(em))
+ if (map_start < em->start || map_start >= btrfs_extent_map_end(em))
return -EINVAL;
if (existing->start > map_start) {
@@ -662,10 +661,10 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode,
next = next_extent_map(prev);
}
- start = prev ? extent_map_end(prev) : em->start;
+ start = prev ? btrfs_extent_map_end(prev) : em->start;
start = max_t(u64, start, em->start);
- end = next ? next->start : extent_map_end(em);
- end = min_t(u64, end, extent_map_end(em));
+ end = next ? next->start : btrfs_extent_map_end(em);
+ end = min_t(u64, end, btrfs_extent_map_end(em));
start_diff = start - em->start;
em->start = start;
em->len = end - start;
@@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
if (ret == -EEXIST) {
struct extent_map *existing;
- existing = search_extent_mapping(&inode->extent_tree, start, len);
+ existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
* extent causing the -EEXIST.
*/
if (start >= existing->start &&
- start < extent_map_end(existing)) {
- free_extent_map(em);
+ start < btrfs_extent_map_end(existing)) {
+ btrfs_free_extent_map(em);
*em_in = existing;
ret = 0;
} else {
@@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode,
*/
ret = merge_extent_mapping(inode, existing, em, start);
if (WARN_ON(ret)) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
*em_in = NULL;
btrfs_warn(fs_info,
"extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu",
- existing->start, extent_map_end(existing),
+ existing->start, btrfs_extent_map_end(existing),
orig_start, orig_start + orig_len, start);
}
- free_extent_map(existing);
+ btrfs_free_extent_map(existing);
}
}
@@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode)
em = rb_entry(node, struct extent_map, rb_node);
em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
- remove_extent_mapping(inode, em);
- free_extent_map(em);
+ btrfs_remove_extent_mapping(inode, em);
+ btrfs_free_extent_map(em);
if (cond_resched_rwlock_write(&tree->lock))
node = rb_first(&tree->root);
@@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
* range ends after our range (and they might be the same extent map),
* because we need to split those two extent maps at the boundaries.
*/
- split = alloc_extent_map();
- split2 = alloc_extent_map();
+ split = btrfs_alloc_extent_map();
+ split2 = btrfs_alloc_extent_map();
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
while (em) {
/* extent_map_end() returns exclusive value (last byte + 1). */
- const u64 em_end = extent_map_end(em);
+ const u64 em_end = btrfs_extent_map_end(em);
struct extent_map *next_em = NULL;
u64 gen;
unsigned long flags;
@@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
replace_extent_mapping(inode, em, split, modified);
- free_extent_map(split);
+ btrfs_free_extent_map(split);
split = split2;
split2 = NULL;
}
@@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->ram_bytes = split->len;
}
- if (extent_map_in_tree(em)) {
+ if (btrfs_extent_map_in_tree(em)) {
replace_extent_mapping(inode, em, split, modified);
} else {
int ret;
@@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
if (WARN_ON(ret != 0) && modified)
btrfs_set_inode_full_sync(inode);
}
- free_extent_map(split);
+ btrfs_free_extent_map(split);
split = NULL;
}
remove_em:
- if (extent_map_in_tree(em)) {
+ if (btrfs_extent_map_in_tree(em)) {
/*
* If the extent map is still in the tree it means that
* either of the following is true:
@@ -965,25 +964,25 @@ remove_em:
ASSERT(!split);
btrfs_set_inode_full_sync(inode);
}
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
}
/*
* Once for the tree reference (we replaced or removed the
* extent map from the tree).
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
/* Once for us (for our lookup reference). */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = next_em;
}
write_unlock(&em_tree->lock);
- free_extent_map(split);
- free_extent_map(split2);
+ btrfs_free_extent_map(split);
+ btrfs_free_extent_map(split2);
}
/*
@@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map_tree *tree = &inode->extent_tree;
int ret;
- ASSERT(!extent_map_in_tree(new_em));
+ ASSERT(!btrfs_extent_map_in_tree(new_em));
/*
* The caller has locked an appropriate file range in the inode's io
@@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
*
* This function is used when an ordered_extent needs to be split.
*/
-int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
- u64 new_logical)
+int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical)
{
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
@@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
ASSERT(pre != 0);
ASSERT(pre < len);
- split_pre = alloc_extent_map();
+ split_pre = btrfs_alloc_extent_map();
if (!split_pre)
return -ENOMEM;
- split_mid = alloc_extent_map();
+ split_mid = btrfs_alloc_extent_map();
if (!split_mid) {
ret = -ENOMEM;
goto out_free_pre;
}
- lock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL);
write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
if (!em) {
ret = -EIO;
goto out_unlock;
}
ASSERT(em->len == len);
- ASSERT(!extent_map_is_compressed(em));
+ ASSERT(!btrfs_extent_map_is_compressed(em));
ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE);
ASSERT(em->flags & EXTENT_FLAG_PINNED);
ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
@@ -1093,7 +1092,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
/* Insert the middle extent_map. */
split_mid->start = em->start + pre;
split_mid->len = em->len - pre;
- split_mid->disk_bytenr = extent_map_block_start(em) + pre;
+ split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre;
split_mid->disk_num_bytes = split_mid->len;
split_mid->offset = 0;
split_mid->ram_bytes = split_mid->len;
@@ -1102,16 +1101,16 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
add_extent_mapping(inode, split_mid, 1);
/* Once for us */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Once for the tree */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock:
write_unlock(&em_tree->lock);
- unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
- free_extent_map(split_mid);
+ btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL);
+ btrfs_free_extent_map(split_mid);
out_free_pre:
- free_extent_map(split_pre);
+ btrfs_free_extent_map(split_pre);
return ret;
}
@@ -1128,6 +1127,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
long nr_dropped = 0;
struct rb_node *node;
+ lockdep_assert_held_write(&tree->lock);
+
/*
* Take the mmap lock so that we serialize with the inode logging phase
* of fsync because we may need to set the full sync flag on the inode,
@@ -1139,28 +1140,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
* to find new extents, which may not be there yet because ordered
* extents haven't completed yet.
*
- * We also do a try lock because otherwise we could deadlock. This is
- * because the shrinker for this filesystem may be invoked while we are
- * in a path that is holding the mmap lock in write mode. For example in
- * a reflink operation while COWing an extent buffer, when allocating
- * pages for a new extent buffer and under memory pressure, the shrinker
- * may be invoked, and therefore we would deadlock by attempting to read
- * lock the mmap lock while we are holding already a write lock on it.
+ * We also do a try lock because we don't want to block for too long and
+ * we are holding the extent map tree's lock in write mode.
*/
if (!down_read_trylock(&inode->i_mmap_lock))
return 0;
- /*
- * We want to be fast so if the lock is busy we don't want to spend time
- * waiting for it - either some task is about to do IO for the inode or
- * we may have another task shrinking extent maps, here in this code, so
- * skip this inode.
- */
- if (!write_trylock(&tree->lock)) {
- up_read(&inode->i_mmap_lock);
- return 0;
- }
-
node = rb_first(&tree->root);
while (node) {
struct rb_node *next = rb_next(node);
@@ -1182,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c
if (!list_empty(&em->list) && em->generation >= cur_fs_gen)
btrfs_set_inode_full_sync(inode);
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
trace_btrfs_extent_map_shrinker_remove_em(inode, em);
/* Drop the reference for the tree. */
- free_extent_map(em);
+ btrfs_free_extent_map(em);
nr_dropped++;
next:
if (ctx->scanned >= ctx->nr_to_scan)
@@ -1201,12 +1186,61 @@ next:
break;
node = next;
}
- write_unlock(&tree->lock);
up_read(&inode->i_mmap_lock);
return nr_dropped;
}
+static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root,
+ u64 min_ino)
+{
+ struct btrfs_inode *inode;
+ unsigned long from = min_ino;
+
+ xa_lock(&root->inodes);
+ while (true) {
+ struct extent_map_tree *tree;
+
+ inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
+ if (!inode)
+ break;
+
+ tree = &inode->extent_tree;
+
+ /*
+ * We want to be fast so if the lock is busy we don't want to
+ * spend time waiting for it (some task is about to do IO for
+ * the inode).
+ */
+ if (!write_trylock(&tree->lock))
+ goto next;
+
+ /*
+ * Skip inode if it doesn't have loaded extent maps, so we avoid
+ * getting a reference and doing an iput later. This includes
+ * cases like files that were opened for things like stat(2), or
+ * files with all extent maps previously released through the
+ * release folio callback (btrfs_release_folio()) or released in
+ * a previous run, or directories which never have extent maps.
+ */
+ if (RB_EMPTY_ROOT(&tree->root)) {
+ write_unlock(&tree->lock);
+ goto next;
+ }
+
+ if (igrab(&inode->vfs_inode))
+ break;
+
+ write_unlock(&tree->lock);
+next:
+ from = btrfs_ino(inode) + 1;
+ cond_resched_lock(&root->inodes.xa_lock);
+ }
+ xa_unlock(&root->inodes);
+
+ return inode;
+}
+
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1214,21 +1248,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
long nr_dropped = 0;
u64 min_ino = fs_info->em_shrinker_last_ino + 1;
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
+ write_unlock(&inode->extent_tree.lock);
min_ino = btrfs_ino(inode) + 1;
fs_info->em_shrinker_last_ino = btrfs_ino(inode);
- btrfs_add_delayed_iput(inode);
+ iput(&inode->vfs_inode);
- if (ctx->scanned >= ctx->nr_to_scan ||
- btrfs_fs_closing(inode->root->fs_info))
+ if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info))
break;
cond_resched();
- inode = btrfs_find_first_inode(root, min_ino);
+ inode = find_first_inode_to_shrink(root, min_ino);
}
if (inode) {
@@ -1303,7 +1337,7 @@ static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
if (!root)
continue;
- if (is_fstree(btrfs_root_id(root)))
+ if (btrfs_is_fstree(btrfs_root_id(root)))
nr_dropped += btrfs_scan_root(root, &ctx);
btrfs_put_root(root);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index cd123b266b64..d4b81ee4d97b 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -108,8 +108,8 @@ struct extent_map_tree {
struct btrfs_inode;
-static inline void extent_map_set_compression(struct extent_map *em,
- enum btrfs_compression_type type)
+static inline void btrfs_extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
{
if (type == BTRFS_COMPRESS_ZLIB)
em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
@@ -119,7 +119,8 @@ static inline void extent_map_set_compression(struct extent_map *em,
em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
}
-static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+static inline enum btrfs_compression_type btrfs_extent_map_compression(
+ const struct extent_map *em)
{
if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
return BTRFS_COMPRESS_ZLIB;
@@ -137,50 +138,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex
* More efficient way to determine if extent is compressed, instead of using
* 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
*/
-static inline bool extent_map_is_compressed(const struct extent_map *em)
+static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em)
{
return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
EXTENT_FLAG_COMPRESS_LZO |
EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
}
-static inline int extent_map_in_tree(const struct extent_map *em)
+static inline int btrfs_extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_block_start(const struct extent_map *em)
+static inline u64 btrfs_extent_map_block_start(const struct extent_map *em)
{
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) {
- if (extent_map_is_compressed(em))
+ if (btrfs_extent_map_is_compressed(em))
return em->disk_bytenr;
return em->disk_bytenr + em->offset;
}
return em->disk_bytenr;
}
-static inline u64 extent_map_end(const struct extent_map *em)
+static inline u64 btrfs_extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
-void extent_map_tree_init(struct extent_map_tree *tree);
-struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len);
-void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
-int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
- u64 new_logical);
-
-struct extent_map *alloc_extent_map(void);
-void free_extent_map(struct extent_map *em);
-int __init extent_map_init(void);
-void __cold extent_map_exit(void);
-int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
-void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
-struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
- u64 start, u64 len);
+void btrfs_extent_map_tree_init(struct extent_map_tree *tree);
+struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
+void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em);
+int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
+ u64 new_logical);
+
+struct extent_map *btrfs_alloc_extent_map(void);
+void btrfs_free_extent_map(struct extent_map *em);
+int __init btrfs_extent_map_init(void);
+void __cold btrfs_extent_map_exit(void);
+int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
+void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em);
+struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree,
+ u64 start, u64 len);
int btrfs_add_extent_mapping(struct btrfs_inode *inode,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index b80c07ad8c5e..7935586a9dbd 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -320,7 +320,7 @@ static int fiemap_next_leaf_item(struct btrfs_inode *inode, struct btrfs_path *p
* the cost of allocating a new one.
*/
ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, &clone->bflags));
- atomic_inc(&clone->refs);
+ refcount_inc(&clone->refs);
ret = btrfs_next_leaf(inode->root, path);
if (ret != 0)
@@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode,
const u64 ino = btrfs_ino(inode);
struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end = 0;
@@ -661,7 +661,7 @@ restart:
range_end = round_up(start + len, sectorsize);
prev_extent_end = range_start;
- lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state);
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
@@ -841,7 +841,7 @@ check_eof_delalloc:
}
out_unlock:
- unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state);
if (ret == BTRFS_FIEMAP_FLUSH_CACHE) {
btrfs_release_path(path);
@@ -871,10 +871,9 @@ out_unlock:
ret = emit_last_fiemap_cache(fieinfo, &cache);
out:
- free_extent_state(delalloc_cached_state);
+ btrfs_free_extent_state(delalloc_cached_state);
kfree(cache.entries);
btrfs_free_backref_share_ctx(backref_ctx);
- btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d04a3b47b1fb..c09fbc257634 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -46,7 +46,7 @@
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size)
{
u64 start, end, i_size;
- int ret;
+ bool found;
spin_lock(&inode->lock);
i_size = new_i_size ?: i_size_read(&inode->vfs_inode);
@@ -55,9 +55,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
- &end, EXTENT_DIRTY);
- if (!ret && start == 0)
+ found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
+ &end, EXTENT_DIRTY);
+ if (found && start == 0)
i_size = min(i_size, end + 1);
else
i_size = 0;
@@ -91,8 +91,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize));
- return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
- EXTENT_DIRTY, NULL);
+ return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1,
+ EXTENT_DIRTY, NULL);
}
/*
@@ -121,8 +121,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) ||
len == (u64)-1);
- return clear_extent_bit(inode->file_extent_tree, start,
- start + len - 1, EXTENT_DIRTY, NULL);
+ return btrfs_clear_extent_bit(inode->file_extent_tree, start,
+ start + len - 1, EXTENT_DIRTY, NULL);
}
static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
@@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
int ret = 0;
struct btrfs_file_extent_item *item;
struct btrfs_key file_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+
file_key.objectid = objectid;
- file_key.offset = pos;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = pos;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, item, 0);
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
int csums_in_item;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
if (ret < 0)
goto fail;
@@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int cow = mod != 0;
file_key.objectid = objectid;
- file_key.offset = offset;
file_key.type = BTRFS_EXTENT_DATA_KEY;
+ file_key.offset = offset;
return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
}
@@ -336,23 +336,23 @@ out:
*
* Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise.
*/
-blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
+int btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
{
struct btrfs_inode *inode = bbio->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct bio *bio = &bbio->bio;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
const u32 sectorsize = fs_info->sectorsize;
const u32 csum_size = fs_info->csum_size;
u32 orig_len = bio->bi_iter.bi_size;
u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits;
- blk_status_t ret = BLK_STS_OK;
+ int ret = 0;
u32 bio_offset = 0;
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state))
- return BLK_STS_OK;
+ return 0;
/*
* This function is only called for read bio.
@@ -369,14 +369,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
ASSERT(bio_op(bio) == REQ_OP_READ);
path = btrfs_alloc_path();
if (!path)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS);
- if (!bbio->csum) {
- btrfs_free_path(path);
- return BLK_STS_RESOURCE;
- }
+ if (!bbio->csum)
+ return -ENOMEM;
} else {
bbio->csum = bbio->csum_inline;
}
@@ -408,7 +406,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
count = search_csum_tree(fs_info, path, cur_disk_bytenr,
orig_len - bio_offset, csum_dst);
if (count < 0) {
- ret = errno_to_blk_status(count);
+ ret = count;
if (bbio->csum != bbio->csum_inline)
kfree(bbio->csum);
bbio->csum = NULL;
@@ -429,12 +427,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
memset(csum_dst, 0, csum_size);
count = 1;
- if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) {
+ if (btrfs_is_data_reloc_root(inode->root)) {
u64 file_offset = bbio->file_offset + bio_offset;
- set_extent_bit(&inode->io_tree, file_offset,
- file_offset + sectorsize - 1,
- EXTENT_NODATASUM, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, file_offset,
+ file_offset + sectorsize - 1,
+ EXTENT_NODATASUM, NULL);
} else {
btrfs_warn_rl(fs_info,
"csum hole found for disk bytenr range [%llu, %llu)",
@@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio)
bio_offset += count * sectorsize;
}
- btrfs_free_path(path);
return ret;
}
@@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
path->nowait = nowait;
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = start;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -738,7 +735,7 @@ fail:
/*
* Calculate checksums of the data contained inside a bio.
*/
-blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
+int btrfs_csum_one_bio(struct btrfs_bio *bbio)
{
struct btrfs_ordered_extent *ordered = bbio->ordered;
struct btrfs_inode *inode = bbio->inode;
@@ -760,7 +757,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
memalloc_nofs_restore(nofs_flag);
if (!sums)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
@@ -797,11 +794,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
* record the updated logical address on Zone Append completion.
* Allocate just the structure with an empty sums array here for that case.
*/
-blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
+int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
{
bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
if (!bbio->sums)
- return BLK_STS_RESOURCE;
+ return -ENOMEM;
bbio->sums->len = bbio->bio.bi_iter.bi_size;
bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_add_ordered_sum(bbio->ordered, bbio->sums);
@@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
u64 end_byte = bytenr + len;
u64 csum_end;
@@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
while (1) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = end_byte - 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- btrfs_free_path(path);
return ret;
}
@@ -1052,7 +1048,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key file_key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_csum_item *item;
struct btrfs_csum_item *item_end;
struct extent_buffer *leaf = NULL;
@@ -1074,8 +1070,8 @@ again:
found_next = 0;
bytenr = sums->logical + total_bytes;
file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
- file_key.offset = bytenr;
file_key.type = BTRFS_EXTENT_CSUM_KEY;
+ file_key.offset = bytenr;
item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
if (!IS_ERR(item)) {
@@ -1263,7 +1259,6 @@ found:
goto again;
}
out:
- btrfs_free_path(path);
return ret;
}
@@ -1301,7 +1296,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
em->offset = btrfs_file_extent_offset(leaf, fi);
if (compress_type != BTRFS_COMPRESS_NONE) {
- extent_map_set_compression(em, compress_type);
+ btrfs_extent_map_set_compression(em, compress_type);
} else {
/*
* Older kernels can create regular non-hole data
@@ -1321,7 +1316,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->start = 0;
em->len = fs_info->sectorsize;
em->offset = 0;
- extent_map_set_compression(em, compress_type);
+ btrfs_extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 0e13661a71f3..63216c43676d 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -3,8 +3,10 @@
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
+#include <linux/blk_types.h>
#include <linux/list.h>
#include <uapi/linux/btrfs_tree.h>
+#include "ctree.h"
#include "accessors.h"
struct extent_map;
@@ -51,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
-blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
+int btrfs_lookup_bio_sums(struct btrfs_bio *bbio);
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 pos,
u64 num_bytes);
@@ -62,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
-blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
-blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
+int btrfs_csum_one_bio(struct btrfs_bio *bbio);
+int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 36f51c311bb1..204674934795 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -89,8 +89,7 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
- ASSERT(folio_pos(folio) <= pos &&
- folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
+ ASSERT(folio_pos(folio) <= pos && folio_end(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -98,9 +97,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos
* The pages may have already been dirty, clear out old accounting so
* we can set things up properly
*/
- clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- cached);
+ btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ cached);
ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
extra_bits, cached);
@@ -508,20 +507,19 @@ out:
return ret;
}
-static int extent_mergeable(struct extent_buffer *leaf, int slot,
- u64 objectid, u64 bytenr, u64 orig_offset,
- u64 *start, u64 *end)
+static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid,
+ u64 bytenr, u64 orig_offset, u64 *start, u64 *end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 extent_end;
if (slot < 0 || slot >= btrfs_header_nritems(leaf))
- return 0;
+ return false;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
+ return false;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
@@ -530,15 +528,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
btrfs_file_extent_compression(leaf, fi) ||
btrfs_file_extent_encryption(leaf, fi) ||
btrfs_file_extent_other_encoding(leaf, fi))
- return 0;
+ return false;
extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
if ((*start && *start != key.offset) || (*end && *end != extent_end))
- return 0;
+ return false;
*start = key.offset;
*end = extent_end;
- return 1;
+ return true;
}
/*
@@ -553,7 +551,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
struct extent_buffer *leaf;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_file_extent_item *fi;
struct btrfs_ref ref = { 0 };
struct btrfs_key key;
@@ -791,7 +789,6 @@ again:
}
}
out:
- btrfs_free_path(path);
return ret;
}
@@ -800,18 +797,18 @@ out:
* On success return a locked folio and 0
*/
static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
- u64 len, bool force_uptodate)
+ u64 len)
{
u64 clamp_start = max_t(u64, pos, folio_pos(folio));
- u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_end(folio));
+ const u32 blocksize = inode_to_fs_info(inode)->sectorsize;
int ret = 0;
if (folio_test_uptodate(folio))
return 0;
- if (!force_uptodate &&
- IS_ALIGNED(clamp_start, PAGE_SIZE) &&
- IS_ALIGNED(clamp_end, PAGE_SIZE))
+ if (IS_ALIGNED(clamp_start, blocksize) &&
+ IS_ALIGNED(clamp_end, blocksize))
return 0;
ret = btrfs_read_folio(NULL, folio);
@@ -857,33 +854,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
*/
static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
loff_t pos, size_t write_bytes,
- bool force_uptodate, bool nowait)
+ bool nowait)
{
- unsigned long index = pos >> PAGE_SHIFT;
+ const pgoff_t index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) |
+ fgf_set_order(write_bytes);
struct folio *folio;
int ret = 0;
again:
folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
- if (IS_ERR(folio)) {
- if (nowait)
- ret = -EAGAIN;
- else
- ret = PTR_ERR(folio);
- return ret;
- }
- folio_wait_writeback(folio);
- /* Only support page sized folio yet. */
- ASSERT(folio_order(folio) == 0);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+
ret = set_folio_extent_mapped(folio);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
return ret;
}
- ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+ ret = prepare_uptodate_folio(inode, folio, pos, write_bytes);
if (ret) {
/* The folio is already unlocked. */
folio_put(folio);
@@ -924,14 +915,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
struct btrfs_ordered_extent *ordered;
if (nowait) {
- if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
- cached_state)) {
+ if (!btrfs_try_lock_extent(&inode->io_tree, start_pos,
+ last_pos, cached_state)) {
folio_unlock(folio);
folio_put(folio);
return -EAGAIN;
}
} else {
- lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
+ btrfs_lock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
}
ordered = btrfs_lookup_ordered_range(inode, start_pos,
@@ -939,8 +931,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
if (ordered &&
ordered->file_offset + ordered->num_bytes > start_pos &&
ordered->file_offset <= last_pos) {
- unlock_extent(&inode->io_tree, start_pos, last_pos,
- cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos,
+ cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -970,6 +962,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* @pos: File offset.
* @write_bytes: The length to write, will be updated to the nocow writeable
* range.
+ * @nowait: Indicate if we can block or not (non-blocking IO context).
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
@@ -978,7 +971,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
* -EAGAIN If we can't do a nocow write because snapshoting of the inode's
- * root is in progress.
+ * root is in progress or because we are in a non-blocking IO
+ * context and need to block (@nowait is true).
* < 0 If an error happened.
*
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
@@ -990,8 +984,8 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
struct btrfs_root *root = inode->root;
struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
- u64 num_bytes;
- int ret;
+ u64 cur_offset;
+ int ret = 0;
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
@@ -1002,7 +996,6 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
lockstart = round_down(pos, fs_info->sectorsize);
lockend = round_up(pos + *write_bytes,
fs_info->sectorsize) - 1;
- num_bytes = lockend - lockstart + 1;
if (nowait) {
if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
@@ -1014,14 +1007,35 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
- ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, nowait);
- if (ret <= 0)
- btrfs_drew_write_unlock(&root->snapshot_lock);
- else
- *write_bytes = min_t(size_t, *write_bytes ,
- num_bytes - pos + lockstart);
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ cur_offset = lockstart;
+ while (cur_offset < lockend) {
+ u64 num_bytes = lockend - cur_offset + 1;
+
+ ret = can_nocow_extent(inode, cur_offset, &num_bytes, NULL, nowait);
+ if (ret <= 0) {
+ /*
+ * If cur_offset == lockstart it means we haven't found
+ * any extent against which we can NOCOW, so unlock the
+ * snapshot lock.
+ */
+ if (cur_offset == lockstart)
+ btrfs_drew_write_unlock(&root->snapshot_lock);
+ break;
+ }
+ cur_offset += num_bytes;
+ }
+
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+
+ /*
+ * cur_offset > lockstart means there's at least a partial range we can
+ * NOCOW, and that range can cover one or more extents.
+ */
+ if (cur_offset > lockstart) {
+ *write_bytes = min_t(size_t, *write_bytes, cur_offset - pos);
+ return 1;
+ }
return ret;
}
@@ -1039,7 +1053,6 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
loff_t pos = iocb->ki_pos;
int ret;
loff_t oldsize;
- loff_t start_pos;
/*
* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
@@ -1066,9 +1079,8 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
inode_inc_iversion(inode);
}
- start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
- if (start_pos > oldsize) {
+ if (pos > oldsize) {
/* Expand hole size to cover write data, preventing empty gap */
loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
@@ -1080,234 +1092,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count)
return 0;
}
-ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
+static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved,
+ u64 start, u64 len, bool only_release_metadata)
{
- struct file *file = iocb->ki_filp;
- loff_t pos;
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_changeset *data_reserved = NULL;
- u64 release_bytes = 0;
- u64 lockstart;
- u64 lockend;
- size_t num_written = 0;
- ssize_t ret;
- loff_t old_isize = i_size_read(inode);
- unsigned int ilock_flags = 0;
- const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
- unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
- bool only_release_metadata = false;
+ if (len == 0)
+ return;
- if (nowait)
- ilock_flags |= BTRFS_ILOCK_TRY;
+ if (only_release_metadata) {
+ btrfs_check_nocow_unlock(inode);
+ btrfs_delalloc_release_metadata(inode, len, true);
+ } else {
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
- ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
- if (ret < 0)
- return ret;
+ btrfs_delalloc_release_space(inode, data_reserved,
+ round_down(start, fs_info->sectorsize),
+ len, true);
+ }
+}
- ret = generic_write_checks(iocb, i);
- if (ret <= 0)
- goto out;
+/*
+ * Reserve data and metadata space for this buffered write range.
+ *
+ * Return >0 for the number of bytes reserved, which is always block aligned.
+ * Return <0 for error.
+ */
+static ssize_t reserve_space(struct btrfs_inode *inode,
+ struct extent_changeset **data_reserved,
+ u64 start, size_t *len, bool nowait,
+ bool *only_release_metadata)
+{
+ const struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const unsigned int block_offset = (start & (fs_info->sectorsize - 1));
+ size_t reserve_bytes;
+ int ret;
- ret = btrfs_write_check(iocb, ret);
- if (ret < 0)
- goto out;
+ ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait);
+ if (ret < 0) {
+ int can_nocow;
- pos = iocb->ki_pos;
- while (iov_iter_count(i) > 0) {
- struct extent_state *cached_state = NULL;
- size_t offset = offset_in_page(pos);
- size_t sector_offset;
- size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
- size_t reserve_bytes;
- size_t copied;
- size_t dirty_sectors;
- size_t num_sectors;
- struct folio *folio = NULL;
- int extents_locked;
- bool force_page_uptodate = false;
+ if (nowait && (ret == -ENOSPC || ret == -EAGAIN))
+ return -EAGAIN;
/*
- * Fault pages before locking them in prepare_one_folio()
- * to avoid recursive lock
+ * If we don't have to COW at the offset, reserve metadata only.
+ * write_bytes may get smaller than requested here.
*/
- if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
- ret = -EFAULT;
- break;
- }
+ can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait);
+ if (can_nocow < 0)
+ ret = can_nocow;
+ if (can_nocow > 0)
+ ret = 0;
+ if (ret)
+ return ret;
+ *only_release_metadata = true;
+ }
- only_release_metadata = false;
- sector_offset = pos & (fs_info->sectorsize - 1);
+ reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize);
+ WARN_ON(reserve_bytes == 0);
+ ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes,
+ reserve_bytes, nowait);
+ if (ret) {
+ if (!*only_release_metadata)
+ btrfs_free_reserved_data_space(inode, *data_reserved,
+ start, *len);
+ else
+ btrfs_check_nocow_unlock(inode);
- extent_changeset_release(data_reserved);
- ret = btrfs_check_data_free_space(BTRFS_I(inode),
- &data_reserved, pos,
- write_bytes, nowait);
- if (ret < 0) {
- int can_nocow;
+ if (nowait && ret == -ENOSPC)
+ ret = -EAGAIN;
+ return ret;
+ }
+ return reserve_bytes;
+}
- if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
- ret = -EAGAIN;
- break;
- }
+/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */
+static void shrink_reserved_space(struct btrfs_inode *inode,
+ struct extent_changeset *data_reserved,
+ u64 reserved_start, u64 reserved_len,
+ u64 new_len, bool only_release_metadata)
+{
+ const u64 diff = reserved_len - new_len;
- /*
- * If we don't have to COW at the offset, reserve
- * metadata only. write_bytes may get smaller than
- * requested here.
- */
- can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
- &write_bytes, nowait);
- if (can_nocow < 0)
- ret = can_nocow;
- if (can_nocow > 0)
- ret = 0;
- if (ret)
- break;
- only_release_metadata = true;
- }
+ ASSERT(new_len <= reserved_len);
+ btrfs_delalloc_shrink_extents(inode, reserved_len, new_len);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, diff, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ reserved_start + new_len, diff, true);
+}
- reserve_bytes = round_up(write_bytes + sector_offset,
- fs_info->sectorsize);
- WARN_ON(reserve_bytes == 0);
- ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
- reserve_bytes,
- reserve_bytes, nowait);
- if (ret) {
- if (!only_release_metadata)
- btrfs_free_reserved_data_space(BTRFS_I(inode),
- data_reserved, pos,
- write_bytes);
- else
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+/* Calculate the maximum amount of bytes we can write into one folio. */
+static size_t calc_write_bytes(const struct btrfs_inode *inode,
+ const struct iov_iter *iter, u64 start)
+{
+ const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping);
- if (nowait && ret == -ENOSPC)
- ret = -EAGAIN;
- break;
- }
+ return min(max_folio_size - (start & (max_folio_size - 1)),
+ iov_iter_count(iter));
+}
+
+/*
+ * Do the heavy-lifting work to copy one range into one folio of the page cache.
+ *
+ * Return > 0 in case we copied all bytes or just some of them.
+ * Return 0 if no bytes were copied, in which case the caller should retry.
+ * Return <0 on error.
+ */
+static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter,
+ struct extent_changeset **data_reserved, u64 start,
+ bool nowait)
+{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_state *cached_state = NULL;
+ size_t write_bytes = calc_write_bytes(inode, iter, start);
+ size_t copied;
+ const u64 reserved_start = round_down(start, fs_info->sectorsize);
+ u64 reserved_len;
+ struct folio *folio = NULL;
+ int extents_locked;
+ u64 lockstart;
+ u64 lockend;
+ bool only_release_metadata = false;
+ const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+ int ret;
+
+ /*
+ * Fault all pages before locking them in prepare_one_folio() to avoid
+ * recursive lock.
+ */
+ if (unlikely(fault_in_iov_iter_readable(iter, write_bytes)))
+ return -EFAULT;
+ extent_changeset_release(*data_reserved);
+ ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait,
+ &only_release_metadata);
+ if (ret < 0)
+ return ret;
+ reserved_len = ret;
+ /* Write range must be inside the reserved range. */
+ ASSERT(reserved_start <= start);
+ ASSERT(start + write_bytes <= reserved_start + reserved_len);
- release_bytes = reserve_bytes;
again:
- ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
- break;
- }
+ ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping,
+ bdp_flags);
+ if (ret) {
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
- ret = prepare_one_folio(inode, &folio, pos, write_bytes,
- force_page_uptodate, false);
- if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
- break;
- }
+ ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false);
+ if (ret) {
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
- extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
- folio, pos, write_bytes, &lockstart,
- &lockend, nowait, &cached_state);
- if (extents_locked < 0) {
- if (!nowait && extents_locked == -EAGAIN)
- goto again;
+ /*
+ * The reserved range goes beyond the current folio, shrink the reserved
+ * space to the folio boundary.
+ */
+ if (reserved_start + reserved_len > folio_end(folio)) {
+ const u64 last_block = folio_end(folio);
+
+ shrink_reserved_space(inode, *data_reserved, reserved_start,
+ reserved_len, last_block - reserved_start,
+ only_release_metadata);
+ write_bytes = last_block - start;
+ reserved_len = last_block - reserved_start;
+ }
+
+ extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start,
+ write_bytes, &lockstart,
+ &lockend, nowait,
+ &cached_state);
+ if (extents_locked < 0) {
+ if (!nowait && extents_locked == -EAGAIN)
+ goto again;
- btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes);
- ret = extents_locked;
- break;
- }
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ ret = extents_locked;
+ return ret;
+ }
+
+ copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start),
+ write_bytes, iter);
+ flush_dcache_folio(folio);
- copied = copy_folio_from_iter_atomic(folio,
- offset_in_folio(folio, pos), write_bytes, i);
- flush_dcache_folio(folio);
+ if (unlikely(copied < write_bytes)) {
+ u64 last_block;
/*
- * If we get a partial write, we can end up with partially
- * uptodate page. Although if sector size < page size we can
- * handle it, but if it's not sector aligned it can cause
- * a lot of complexity, so make sure they don't happen by
- * forcing retry this copy.
+ * The original write range doesn't need an uptodate folio as
+ * the range is block aligned. But now a short copy happened.
+ * We cannot handle it without an uptodate folio.
+ *
+ * So just revert the range and we will retry.
*/
- if (unlikely(copied < write_bytes)) {
- if (!folio_test_uptodate(folio)) {
- iov_iter_revert(i, copied);
- copied = 0;
- }
+ if (!folio_test_uptodate(folio)) {
+ iov_iter_revert(iter, copied);
+ copied = 0;
}
- num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
- dirty_sectors = round_up(copied + sector_offset,
- fs_info->sectorsize);
- dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
-
+ /* No copied bytes, unlock, release reserved space and exit. */
if (copied == 0) {
- force_page_uptodate = true;
- dirty_sectors = 0;
- } else {
- force_page_uptodate = false;
+ if (extents_locked)
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend,
+ &cached_state);
+ else
+ btrfs_free_extent_state(cached_state);
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ return 0;
}
- if (num_sectors > dirty_sectors) {
- /* release everything except the sectors we dirtied */
- release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
- if (only_release_metadata) {
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes, true);
- } else {
- u64 release_start = round_up(pos + copied,
- fs_info->sectorsize);
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, release_start,
- release_bytes, true);
- }
- }
+ /* Release the reserved space beyond the last block. */
+ last_block = round_up(start + copied, fs_info->sectorsize);
- release_bytes = round_up(copied + sector_offset,
- fs_info->sectorsize);
+ shrink_reserved_space(inode, *data_reserved, reserved_start,
+ reserved_len, last_block - reserved_start,
+ only_release_metadata);
+ reserved_len = last_block - reserved_start;
+ }
- ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
- &cached_state, only_release_metadata);
+ ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state,
+ only_release_metadata);
+ /*
+ * If we have not locked the extent range, because the range's start
+ * offset is >= i_size, we might still have a non-NULL cached extent
+ * state, acquired while marking the extent range as delalloc through
+ * btrfs_dirty_page(). Therefore free any possible cached extent state
+ * to avoid a memory leak.
+ */
+ if (extents_locked)
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ else
+ btrfs_free_extent_state(cached_state);
- /*
- * If we have not locked the extent range, because the range's
- * start offset is >= i_size, we might still have a non-NULL
- * cached extent state, acquired while marking the extent range
- * as delalloc through btrfs_dirty_page(). Therefore free any
- * possible cached extent state to avoid a memory leak.
- */
- if (extents_locked)
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
- else
- free_extent_state(cached_state);
+ btrfs_delalloc_release_extents(inode, reserved_len);
+ if (ret) {
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ release_space(inode, *data_reserved, reserved_start, reserved_len,
+ only_release_metadata);
+ return ret;
+ }
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
- if (ret) {
- btrfs_drop_folio(fs_info, folio, pos, copied);
- break;
- }
+ btrfs_drop_folio(fs_info, folio, start, copied);
+ return copied;
+}
- release_bytes = 0;
- if (only_release_metadata)
- btrfs_check_nocow_unlock(BTRFS_I(inode));
+ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ loff_t pos;
+ struct inode *inode = file_inode(file);
+ struct extent_changeset *data_reserved = NULL;
+ size_t num_written = 0;
+ ssize_t ret;
+ loff_t old_isize;
+ unsigned int ilock_flags = 0;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
- btrfs_drop_folio(fs_info, folio, pos, copied);
+ if (nowait)
+ ilock_flags |= BTRFS_ILOCK_TRY;
- cond_resched();
+ ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
+ if (ret < 0)
+ return ret;
- pos += copied;
- num_written += copied;
- }
+ /*
+ * We can only trust the isize with inode lock held, or it can race with
+ * other buffered writes and cause incorrect call of
+ * pagecache_isize_extended() to overwrite existing data.
+ */
+ old_isize = i_size_read(inode);
- if (release_bytes) {
- if (only_release_metadata) {
- btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_delalloc_release_metadata(BTRFS_I(inode),
- release_bytes, true);
- } else {
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved,
- round_down(pos, fs_info->sectorsize),
- release_bytes, true);
- }
+ ret = generic_write_checks(iocb, iter);
+ if (ret <= 0)
+ goto out;
+
+ ret = btrfs_write_check(iocb, ret);
+ if (ret < 0)
+ goto out;
+
+ pos = iocb->ki_pos;
+ while (iov_iter_count(iter) > 0) {
+ ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait);
+ if (ret < 0)
+ break;
+ pos += ret;
+ num_written += ret;
+ cond_resched();
}
extent_changeset_free(data_reserved);
@@ -1402,7 +1486,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
if (private) {
kfree(private->filldir_buf);
- free_extent_state(private->llseek_cached_state);
+ btrfs_free_extent_state(private->llseek_cached_state);
kfree(private);
filp->private_data = NULL;
}
@@ -1770,27 +1854,25 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
struct folio *folio = page_folio(page);
- struct inode *inode = file_inode(vmf->vma->vm_file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(vmf->vma->vm_file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
unsigned long zero_start;
loff_t size;
- vm_fault_t ret;
- int ret2;
- int reserved = 0;
+ size_t fsize = folio_size(folio);
+ int ret;
+ bool only_release_metadata = false;
u64 reserved_space;
u64 page_start;
u64 page_end;
u64 end;
- ASSERT(folio_order(folio) == 0);
+ reserved_space = fsize;
- reserved_space = PAGE_SIZE;
-
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(inode->vfs_inode.i_sb);
page_start = folio_pos(folio);
page_end = page_start + folio_size(folio) - 1;
end = page_end;
@@ -1803,38 +1885,53 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
* end up waiting indefinitely to get a lock on the page currently
* being processed by btrfs_page_mkwrite() function.
*/
- ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
- page_start, reserved_space);
- if (!ret2) {
- ret2 = file_update_time(vmf->vma->vm_file);
- reserved = 1;
- }
- if (ret2) {
- ret = vmf_error(ret2);
- if (reserved)
- goto out;
+ ret = btrfs_check_data_free_space(inode, &data_reserved, page_start,
+ reserved_space, false);
+ if (ret < 0) {
+ size_t write_bytes = reserved_space;
+
+ if (btrfs_check_nocow_lock(inode, page_start, &write_bytes, false) <= 0)
+ goto out_noreserve;
+
+ only_release_metadata = true;
+
+ /*
+ * Can't write the whole range, there may be shared extents or
+ * holes in the range, bail out with @only_release_metadata set
+ * to true so that we unlock the nocow lock before returning the
+ * error.
+ */
+ if (write_bytes < reserved_space)
+ goto out_noreserve;
+ }
+ ret = btrfs_delalloc_reserve_metadata(inode, reserved_space,
+ reserved_space, false);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ page_start, reserved_space);
goto out_noreserve;
}
- /* Make the VM retry the fault. */
- ret = VM_FAULT_NOPAGE;
+ ret = file_update_time(vmf->vma->vm_file);
+ if (ret < 0)
+ goto out;
again:
- down_read(&BTRFS_I(inode)->i_mmap_lock);
+ down_read(&inode->i_mmap_lock);
folio_lock(folio);
- size = i_size_read(inode);
+ size = i_size_read(&inode->vfs_inode);
- if ((folio->mapping != inode->i_mapping) ||
+ if ((folio->mapping != inode->vfs_inode.i_mapping) ||
(page_start >= size)) {
/* Page got truncated out from underneath us. */
goto out_unlock;
}
folio_wait_writeback(folio);
- lock_extent(io_tree, page_start, page_end, &cached_state);
- ret2 = set_folio_extent_mapped(folio);
- if (ret2 < 0) {
- ret = vmf_error(ret2);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(io_tree, page_start, page_end, &cached_state);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0) {
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -1842,23 +1939,27 @@ again:
* We can't set the delalloc bits if there are pending ordered
* extents. Drop our locks and wait for them to finish.
*/
- ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE);
+ ordered = btrfs_lookup_ordered_range(inode, page_start, fsize);
if (ordered) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
goto again;
}
- if (folio->index == ((size - 1) >> PAGE_SHIFT)) {
+ if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) {
reserved_space = round_up(size - page_start, fs_info->sectorsize);
- if (reserved_space < PAGE_SIZE) {
+ if (reserved_space < fsize) {
+ const u64 to_free = fsize - reserved_space;
+
end = page_start + reserved_space - 1;
- btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, page_start,
- PAGE_SIZE - reserved_space, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, to_free, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ end + 1, to_free, true);
}
}
@@ -1869,15 +1970,13 @@ again:
* clear any delalloc bits within this page range since we have to
* reserve data&meta space before lock_page() (see above comments).
*/
- clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
-
- ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
- &cached_state);
- if (ret2) {
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- ret = VM_FAULT_SIGBUS;
+ btrfs_clear_extent_bit(io_tree, page_start, end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
+
+ ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state);
+ if (ret < 0) {
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
goto out_unlock;
}
@@ -1885,36 +1984,53 @@ again:
if (page_start + folio_size(folio) > size)
zero_start = offset_in_folio(folio, size);
else
- zero_start = PAGE_SIZE;
+ zero_start = fsize;
- if (zero_start != PAGE_SIZE)
+ if (zero_start != fsize)
folio_zero_range(folio, zero_start, folio_size(folio) - zero_start);
- btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, fsize);
btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
- btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+ btrfs_set_inode_last_sub_trans(inode);
+
+ if (only_release_metadata)
+ btrfs_set_extent_bit(io_tree, page_start, end, EXTENT_NORESERVE,
+ &cached_state);
- unlock_extent(io_tree, page_start, page_end, &cached_state);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state);
+ up_read(&inode->i_mmap_lock);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- sb_end_pagefault(inode->i_sb);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+ sb_end_pagefault(inode->vfs_inode.i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
out_unlock:
folio_unlock(folio);
- up_read(&BTRFS_I(inode)->i_mmap_lock);
+ up_read(&inode->i_mmap_lock);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
- btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
- reserved_space, (ret != 0));
-out_noreserve:
- sb_end_pagefault(inode->i_sb);
+ btrfs_delalloc_release_extents(inode, fsize);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, reserved_space, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ reserved_space, true);
extent_changeset_free(data_reserved);
- return ret;
+out_noreserve:
+ if (only_release_metadata)
+ btrfs_check_nocow_unlock(inode);
+
+ sb_end_pagefault(inode->vfs_inode.i_sb);
+
+ if (ret < 0)
+ return vmf_error(ret);
+
+ /* Make the VM retry the fault. */
+ return VM_FAULT_NOPAGE;
}
static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -1923,46 +2039,47 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
.page_mkwrite = btrfs_page_mkwrite,
};
-static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
+static int btrfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *filp = desc->file;
struct address_space *mapping = filp->f_mapping;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(filp);
- vma->vm_ops = &btrfs_file_vm_ops;
+ desc->vm_ops = &btrfs_file_vm_ops;
return 0;
}
-static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
- int slot, u64 start, u64 end)
+static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
+ int slot, u64 start, u64 end)
{
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
if (slot < 0 || slot >= btrfs_header_nritems(leaf))
- return 0;
+ return false;
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY)
- return 0;
+ return false;
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
- return 0;
+ return false;
if (btrfs_file_extent_disk_bytenr(leaf, fi))
- return 0;
+ return false;
if (key.offset == end)
- return 1;
+ return true;
if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
- return 1;
- return 0;
+ return true;
+ return false;
}
static int fill_holes(struct btrfs_trans_handle *trans,
@@ -2036,7 +2153,7 @@ static int fill_holes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, offset, end - 1, false);
btrfs_set_inode_full_sync(inode);
@@ -2050,7 +2167,7 @@ out:
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
if (ret)
btrfs_set_inode_full_sync(inode);
}
@@ -2083,15 +2200,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
0 : *start + *len - em->start - em->len;
*start = em->start + em->len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
-static void btrfs_punch_hole_lock_range(struct inode *inode,
- const u64 lockstart,
- const u64 lockend,
- struct extent_state **cached_state)
+/*
+ * Check if there is no folio in the range.
+ *
+ * We cannot utilize filemap_range_has_page() in a filemap with large folios
+ * as we can hit the following false positive:
+ *
+ * start end
+ * | |
+ * |//|//|//|//| | | | | | | | |//|//|
+ * \ / \ /
+ * Folio A Folio B
+ *
+ * That large folio A and B cover the start and end indexes.
+ * In that case filemap_range_has_page() will always return true, but the above
+ * case is fine for btrfs_punch_hole_lock_range() usage.
+ *
+ * So here we only ensure that no other folios is in the range, excluding the
+ * head/tail large folio.
+ */
+static bool check_range_has_page(struct inode *inode, u64 start, u64 end)
{
+ struct folio_batch fbatch;
+ bool ret = false;
/*
* For subpage case, if the range is not at page boundary, we could
* have pages at the leading/tailing part of the range.
@@ -2099,15 +2234,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* will always return true.
* So here we need to do extra page alignment for
* filemap_range_has_page().
+ *
+ * And do not decrease page_lockend right now, as it can be 0.
*/
- const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
- const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
+ const u64 page_lockstart = round_up(start, PAGE_SIZE);
+ const u64 page_lockend = round_down(end + 1, PAGE_SIZE);
+ const pgoff_t start_index = page_lockstart >> PAGE_SHIFT;
+ const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT;
+ pgoff_t tmp = start_index;
+ int found_folios;
+
+ /* The same page or adjacent pages. */
+ if (page_lockend <= page_lockstart)
+ return false;
+
+ folio_batch_init(&fbatch);
+ found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch);
+ for (int i = 0; i < found_folios; i++) {
+ struct folio *folio = fbatch.folios[i];
+ /* A large folio begins before the start. Not a target. */
+ if (folio->index < start_index)
+ continue;
+ /* A large folio extends beyond the end. Not a target. */
+ if (folio_next_index(folio) > end_index)
+ continue;
+ /* A folio doesn't cover the head/tail index. Found a target. */
+ ret = true;
+ break;
+ }
+ folio_batch_release(&fbatch);
+ return ret;
+}
+
+static void btrfs_punch_hole_lock_range(struct inode *inode,
+ const u64 lockstart, const u64 lockend,
+ struct extent_state **cached_state)
+{
while (1) {
truncate_pagecache_range(inode, lockstart, lockend);
- lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
/*
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
@@ -2118,12 +2286,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode,
* locking the range check if we have pages in the range, and if
* we do, unlock the range and retry.
*/
- if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
- page_lockend))
+ if (!check_range_has_page(inode, lockstart, lockend))
break;
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ cached_state);
}
btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
@@ -2236,7 +2403,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
unsigned int rsv_count;
u64 cur_offset;
u64 len = end - start;
@@ -2245,13 +2412,9 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (end <= start)
return -EINVAL;
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv) {
- ret = -ENOMEM;
- goto out;
- }
- rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_insert_metadata_size(fs_info, 1);
+ rsv.failfast = true;
/*
* 1 - update the inode
@@ -2268,14 +2431,14 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
- goto out_free;
+ goto out_release;
}
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
if (WARN_ON(ret))
goto out_trans;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = start;
drop_args.path = path;
@@ -2391,10 +2554,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
cur_offset = drop_args.drop_end;
len = end - cur_offset;
@@ -2471,16 +2634,15 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode,
out_trans:
if (!trans)
- goto out_free;
+ goto out_release;
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret)
btrfs_end_transaction(trans);
else
*trans_out = trans;
-out_free:
- btrfs_free_block_rsv(fs_info, rsv);
-out:
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
return ret;
}
@@ -2496,7 +2658,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
u64 lockend;
u64 tail_start;
u64 tail_len;
- u64 orig_start = offset;
+ const u64 orig_start = offset;
+ const u64 orig_end = offset + len - 1;
int ret = 0;
bool same_block;
u64 ino_size;
@@ -2528,18 +2691,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
/*
- * We needn't truncate any block which is beyond the end of the file
- * because we are sure there is no data there.
- */
- /*
* Only do this if we are in the same block and we aren't doing the
* entire block.
*/
if (same_block && len < fs_info->sectorsize) {
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
- 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
} else {
ret = 0;
}
@@ -2549,7 +2708,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
/* zero back part of the first block */
if (offset < ino_size) {
truncated_block = true;
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end);
if (ret) {
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
@@ -2586,8 +2745,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
if (tail_start + tail_len < ino_size) {
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode),
- tail_start + tail_len,
- 0, 1);
+ tail_start + tail_len - 1,
+ orig_start, orig_end);
if (ret)
goto out_only_mutex;
}
@@ -2621,8 +2780,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out:
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
out_only_mutex:
if (!updated_inode && truncated_block && !ret) {
/*
@@ -2740,7 +2899,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ret;
}
@@ -2755,6 +2914,8 @@ static int btrfs_zero_range(struct inode *inode,
int ret;
u64 alloc_hint = 0;
const u64 sectorsize = fs_info->sectorsize;
+ const u64 orig_start = offset;
+ const u64 orig_end = offset + len - 1;
u64 alloc_start = round_down(offset, sectorsize);
u64 alloc_end = round_up(offset + len, sectorsize);
u64 bytes_to_reserve = 0;
@@ -2784,7 +2945,7 @@ static int btrfs_zero_range(struct inode *inode,
* do nothing except updating the inode's i_size if
* needed.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
@@ -2797,9 +2958,9 @@ static int btrfs_zero_range(struct inode *inode,
ASSERT(IS_ALIGNED(alloc_start, sectorsize));
len = offset + len - alloc_start;
offset = alloc_start;
- alloc_hint = extent_map_block_start(em) + em->len;
+ alloc_hint = btrfs_extent_map_block_start(em) + em->len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
@@ -2810,22 +2971,22 @@ static int btrfs_zero_range(struct inode *inode,
}
if (em->flags & EXTENT_FLAG_PREALLOC) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
goto out;
}
if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) {
- free_extent_map(em);
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
- 0);
+ btrfs_free_extent_map(em);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
if (!ret)
ret = btrfs_fallocate_update_isize(inode,
offset + len,
mode);
return ret;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
alloc_start = round_down(offset, sectorsize);
alloc_end = alloc_start + sectorsize;
goto reserve_space;
@@ -2849,7 +3010,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_start = round_down(offset, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset,
+ orig_start, orig_end);
if (ret)
goto out;
} else {
@@ -2866,8 +3028,8 @@ static int btrfs_zero_range(struct inode *inode,
alloc_end = round_up(offset + len, sectorsize);
ret = 0;
} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
- ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
- 0, 1);
+ ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1,
+ orig_start, orig_end);
if (ret)
goto out;
} else {
@@ -2892,16 +3054,16 @@ reserve_space:
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
+ lockend, &cached_state);
goto out;
}
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
fs_info->sectorsize,
offset + len, &alloc_hint);
- unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ &cached_state);
/* btrfs_prealloc_file_range releases reserved space on error */
if (ret) {
space_reserved = false;
@@ -2987,7 +3149,8 @@ static long btrfs_fallocate(struct file *file, int mode,
* need to zero out the end of the block if i_size lands in the
* middle of a block.
*/
- ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
+ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size,
+ inode->i_size, (u64)-1);
if (ret)
goto out;
}
@@ -3012,8 +3175,8 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
@@ -3025,8 +3188,8 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = PTR_ERR(em);
break;
}
- last_byte = min(extent_map_end(em), alloc_end);
- actual_end = min_t(u64, extent_map_end(em), offset + len);
+ last_byte = min(btrfs_extent_map_end(em), alloc_end);
+ actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
@@ -3035,19 +3198,19 @@ static long btrfs_fallocate(struct file *file, int mode,
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
&data_reserved, cur_offset, range_len);
if (ret < 0) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
break;
}
qgroup_reserved += range_len;
data_space_needed += range_len;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
cur_offset = last_byte;
}
@@ -3101,8 +3264,8 @@ static long btrfs_fallocate(struct file *file, int mode,
*/
ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
- unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+ &cached_state);
out:
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
@@ -3136,10 +3299,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
if (inode->delalloc_bytes > 0) {
spin_unlock(&inode->lock);
*delalloc_start_ret = start;
- delalloc_len = count_range_bits(&inode->io_tree,
- delalloc_start_ret, end,
- len, EXTENT_DELALLOC, 1,
- cached_state);
+ delalloc_len = btrfs_count_range_bits(&inode->io_tree,
+ delalloc_start_ret, end,
+ len, EXTENT_DELALLOC, 1,
+ cached_state);
} else {
spin_unlock(&inode->lock);
}
@@ -3448,7 +3611,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
last_extent_end = lockstart;
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
@@ -3594,7 +3757,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
}
out:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
btrfs_free_path(path);
if (ret < 0)
@@ -3659,7 +3822,7 @@ const struct file_operations btrfs_file_operations = {
.splice_read = filemap_splice_read,
.write_iter = btrfs_file_write_iter,
.splice_write = iter_file_splice_write,
- .mmap = btrfs_file_mmap,
+ .mmap_prepare = btrfs_file_mmap_prepare,
.open = btrfs_file_open,
.release = btrfs_release_file,
.get_unmapped_area = thp_get_unmapped_area,
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index de89e644be29..d7df81388cbe 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -9,6 +9,8 @@ struct file;
struct extent_state;
struct kiocb;
struct iov_iter;
+struct inode;
+struct folio;
struct page;
struct btrfs_ioctl_encoded_io_args;
struct btrfs_drop_extents_args;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d42b6f882f57..5d8d1570a5c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_disk_key disk_key;
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode;
unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_release_path(path);
memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
- mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_constraint(inode->i_mapping,
+ mapping_set_gfp_mask(inode->vfs_inode.i_mapping,
+ mapping_gfp_constraint(inode->vfs_inode.i_mapping,
~(__GFP_FS | __GFP_HIGHMEM)));
- return inode;
+ return &inode->vfs_inode;
}
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
@@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root,
btrfs_release_path(path);
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
@@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_block_group *block_group)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
int ret = 0;
@@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(inode)) {
if (PTR_ERR(inode) != -ENOENT)
ret = PTR_ERR(inode);
- goto out;
+ return ret;
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(BTRFS_I(inode));
- goto out;
+ return ret;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = 0;
- goto out;
+ return ret;
}
- ret = btrfs_del_item(trans, trans->fs_info->tree_root, path);
-out:
- btrfs_free_path(path);
- return ret;
+ return btrfs_del_item(trans, trans->fs_info->tree_root, path);
}
int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
@@ -311,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
bool locked = false;
if (block_group) {
- struct btrfs_path *path = btrfs_alloc_path();
+ BTRFS_PATH_AUTO_FREE(path);
+ path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto fail;
@@ -333,13 +331,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_CLEAR;
spin_unlock(&block_group->lock);
- btrfs_free_path(path);
}
btrfs_i_size_write(inode, 0);
truncate_pagecache(vfs_inode, 0);
- lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
/*
@@ -351,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans,
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state);
if (ret)
goto fail;
@@ -369,7 +366,7 @@ fail:
static void readahead_cache(struct inode *inode)
{
struct file_ra_state ra;
- unsigned long last_index;
+ pgoff_t last_index;
file_ra_state_init(&ra, inode->i_mapping);
last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
@@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
{
- struct page *page;
+ struct folio *folio;
struct inode *inode = io_ctl->inode;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int i;
@@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate)
for (i = 0; i < io_ctl->num_pages; i++) {
int ret;
- page = find_or_create_page(inode->i_mapping, i, mask);
- if (!page) {
+ folio = __filemap_get_folio(inode->i_mapping, i,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mask);
+ if (IS_ERR(folio)) {
io_ctl_drop_pages(io_ctl);
- return -ENOMEM;
+ return PTR_ERR(folio);
}
- ret = set_folio_extent_mapped(page_folio(page));
+ ret = set_folio_extent_mapped(folio);
if (ret < 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
io_ctl_drop_pages(io_ctl);
return ret;
}
- io_ctl->pages[i] = page;
- if (uptodate && !PageUptodate(page)) {
- btrfs_read_folio(NULL, page_folio(page));
- lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ io_ctl->pages[i] = &folio->page;
+ if (uptodate && !folio_test_uptodate(folio)) {
+ btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"free space cache page truncated");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
io_ctl_drop_pages(io_ctl);
@@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
return 0;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -1081,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
/* Get the cluster for this block_group if it exists */
if (block_group && !list_empty(&block_group->cluster_list)) {
- cluster = list_entry(block_group->cluster_list.next,
- struct btrfs_free_cluster,
- block_group_list);
+ cluster = list_first_entry(&block_group->cluster_list,
+ struct btrfs_free_cluster, block_group_list);
}
if (!node && cluster) {
@@ -1156,13 +1154,13 @@ update_cache_item(struct btrfs_trans_handle *trans,
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
key.type = 0;
+ key.offset = offset;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
goto fail;
}
leaf = path->nodes[0];
@@ -1173,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans,
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
found_key.offset != offset) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1, EXTENT_DELALLOC,
- NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1, EXTENT_DELALLOC,
+ NULL);
btrfs_release_path(path);
goto fail;
}
@@ -1220,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries(
start = block_group->start;
while (start < block_group->start + block_group->length) {
- if (!find_first_extent_bit(unpin, start,
- &extent_start, &extent_end,
- EXTENT_DIRTY, NULL))
+ if (!btrfs_find_first_extent_bit(unpin, start,
+ &extent_start, &extent_end,
+ EXTENT_DIRTY, NULL))
return 0;
/* This pinned extent is out of our range */
@@ -1268,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode)
ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret)
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
- EXTENT_DELALLOC, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DELALLOC, NULL);
return ret;
}
@@ -1289,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode,
struct extent_state **cached_state)
{
io_ctl_drop_pages(io_ctl);
- unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ cached_state);
}
static int __btrfs_wait_cache_io(struct btrfs_root *root,
@@ -1415,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode,
if (ret)
goto out_unlock;
- lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
io_ctl_set_generation(io_ctl, trans->transid);
@@ -1476,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_drop_pages(io_ctl);
io_ctl_free(io_ctl);
- unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ &cached_state);
/*
* at this point the pages are under IO and we're happy,
@@ -2343,9 +2341,8 @@ again:
struct rb_node *node;
struct btrfs_free_space *entry;
- cluster = list_entry(block_group->cluster_list.next,
- struct btrfs_free_cluster,
- block_group_list);
+ cluster = list_first_entry(&block_group->cluster_list,
+ struct btrfs_free_cluster, block_group_list);
spin_lock(&cluster->lock);
node = rb_first(&cluster->root);
if (!node) {
@@ -3195,7 +3192,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
u64 *max_extent_size)
{
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
- int err;
+ int ret2;
u64 search_start = cluster->window_start;
u64 search_bytes = bytes;
u64 ret = 0;
@@ -3203,8 +3200,8 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
- if (err) {
+ ret2 = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
+ if (ret2) {
*max_extent_size = max(get_max_extent_size(entry),
*max_extent_size);
return 0;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index cae540ec15ed..eba7f22ae49c 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -35,7 +35,7 @@ static struct btrfs_root *btrfs_free_space_root(
return btrfs_global_root(block_group->fs_info, &key);
}
-void set_free_space_tree_thresholds(struct btrfs_block_group *cache)
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *cache)
{
u32 bitmap_range;
size_t bitmap_size;
@@ -82,22 +82,19 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
if (ret)
- goto out;
+ return ret;
leaf = path->nodes[0];
info = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_info);
btrfs_set_free_space_extent_count(leaf, info, 0);
btrfs_set_free_space_flags(leaf, info, 0);
-
- ret = 0;
-out:
btrfs_release_path(path);
- return ret;
+ return 0;
}
EXPORT_FOR_TESTS
-struct btrfs_free_space_info *search_free_space_info(
+struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow)
@@ -117,7 +114,7 @@ struct btrfs_free_space_info *search_free_space_info(
if (ret != 0) {
btrfs_warn(fs_info, "missing free space info for %llu",
block_group->start);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-ENOENT);
}
@@ -141,12 +138,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
return ret;
if (ret == 0) {
- ASSERT(0);
+ DEBUG_WARN();
return -EIO;
}
if (p->slots[0] == 0) {
- ASSERT(0);
+ DEBUG_WARN("no previous slot found");
return -EIO;
}
p->slots[0]--;
@@ -201,9 +198,9 @@ static void le_bitmap_set(unsigned long *map, unsigned int start, int len)
}
EXPORT_FOR_TESTS
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -223,6 +220,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -235,8 +233,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -271,19 +271,24 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = true;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
@@ -293,8 +298,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
ret = -EIO;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -315,8 +320,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
data_size);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -331,15 +338,13 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
ret = 0;
out:
kvfree(bitmap);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
EXPORT_FOR_TESTS
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path)
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *root = btrfs_free_space_root(block_group);
@@ -358,6 +363,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
bitmap = alloc_bitmap(bitmap_size);
if (!bitmap) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -370,8 +376,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -400,49 +408,56 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
data_size = free_space_bitmap_size(fs_info,
found_key.offset);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+ path->slots[0]--;
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
read_extent_buffer(leaf, bitmap_cursor, ptr,
data_size);
nr++;
- path->slots[0]--;
} else {
ASSERT(0);
}
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
- info = search_free_space_info(trans, block_group, path, 1);
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
if (IS_ERR(info)) {
ret = PTR_ERR(info);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
leaf = path->nodes[0];
flags = btrfs_free_space_flags(leaf, info);
flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+ block_group->using_free_space_bitmaps = false;
+ block_group->using_free_space_bitmaps_cached = true;
btrfs_set_free_space_flags(leaf, info, flags);
expected_extent_count = btrfs_free_space_extent_count(leaf, info);
btrfs_release_path(path);
- nrbits = block_group->length >> block_group->fs_info->sectorsize_bits;
+ nrbits = block_group->length >> fs_info->sectorsize_bits;
start_bit = find_next_bit_le(bitmap, nrbits, 0);
while (start_bit < nrbits) {
end_bit = find_next_zero_bit_le(bitmap, nrbits, start_bit);
ASSERT(start_bit < end_bit);
- key.objectid = start + start_bit * block_group->fs_info->sectorsize;
+ key.objectid = start + start_bit * fs_info->sectorsize;
key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
- key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize;
+ key.offset = (end_bit - start_bit) * fs_info->sectorsize;
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
extent_count++;
@@ -455,16 +470,14 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
ret = -EIO;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = 0;
out:
kvfree(bitmap);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -481,11 +494,10 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (new_extents == 0)
return 0;
- info = search_free_space_info(trans, block_group, path, 1);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ info = btrfs_search_free_space_info(trans, block_group, path, 1);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
flags = btrfs_free_space_flags(path->nodes[0], info);
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
@@ -495,19 +507,18 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count > block_group->bitmap_high_thresh) {
- ret = convert_free_space_to_bitmaps(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, block_group, path);
} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
extent_count < block_group->bitmap_low_thresh) {
- ret = convert_free_space_to_extents(trans, block_group, path);
+ ret = btrfs_convert_free_space_to_extents(trans, block_group, path);
}
-out:
return ret;
}
EXPORT_FOR_TESTS
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset)
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset)
{
struct extent_buffer *leaf;
struct btrfs_key key;
@@ -525,13 +536,13 @@ int free_space_test_bit(struct btrfs_block_group *block_group,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
i = div_u64(offset - found_start,
block_group->fs_info->sectorsize);
- return !!extent_buffer_test_bit(leaf, ptr, i);
+ return extent_buffer_test_bit(leaf, ptr, i);
}
-static void free_space_set_bits(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 *start, u64 *size,
- int bit)
+static void free_space_modify_bits(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 *start, u64 *size,
+ bool set_bits)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct extent_buffer *leaf;
@@ -555,7 +566,7 @@ static void free_space_set_bits(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
first = (*start - found_start) >> fs_info->sectorsize_bits;
last = (end - found_start) >> fs_info->sectorsize_bits;
- if (bit)
+ if (set_bits)
extent_buffer_bitmap_set(leaf, ptr, first, last - first);
else
extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
@@ -599,13 +610,14 @@ static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path,
- u64 start, u64 size, int remove)
+ u64 start, u64 size, bool remove)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_key key;
u64 end = start + size;
u64 cur_start, cur_size;
- int prev_bit, next_bit;
+ bool prev_bit_set = false;
+ bool next_bit_set = false;
int new_extents;
int ret;
@@ -622,16 +634,16 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
+ return ret;
- prev_bit = free_space_test_bit(block_group, path, prev_block);
+ prev_bit_set = btrfs_free_space_test_bit(block_group, path, prev_block);
/* The previous block may have been in the previous bitmap. */
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (start >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
} else {
key.objectid = start;
@@ -640,9 +652,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
if (ret)
- goto out;
-
- prev_bit = -1;
+ return ret;
}
/*
@@ -652,13 +662,13 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
cur_start = start;
cur_size = size;
while (1) {
- free_space_set_bits(trans, block_group, path, &cur_start, &cur_size,
- !remove);
+ free_space_modify_bits(trans, block_group, path, &cur_start,
+ &cur_size, !remove);
if (cur_size == 0)
break;
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
/*
@@ -671,42 +681,36 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
if (end >= key.objectid + key.offset) {
ret = free_space_next_bitmap(trans, root, path);
if (ret)
- goto out;
+ return ret;
}
- next_bit = free_space_test_bit(block_group, path, end);
- } else {
- next_bit = -1;
+ next_bit_set = btrfs_free_space_test_bit(block_group, path, end);
}
if (remove) {
new_extents = -1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Leftover on the left. */
new_extents++;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Leftover on the right. */
new_extents++;
}
} else {
new_extents = 1;
- if (prev_bit == 1) {
+ if (prev_bit_set) {
/* Merging with neighbor on the left. */
new_extents--;
}
- if (next_bit == 1) {
+ if (next_bit_set) {
/* Merging with neighbor on the right. */
new_extents--;
}
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
static int remove_free_space_extent(struct btrfs_trans_handle *trans,
@@ -727,7 +731,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -759,7 +763,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
/* Delete the existing key (cases 1-4). */
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
/* Add a key for leftovers at the beginning (cases 3 and 4). */
if (start > found_start) {
@@ -770,7 +774,7 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
@@ -783,50 +787,58 @@ static int remove_free_space_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
if (ret)
- goto out;
+ return ret;
new_extents++;
}
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
-EXPORT_FOR_TESTS
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+static int using_bitmaps(struct btrfs_block_group *bg, struct btrfs_path *path)
{
struct btrfs_free_space_info *info;
u32 flags;
- int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ if (bg->using_free_space_bitmaps_cached)
+ return bg->using_free_space_bitmaps;
- info = search_free_space_info(NULL, block_group, path, 0);
+ info = btrfs_search_free_space_info(NULL, bg, path, 0);
if (IS_ERR(info))
return PTR_ERR(info);
flags = btrfs_free_space_flags(path->nodes[0], info);
btrfs_release_path(path);
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ bg->using_free_space_bitmaps = (flags & BTRFS_FREE_SPACE_USING_BITMAPS);
+ bg->using_free_space_bitmaps_cached = true;
+
+ return bg->using_free_space_bitmaps;
+}
+
+EXPORT_FOR_TESTS
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
+{
+ int ret;
+
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
+
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
+
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 1);
- } else {
- return remove_free_space_extent(trans, block_group, path,
- start, size);
- }
+ start, size, true);
+
+ return remove_free_space_extent(trans, block_group, path, start, size);
}
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
struct btrfs_path *path;
@@ -838,26 +850,27 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
- ASSERT(0);
+ DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
mutex_lock(&block_group->free_space_lock);
- ret = __remove_from_free_space_tree(trans, block_group, path, start,
- size);
+ ret = __btrfs_remove_from_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
btrfs_put_block_group(block_group);
out:
btrfs_free_path(path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -904,7 +917,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -927,7 +940,7 @@ static int add_free_space_extent(struct btrfs_trans_handle *trans,
if (found_end == start) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.objectid = found_start;
new_key.offset += key.offset;
new_extents--;
@@ -944,7 +957,7 @@ right:
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
if (ret)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
@@ -968,7 +981,7 @@ right:
if (found_start == end) {
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ return ret;
new_key.offset += key.offset;
new_extents--;
}
@@ -978,48 +991,36 @@ insert:
/* Insert the new key (cases 1-4). */
ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
- ret = update_free_space_extent_count(trans, block_group, path,
- new_extents);
-
-out:
- return ret;
+ return update_free_space_extent_count(trans, block_group, path, new_extents);
}
EXPORT_FOR_TESTS
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size)
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size)
{
- struct btrfs_free_space_info *info;
- u32 flags;
int ret;
- if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
- ret = __add_block_group_free_space(trans, block_group, path);
- if (ret)
- return ret;
- }
+ ret = __add_block_group_free_space(trans, block_group, path);
+ if (ret)
+ return ret;
- info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info))
- return PTR_ERR(info);
- flags = btrfs_free_space_flags(path->nodes[0], info);
- btrfs_release_path(path);
+ ret = using_bitmaps(block_group, path);
+ if (ret < 0)
+ return ret;
- if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+ if (ret)
return modify_free_space_bitmap(trans, block_group, path,
- start, size, 0);
- } else {
- return add_free_space_extent(trans, block_group, path, start,
- size);
- }
+ start, size, false);
+
+ return add_free_space_extent(trans, block_group, path, start, size);
}
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size)
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size)
{
struct btrfs_block_group *block_group;
struct btrfs_path *path;
@@ -1031,25 +1032,27 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
block_group = btrfs_lookup_block_group(trans->fs_info, start);
if (!block_group) {
- ASSERT(0);
+ DEBUG_WARN("no block group found for start=%llu", start);
ret = -ENOENT;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
mutex_lock(&block_group->free_space_lock);
- ret = __add_to_free_space_tree(trans, block_group, path, start, size);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path, start, size);
mutex_unlock(&block_group->free_space_lock);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
btrfs_put_block_group(block_group);
out:
btrfs_free_path(path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1062,7 +1065,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_root *extent_root;
- struct btrfs_path *path, *path2;
+ BTRFS_PATH_AUTO_FREE(path);
+ BTRFS_PATH_AUTO_FREE(path2);
struct btrfs_key key;
u64 start, end;
int ret;
@@ -1070,17 +1074,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->reada = READA_FORWARD;
path2 = btrfs_alloc_path();
- if (!path2) {
- btrfs_free_path(path);
+ if (!path2)
return -ENOMEM;
- }
+
+ path->reada = READA_FORWARD;
ret = add_new_free_space_info(trans, block_group, path2);
if (ret)
- goto out;
+ return ret;
mutex_lock(&block_group->free_space_lock);
@@ -1099,11 +1102,21 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
if (ret < 0)
goto out_locked;
- ASSERT(ret == 0);
+ /*
+ * If ret is 1 (no key found), it means this is an empty block group,
+ * without any extents allocated from it and there's no block group
+ * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree
+ * because we are using the block group tree feature, so block group
+ * items are stored in the block group tree. It also means there are no
+ * extents allocated for block groups with a start offset beyond this
+ * block group's end offset (this is the last, highest, block group).
+ */
+ if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE))
+ ASSERT(ret == 0);
start = block_group->start;
end = block_group->start + block_group->length;
- while (1) {
+ while (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
@@ -1112,11 +1125,11 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
break;
if (start < key.objectid) {
- ret = __add_to_free_space_tree(trans,
- block_group,
- path2, start,
- key.objectid -
- start);
+ ret = __btrfs_add_to_free_space_tree(trans,
+ block_group,
+ path2, start,
+ key.objectid -
+ start);
if (ret)
goto out_locked;
}
@@ -1133,12 +1146,10 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = btrfs_next_item(extent_root, path);
if (ret < 0)
goto out_locked;
- if (ret)
- break;
}
if (start < end) {
- ret = __add_to_free_space_tree(trans, block_group, path2,
- start, end - start);
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path2,
+ start, end - start);
if (ret)
goto out_locked;
}
@@ -1146,9 +1157,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans,
ret = 0;
out_locked:
mutex_unlock(&block_group->free_space_lock);
-out:
- btrfs_free_path(path2);
- btrfs_free_path(path);
+
return ret;
}
@@ -1217,8 +1226,9 @@ out_clear:
static int clear_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
+ struct rb_node *node;
int nr;
int ret;
@@ -1233,7 +1243,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
- goto out;
+ return ret;
nr = btrfs_header_nritems(path->nodes[0]);
if (!nr)
@@ -1242,15 +1252,22 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans,
path->slots[0] = 0;
ret = btrfs_del_items(trans, root, path, 0, nr);
if (ret)
- goto out;
+ return ret;
btrfs_release_path(path);
}
- ret = 0;
-out:
- btrfs_free_path(path);
- return ret;
+ node = rb_first_cached(&trans->fs_info->block_group_cache_tree);
+ while (node) {
+ struct btrfs_block_group *bg;
+
+ bg = rb_entry(node, struct btrfs_block_group, cache_node);
+ clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags);
+ node = rb_next(node);
+ cond_resched();
+ }
+
+ return 0;
}
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info)
@@ -1339,12 +1356,18 @@ int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info)
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);
+
+ if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED,
+ &block_group->runtime_flags))
+ goto next;
+
ret = populate_free_space_tree(trans, block_group);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
return ret;
}
+next:
if (btrfs_should_end_transaction(trans)) {
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(free_space_root, 1);
@@ -1367,51 +1390,79 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path)
{
+ bool own_path = false;
int ret;
- clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
+ if (!test_and_clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &block_group->runtime_flags))
+ return 0;
+
+ /*
+ * While rebuilding the free space tree we may allocate new metadata
+ * block groups while modifying the free space tree.
+ *
+ * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we
+ * can use multiple transactions, every time btrfs_end_transaction() is
+ * called at btrfs_rebuild_free_space_tree() we finish the creation of
+ * new block groups by calling btrfs_create_pending_block_groups(), and
+ * that in turn calls us, through add_block_group_free_space(), to add
+ * a free space info item and a free space extent item for the block
+ * group.
+ *
+ * Then later btrfs_rebuild_free_space_tree() may find such new block
+ * groups and processes them with populate_free_space_tree(), which can
+ * fail with EEXIST since there are already items for the block group in
+ * the free space tree. Notice that we say "may find" because a new
+ * block group may be added to the block groups rbtree in a node before
+ * or after the block group currently being processed by the rebuild
+ * process. So signal the rebuild process to skip such new block groups
+ * if it finds them.
+ */
+ set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags);
+
+ if (!path) {
+ path = btrfs_alloc_path();
+ if (!path) {
+ btrfs_abort_transaction(trans, -ENOMEM);
+ return -ENOMEM;
+ }
+ own_path = true;
+ }
ret = add_new_free_space_info(trans, block_group, path);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ ret = __btrfs_add_to_free_space_tree(trans, block_group, path,
+ block_group->start, block_group->length);
if (ret)
- return ret;
+ btrfs_abort_transaction(trans, ret);
+
+out:
+ if (own_path)
+ btrfs_free_path(path);
- return __add_to_free_space_tree(trans, block_group, path,
- block_group->start,
- block_group->length);
+ return ret;
}
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_path *path = NULL;
- int ret = 0;
+ int ret;
- if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
mutex_lock(&block_group->free_space_lock);
- if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
- goto out;
-
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = __add_block_group_free_space(trans, block_group, path);
-
-out:
- btrfs_free_path(path);
+ ret = __add_block_group_free_space(trans, block_group, NULL);
mutex_unlock(&block_group->free_space_lock);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group)
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group)
{
struct btrfs_root *root = btrfs_free_space_root(block_group);
struct btrfs_path *path;
@@ -1432,6 +1483,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1444,8 +1496,10 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
while (!done) {
ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
leaf = path->nodes[0];
nr = 0;
@@ -1473,16 +1527,16 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
}
ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
btrfs_release_path(path);
}
ret = 0;
out:
btrfs_free_path(path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1494,7 +1548,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_key key;
- int prev_bit = 0, bit;
+ bool prev_bit_set = false;
/* Initialize to silence GCC. */
u64 extent_start = 0;
u64 end, offset;
@@ -1511,7 +1565,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
while (1) {
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1525,10 +1579,12 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(block_group, path, offset);
- if (prev_bit == 0 && bit == 1) {
+ bool bit_set;
+
+ bit_set = btrfs_free_space_test_bit(block_group, path, offset);
+ if (!prev_bit_set && bit_set) {
extent_start = offset;
- } else if (prev_bit == 1 && bit == 0) {
+ } else if (prev_bit_set && !bit_set) {
u64 space_added;
ret = btrfs_add_new_free_space(block_group,
@@ -1536,7 +1592,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1544,14 +1600,14 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
}
extent_count++;
}
- prev_bit = bit;
+ prev_bit_set = bit_set;
offset += fs_info->sectorsize;
}
}
- if (prev_bit == 1) {
+ if (prev_bit_set) {
ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL);
if (ret)
- goto out;
+ return ret;
extent_count++;
}
@@ -1560,14 +1616,11 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
- ret = -EIO;
- goto out;
+ DEBUG_WARN();
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
@@ -1594,7 +1647,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ret = btrfs_next_item(root, path);
if (ret < 0)
- goto out;
+ return ret;
if (ret)
break;
@@ -1610,7 +1663,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
key.objectid + key.offset,
&space_added);
if (ret)
- goto out;
+ return ret;
total_found += space_added;
if (total_found > CACHING_CTL_WAKE_UP) {
total_found = 0;
@@ -1624,23 +1677,19 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
"incorrect extent count for %llu; counted %u, expected %u",
block_group->start, extent_count,
expected_extent_count);
- ASSERT(0);
- ret = -EIO;
- goto out;
+ DEBUG_WARN();
+ return -EIO;
}
- ret = 0;
-out:
- return ret;
+ return 0;
}
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl)
{
struct btrfs_block_group *block_group;
struct btrfs_free_space_info *info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
u32 extent_count, flags;
- int ret;
block_group = caching_ctl->block_group;
@@ -1656,11 +1705,10 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
path->search_commit_root = 1;
path->reada = READA_FORWARD;
- info = search_free_space_info(NULL, block_group, path, 0);
- if (IS_ERR(info)) {
- ret = PTR_ERR(info);
- goto out;
- }
+ info = btrfs_search_free_space_info(NULL, block_group, path, 0);
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
flags = btrfs_free_space_flags(path->nodes[0], info);
@@ -1670,11 +1718,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
* there.
*/
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
- ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+ return load_free_space_bitmaps(caching_ctl, path, extent_count);
else
- ret = load_free_space_extents(caching_ctl, path, extent_count);
-
-out:
- btrfs_free_path(path);
- return ret;
+ return load_free_space_extents(caching_ctl, path, extent_count);
}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index e6c6d6f4f221..3d9a5d4477fc 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -22,39 +22,39 @@ struct btrfs_trans_handle;
#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
-void set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
+void btrfs_set_free_space_tree_thresholds(struct btrfs_block_group *block_group);
int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info);
int btrfs_rebuild_free_space_tree(struct btrfs_fs_info *fs_info);
-int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
-int add_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int remove_block_group_free_space(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group);
-int add_to_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
-int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- u64 start, u64 size);
+int btrfs_load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int btrfs_add_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_remove_block_group_free_space(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group);
+int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
+int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ u64 start, u64 size);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
-search_free_space_info(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, int cow);
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 start, u64 size);
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
- struct btrfs_block_group *block_group,
- struct btrfs_path *path);
-int free_space_test_bit(struct btrfs_block_group *block_group,
- struct btrfs_path *path, u64 offset);
+ struct btrfs_path *path, int cow);
+int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int __btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 start, u64 size);
+int btrfs_convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+int btrfs_convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *block_group,
+ struct btrfs_path *path);
+bool btrfs_free_space_test_bit(struct btrfs_block_group *block_group,
+ struct btrfs_path *path, u64 offset);
#endif
#endif
diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c
index 09cfb43580cb..b2bb86f8d7cf 100644
--- a/fs/btrfs/fs.c
+++ b/fs/btrfs/fs.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "messages.h"
-#include "ctree.h"
#include "fs.h"
#include "accessors.h"
#include "volumes.h"
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index b572d6b9730b..8cc07cc70b12 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -47,6 +47,18 @@ struct btrfs_subpage_info;
struct btrfs_stripe_hash_table;
struct btrfs_space_info;
+/*
+ * Minimum data and metadata block size.
+ *
+ * Normally it's 4K, but for testing subpage block size on 4K page systems, we
+ * allow DEBUG builds to accept 2K page size.
+ */
+#ifdef CONFIG_BTRFS_DEBUG
+#define BTRFS_MIN_BLOCKSIZE (SZ_2K)
+#else
+#define BTRFS_MIN_BLOCKSIZE (SZ_4K)
+#endif
+
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
@@ -105,6 +117,9 @@ enum {
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
+ /* No more delayed iput can be queued. */
+ BTRFS_FS_STATE_NO_DELAYED_IPUT,
+
BTRFS_FS_STATE_COUNT
};
@@ -285,6 +300,7 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
+#define BTRFS_WARNING_COMMIT_INTERVAL (300)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
struct btrfs_dev_replace {
@@ -404,6 +420,8 @@ struct btrfs_commit_stats {
u64 last_commit_dur;
/* The total commit duration in ns */
u64 total_commit_dur;
+ /* Start of the last critical section in ns. */
+ u64 critical_section_start_time;
};
struct btrfs_fs_info {
@@ -456,6 +474,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv delayed_block_rsv;
/* Block reservation for delayed refs */
struct btrfs_block_rsv delayed_refs_rsv;
+ /* Block reservation for treelog tree */
+ struct btrfs_block_rsv treelog_rsv;
struct btrfs_block_rsv empty_block_rsv;
@@ -485,8 +505,8 @@ struct btrfs_fs_info {
u64 last_trans_log_full_commit;
unsigned long long mount_opt;
- unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_type;
+ int compress_level;
u32 commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
@@ -695,8 +715,6 @@ struct btrfs_fs_info {
u32 data_chunk_allocations;
u32 metadata_ratio;
- void *bdev_holder;
-
/* Private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
@@ -709,7 +727,6 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
- u32 sectors_per_page;
struct workqueue_struct *scrub_workers;
struct btrfs_discard_ctl discard_ctl;
@@ -722,12 +739,6 @@ struct btrfs_fs_info {
spinlock_t qgroup_lock;
/*
- * Used to avoid frequently calling ulist_alloc()/ulist_free()
- * when doing qgroup accounting, it must be protected by qgroup_lock.
- */
- struct ulist *qgroup_ulist;
-
- /*
* Protect user change for quota operations. If a transaction is needed,
* it must be started before locking this lock.
*/
@@ -762,10 +773,8 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
- /* Extent buffer radix tree */
- spinlock_t buffer_lock;
- /* Entries are eb->start / sectorsize */
- struct radix_tree_root buffer_radix;
+ /* Entries are eb->start >> nodesize_bits */
+ struct xarray buffer_tree;
/* Next backup root to be overwritten */
int backup_root_index;
@@ -796,6 +805,7 @@ struct btrfs_fs_info {
/* Cached block sizes */
u32 nodesize;
+ u32 nodesize_bits;
u32 sectorsize;
/* ilog2 of sectorsize, use to avoid 64bit division */
u32 sectorsize_bits;
@@ -981,6 +991,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz
return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
}
+static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info,
+ const struct folio *folio)
+{
+ return folio_size(folio) >> fs_info->sectorsize_bits;
+}
+
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 448aa1a682d6..f06cf701ae5a 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -78,13 +78,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
}
/* Returns NULL if no extref found */
-struct btrfs_inode_extref *
-btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow)
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid)
{
int ret;
struct btrfs_key key;
@@ -93,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
@@ -109,7 +106,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
u64 inode_objectid, u64 ref_objectid,
u64 *index)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
@@ -129,9 +126,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
- ret = -ENOENT;
+ return -ENOENT;
if (ret < 0)
- goto out;
+ return ret;
/*
* Sanity check - did we find the right item for this name?
@@ -142,8 +139,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
ref_objectid, name);
if (!extref) {
btrfs_abort_transaction(trans, -ENOENT);
- ret = -ENOENT;
- goto out;
+ return -ENOENT;
}
leaf = path->nodes[0];
@@ -152,12 +148,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*index = btrfs_inode_extref_index(leaf, extref);
if (del_len == item_size) {
- /*
- * Common case only one ref in the item, remove the
- * whole item.
- */
- ret = btrfs_del_item(trans, root, path);
- goto out;
+ /* Common case only one ref in the item, remove the whole item. */
+ return btrfs_del_item(trans, root, path);
}
ptr = (unsigned long)extref;
@@ -168,9 +160,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
btrfs_truncate_item(trans, path, item_size - del_len, 1);
-out:
- btrfs_free_path(path);
-
return ret;
}
@@ -191,8 +180,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -260,7 +249,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
int ret;
int ins_len = name->len + sizeof(*extref);
unsigned long ptr;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -279,13 +268,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
path->slots[0],
ref_objectid,
name))
- goto out;
+ return ret;
btrfs_extend_item(trans, path, ins_len);
ret = 0;
}
if (ret < 0)
- goto out;
+ return ret;
leaf = path->nodes[0];
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
@@ -298,9 +287,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
@@ -317,8 +305,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
- key.offset = ref_objectid;
key.type = BTRFS_INODE_REF_KEY;
+ key.offset = ref_objectid;
path = btrfs_alloc_path();
if (!path)
@@ -493,8 +481,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
path->reada = READA_BACK;
key.objectid = control->ino;
- key.offset = (u64)-1;
key.type = (u8)-1;
+ key.offset = (u64)-1;
search_again:
/*
@@ -729,13 +717,12 @@ delete:
}
out:
if (ret >= 0 && pending_del_nr) {
- int err;
+ int ret2;
- err = btrfs_del_items(trans, root, path, pending_del_slot,
- pending_del_nr);
- if (err) {
- btrfs_abort_transaction(trans, err);
- ret = err;
+ ret2 = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr);
+ if (ret2) {
+ btrfs_abort_transaction(trans, ret2);
+ ret = ret2;
}
}
diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h
index c11b97fdccc4..6d9f5ad20646 100644
--- a/fs/btrfs/inode-item.h
+++ b/fs/btrfs/inode-item.h
@@ -101,13 +101,10 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
-struct btrfs_inode_extref *btrfs_lookup_inode_extref(
- struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const struct fscrypt_str *name,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow);
+struct btrfs_inode_extref *btrfs_lookup_inode_extref(struct btrfs_root *root,
+ struct btrfs_path *path,
+ const struct fscrypt_str *name,
+ u64 inode_objectid, u64 ref_objectid);
struct btrfs_inode_ref *btrfs_find_name_in_backref(const struct extent_buffer *leaf,
int slot,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a9322601ab5c..9e4aec7330cb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -308,7 +308,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
const u32 csum_size = root->fs_info->csum_size;
/* For data reloc tree, it's better to do a backref lookup instead. */
- if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ if (btrfs_is_data_reloc_root(root))
return print_data_reloc_error(inode, logical_start, csum,
csum_expected, mirror_num);
@@ -395,16 +395,18 @@ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
u64 offset, u64 bytes)
{
- unsigned long index = offset >> PAGE_SHIFT;
- unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
+ pgoff_t index = offset >> PAGE_SHIFT;
+ const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
struct folio *folio;
while (index <= end_index) {
folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
- index++;
- if (IS_ERR(folio))
+ if (IS_ERR(folio)) {
+ index++;
continue;
+ }
+ index = folio_end(folio) >> PAGE_SHIFT;
/*
* Here we just clear all Ordered bits for every page in the
* range, then btrfs_mark_ordered_io_finished() will handle
@@ -423,18 +425,18 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode);
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args)
{
- int err;
+ int ret;
if (args->default_acl) {
- err = __btrfs_set_acl(trans, args->inode, args->default_acl,
+ ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
ACL_TYPE_DEFAULT);
- if (err)
- return err;
+ if (ret)
+ return ret;
}
if (args->acl) {
- err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
- if (err)
- return err;
+ ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
+ if (ret)
+ return ret;
}
if (!args->default_acl && !args->acl)
cache_no_acl(args->inode);
@@ -489,8 +491,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
size_t datasize;
key.objectid = btrfs_ino(inode);
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(cur_size);
ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -566,23 +568,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
if (offset != 0)
return false;
- /*
- * Due to the page size limit, for subpage we can only trigger the
- * writeback for the dirty sectors of page, that means data writeback
- * is doing more writeback than what we want.
- *
- * This is especially unexpected for some call sites like fallocate,
- * where we only increase i_size after everything is done.
- * This means we can trigger inline extent even if we didn't want to.
- * So here we skip inline extent creation completely.
- */
- if (fs_info->sectorsize != PAGE_SIZE)
- return false;
-
/* Inline extents are limited to sectorsize. */
if (size > fs_info->sectorsize)
return false;
+ /* We do not allow a non-compressed extent to be as large as block size. */
+ if (data_len >= fs_info->sectorsize)
+ return false;
+
/* We cannot exceed the maximum inline data size. */
if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
return false;
@@ -672,7 +665,7 @@ out:
* And at reserve time, it's always aligned to page size, so
* just free one page here.
*/
- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
btrfs_free_path(path);
btrfs_end_transaction(trans);
return ret;
@@ -695,12 +688,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
return 1;
- lock_extent(&inode->io_tree, offset, end, &cached);
+ btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio,
update_i_size);
if (ret > 0) {
- unlock_extent(&inode->io_tree, offset, end, &cached);
+ btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
return ret;
}
@@ -786,33 +779,19 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
struct btrfs_fs_info *fs_info = inode->root->fs_info;
if (!btrfs_inode_can_compress(inode)) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
- btrfs_ino(inode));
+ DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
return 0;
}
- /*
- * Only enable sector perfect compression for experimental builds.
- *
- * This is a big feature change for subpage cases, and can hit
- * different corner cases, so only limit this feature for
- * experimental build for now.
- *
- * ETA for moving this out of experimental builds is 6.15.
- */
- if (fs_info->sectorsize < PAGE_SIZE &&
- !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(end + 1))
- return 0;
- }
+ /* Defrag ioctl takes precedence over mount options and properties. */
+ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
+ return 0;
+ if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
+ inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
+ return 1;
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
- /* defrag ioctl */
- if (inode->defrag_compress)
- return 1;
/* bad compression ratios */
if (inode->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
@@ -832,21 +811,20 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
btrfs_add_inode_defrag(inode, small_write);
}
-static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
{
- unsigned long end_index = end >> PAGE_SHIFT;
+ const pgoff_t end_index = end >> PAGE_SHIFT;
struct folio *folio;
int ret = 0;
- for (unsigned long index = start >> PAGE_SHIFT;
- index <= end_index; index++) {
- folio = filemap_get_folio(inode->i_mapping, index);
+ for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
end + 1 - start);
folio_put(folio);
}
@@ -886,6 +864,7 @@ static void compress_file_range(struct btrfs_work *work)
unsigned int poff;
int i;
int compress_type = fs_info->compress_type;
+ int compress_level = fs_info->compress_level;
inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
@@ -894,7 +873,7 @@ static void compress_file_range(struct btrfs_work *work)
* Otherwise applications with the file mmap'd can wander in and change
* the page contents while we are compressing them.
*/
- ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
+ ret = extent_range_clear_dirty_for_io(inode, start, end);
/*
* All the folios should have been locked thus no failure.
@@ -968,13 +947,15 @@ again:
goto cleanup_and_bail_uncompressed;
}
- if (inode->defrag_compress)
+ if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
compress_type = inode->defrag_compress;
- else if (inode->prop_compress)
+ compress_level = inode->defrag_compress_level;
+ } else if (inode->prop_compress) {
compress_type = inode->prop_compress;
+ }
/* Compression level is applied here. */
- ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4),
+ ret = btrfs_compress_folios(compress_type, compress_level,
mapping, start, folios, &nr_folios, &total_in,
&total_compressed);
if (ret)
@@ -1090,7 +1071,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode,
&wbc, false);
wbc_detach_inode(&wbc);
if (ret < 0) {
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
if (locked_folio)
btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
start, async_extent->ram_size);
@@ -1116,6 +1096,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
struct extent_state *cached = NULL;
struct extent_map *em;
int ret = 0;
+ bool free_pages = false;
u64 start = async_extent->start;
u64 end = async_extent->start + async_extent->ram_size - 1;
@@ -1136,7 +1117,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
}
if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
+ ASSERT(!async_extent->folios);
+ ASSERT(async_extent->nr_folios == 0);
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
@@ -1152,10 +1136,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
* fall back to uncompressed.
*/
submit_uncompressed_range(inode, async_extent, locked_folio);
+ free_pages = true;
goto done;
}
- lock_extent(io_tree, start, end, &cached);
+ btrfs_lock_extent(io_tree, start, end, &cached);
/* Here we're doing allocation and writeback of the compressed pages */
file_extent.disk_bytenr = ins.objectid;
@@ -1170,10 +1155,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
ret = PTR_ERR(em);
goto out_free_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_COMPRESSED);
+ 1U << BTRFS_ORDERED_COMPRESSED);
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -1193,12 +1178,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk,
done:
if (async_chunk->blkcg_css)
kthread_associate_blkcg(NULL);
+ if (free_pages)
+ free_async_extent_pages(async_extent);
kfree(async_extent);
return;
out_free_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
extent_clear_unlock_delalloc(inode, start, end,
NULL, &cached,
@@ -1225,7 +1212,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
u64 alloc_hint = 0;
read_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, start, num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
if (em) {
/*
* if block start isn't an actual block number then find the
@@ -1233,15 +1220,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* block is also bogus then just don't worry about it.
*/
if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
- free_extent_map(em);
- em = search_extent_mapping(em_tree, 0, 0);
+ btrfs_free_extent_map(em);
+ em = btrfs_search_extent_mapping(em_tree, 0, 0);
if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- alloc_hint = extent_map_block_start(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
if (em)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
} else {
- alloc_hint = extent_map_block_start(em);
- free_extent_map(em);
+ alloc_hint = btrfs_extent_map_block_start(em);
+ btrfs_free_extent_map(em);
}
}
read_unlock(&em_tree->lock);
@@ -1272,10 +1259,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
* - Else all pages except for @locked_folio are unlocked.
*
* When a failure happens in the second or later iteration of the
- * while-loop, the ordered extents created in previous iterations are kept
- * intact. So, the caller must clean them up by calling
- * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
- * example.
+ * while-loop, the ordered extents created in previous iterations are cleaned up.
*/
static noinline int cow_file_range(struct btrfs_inode *inode,
struct folio *locked_folio, u64 start,
@@ -1382,8 +1366,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
continue;
}
if (done_offset) {
- *done_offset = start - 1;
- return 0;
+ /*
+ * Move @end to the end of the processed range,
+ * and exit the loop to unlock the processed extents.
+ */
+ end = start - 1;
+ ret = 0;
+ break;
}
ret = -ENOSPC;
}
@@ -1402,24 +1391,24 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* Locked range will be released either during error clean up or
* after the whole range is finished.
*/
- lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
- &cached);
+ btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
+ &cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- 1 << BTRFS_ORDERED_REGULAR);
+ 1U << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
- unlock_extent(&inode->io_tree, start,
- start + cur_alloc_size - 1, &cached);
+ btrfs_unlock_extent(&inode->io_tree, start,
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1474,7 +1463,7 @@ out_drop_extent_cache:
btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_unlock:
/*
* Now, we have three regions to clean up:
@@ -1487,11 +1476,9 @@ out_unlock:
/*
* For the range (1). We have already instantiated the ordered extents
- * for this region. They are cleaned up by
- * btrfs_cleanup_ordered_extents() in e.g,
- * btrfs_run_delalloc_range().
+ * for this region, thus we need to cleanup those ordered extents.
* EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
- * are also handled by the cleanup function.
+ * are also handled by the ordered extents cleanup.
*
* So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
* finish the writeback of the involved folios, which will be never submitted.
@@ -1502,6 +1489,8 @@ out_unlock:
if (!locked_folio)
mapping_set_error(inode->vfs_inode.i_mapping, ret);
+
+ btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
extent_clear_unlock_delalloc(inode, orig_start, start - 1,
locked_folio, NULL, clear_bits, page_ops);
}
@@ -1583,8 +1572,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_
PAGE_SHIFT;
while (!list_empty(&async_chunk->extents)) {
- async_extent = list_entry(async_chunk->extents.next,
- struct async_extent, list);
+ async_extent = list_first_entry(&async_chunk->extents,
+ struct async_extent, list);
list_del(&async_extent->list);
submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
}
@@ -1754,9 +1743,9 @@ static int fallback_to_cow(struct btrfs_inode *inode,
* group that contains that extent to RO mode and therefore force COW
* when starting writeback.
*/
- lock_extent(io_tree, start, end, &cached_state);
- count = count_range_bits(io_tree, &range_start, end, range_bytes,
- EXTENT_NORESERVE, 0, NULL);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
+ count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0, NULL);
if (count > 0 || is_space_ino || is_reloc_ino) {
u64 bytes = count;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1770,10 +1759,10 @@ static int fallback_to_cow(struct btrfs_inode *inode,
spin_unlock(&sinfo->lock);
if (count > 0)
- clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
- NULL);
+ btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ &cached_state);
}
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
/*
* Don't try to create inline extents, as a mix of inline extent that
@@ -1971,6 +1960,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode,
mapping_set_error(mapping, error);
}
+static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
+ struct extent_state **cached,
+ struct can_nocow_file_extent_args *nocow_args,
+ u64 file_pos, bool is_prealloc)
+{
+ struct btrfs_ordered_extent *ordered;
+ u64 len = nocow_args->file_extent.num_bytes;
+ u64 end = file_pos + len - 1;
+ int ret = 0;
+
+ btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
+
+ if (is_prealloc) {
+ struct extent_map *em;
+
+ em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
+ BTRFS_ORDERED_PREALLOC);
+ if (IS_ERR(em)) {
+ btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(em);
+ }
+ btrfs_free_extent_map(em);
+ }
+
+ ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
+ is_prealloc
+ ? (1U << BTRFS_ORDERED_PREALLOC)
+ : (1U << BTRFS_ORDERED_NOCOW));
+ if (IS_ERR(ordered)) {
+ if (is_prealloc)
+ btrfs_drop_extent_map_range(inode, file_pos, end, false);
+ btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached);
+ return PTR_ERR(ordered);
+ }
+
+ if (btrfs_is_data_reloc_root(inode->root))
+ /*
+ * Errors are handled later, as we must prevent
+ * extent_clear_unlock_delalloc() in error handler from freeing
+ * metadata of the created ordered extent.
+ */
+ ret = btrfs_reloc_clone_csums(ordered);
+ btrfs_put_ordered_extent(ordered);
+
+ extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
+ EXTENT_LOCKED | EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_ORDERED);
+ /*
+ * On error, we need to cleanup the ordered extents we created.
+ *
+ * We do not clear the folio Dirty flags because they are set and
+ * cleaered by the caller.
+ */
+ if (ret < 0)
+ btrfs_cleanup_ordered_extents(inode, file_pos, len);
+ return ret;
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -2015,15 +2063,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
while (cur_offset <= end) {
struct btrfs_block_group *nocow_bg = NULL;
- struct btrfs_ordered_extent *ordered;
struct btrfs_key found_key;
struct btrfs_file_extent_item *fi;
struct extent_buffer *leaf;
struct extent_state *cached_state = NULL;
u64 extent_end;
- u64 nocow_end;
int extent_type;
- bool is_prealloc;
ret = btrfs_lookup_file_extent(NULL, root, path, ino,
cur_offset, 0);
@@ -2078,12 +2123,13 @@ next_slot:
/*
* If the found extent starts after requested offset, then
- * adjust extent_end to be right before this extent begins
+ * adjust cur_offset to be right before this extent begins.
*/
if (found_key.offset > cur_offset) {
- extent_end = found_key.offset;
- extent_type = 0;
- goto must_cow;
+ if (cow_start == (u64)-1)
+ cow_start = cur_offset;
+ cur_offset = found_key.offset;
+ goto next_slot;
}
/*
@@ -2149,75 +2195,21 @@ must_cow:
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start,
found_key.offset - 1);
- cow_start = (u64)-1;
if (ret) {
cow_end = found_key.offset - 1;
btrfs_dec_nocow_writers(nocow_bg);
goto error;
}
+ cow_start = (u64)-1;
}
- nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
- lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state);
-
- is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
- if (is_prealloc) {
- struct extent_map *em;
-
- em = btrfs_create_io_em(inode, cur_offset,
- &nocow_args.file_extent,
- BTRFS_ORDERED_PREALLOC);
- if (IS_ERR(em)) {
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- btrfs_dec_nocow_writers(nocow_bg);
- ret = PTR_ERR(em);
- goto error;
- }
- free_extent_map(em);
- }
-
- ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
- &nocow_args.file_extent,
- is_prealloc
- ? (1 << BTRFS_ORDERED_PREALLOC)
- : (1 << BTRFS_ORDERED_NOCOW));
+ ret = nocow_one_range(inode, locked_folio, &cached_state,
+ &nocow_args, cur_offset,
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC);
btrfs_dec_nocow_writers(nocow_bg);
- if (IS_ERR(ordered)) {
- if (is_prealloc) {
- btrfs_drop_extent_map_range(inode, cur_offset,
- nocow_end, false);
- }
- unlock_extent(&inode->io_tree, cur_offset,
- nocow_end, &cached_state);
- ret = PTR_ERR(ordered);
+ if (ret < 0)
goto error;
- }
-
- if (btrfs_is_data_reloc_root(root))
- /*
- * Error handled later, as we must prevent
- * extent_clear_unlock_delalloc() in error handler
- * from freeing metadata of created ordered extent.
- */
- ret = btrfs_reloc_clone_csums(ordered);
- btrfs_put_ordered_extent(ordered);
-
- extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
- locked_folio, &cached_state,
- EXTENT_LOCKED | EXTENT_DELALLOC |
- EXTENT_CLEAR_DATA_RESV,
- PAGE_UNLOCK | PAGE_SET_ORDERED);
-
cur_offset = extent_end;
-
- /*
- * btrfs_reloc_clone_csums() error, now we're OK to call error
- * handler, as metadata for created ordered extent will only
- * be freed by btrfs_finish_ordered_io().
- */
- if (ret)
- goto error;
}
btrfs_release_path(path);
@@ -2226,11 +2218,11 @@ must_cow:
if (cow_start != (u64)-1) {
ret = fallback_to_cow(inode, locked_folio, cow_start, end);
- cow_start = (u64)-1;
if (ret) {
cow_end = end;
goto error;
}
+ cow_start = (u64)-1;
}
btrfs_free_path(path);
@@ -2244,27 +2236,44 @@ error:
* start cur_offset end
* |/////////////| |
*
+ * In this case, cow_start should be (u64)-1.
+ *
* For range [start, cur_offset) the folios are already unlocked (except
* @locked_folio), EXTENT_DELALLOC already removed.
- * Only need to clear the dirty flag as they will never be submitted.
- * Ordered extent and extent maps are handled by
- * btrfs_mark_ordered_io_finished() inside run_delalloc_range().
+ * Need to clear the dirty flags and finish the ordered extents.
+ *
+ * 2) Failed with error before calling fallback_to_cow()
+ *
+ * start cow_start end
+ * |/////////////| |
+ *
+ * In this case, only @cow_start is set, @cur_offset is between
+ * [cow_start, end)
*
- * 2) Failed with error from fallback_to_cow()
- * start cur_offset cow_end end
+ * It's mostly the same as case 1), just replace @cur_offset with
+ * @cow_start.
+ *
+ * 3) Failed with error from fallback_to_cow()
+ *
+ * start cow_start cow_end end
* |/////////////|-----------| |
*
- * For range [start, cur_offset) it's the same as case 1).
- * But for range [cur_offset, cow_end), the folios have dirty flag
- * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range().
+ * In this case, both @cow_start and @cow_end is set.
*
- * Thus we should not call extent_clear_unlock_delalloc() on range
- * [cur_offset, cow_end), as the folios are already unlocked.
+ * For range [start, cow_start) it's the same as case 1).
+ * But for range [cow_start, cow_end), all the cleanup is handled by
+ * cow_file_range(), we should not touch anything in that range.
*
- * So clear the folio dirty flags for [start, cur_offset) first.
+ * So for all above cases, if @cow_start is set, cleanup ordered extents
+ * for range [start, @cow_start), other wise cleanup range [start, @cur_offset).
*/
- if (cur_offset > start)
+ if (cow_start != (u64)-1)
+ cur_offset = cow_start;
+
+ if (cur_offset > start) {
+ btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
+ }
/*
* If an error happened while a COW region is outstanding, cur_offset
@@ -2281,7 +2290,7 @@ error:
if (cur_offset < end) {
struct extent_state *cached = NULL;
- lock_extent(&inode->io_tree, cur_offset, end, &cached);
+ btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached);
extent_clear_unlock_delalloc(inode, cur_offset, end,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC |
@@ -2303,7 +2312,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
if (inode->defrag_bytes &&
- test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
+ btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
return false;
return true;
}
@@ -2324,12 +2333,11 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
* The range must cover part of the @locked_folio, or a return of 1
* can confuse the caller.
*/
- ASSERT(!(end <= folio_pos(locked_folio) ||
- start >= folio_pos(locked_folio) + folio_size(locked_folio)));
+ ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio)));
if (should_nocow(inode, start, end)) {
ret = run_delalloc_nocow(inode, locked_folio, start, end);
- goto out;
+ return ret;
}
if (btrfs_inode_can_compress(inode) &&
@@ -2343,10 +2351,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol
else
ret = cow_file_range(inode, locked_folio, start, end, NULL,
false, false);
-
-out:
- if (ret < 0)
- btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
return ret;
}
@@ -2596,7 +2600,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
!btrfs_is_free_space_inode(inode) &&
!(state->state & EXTENT_NORESERVE) &&
(bits & EXTENT_CLEAR_DATA_RESV))
- btrfs_free_reserved_data_space_noquota(fs_info, len);
+ btrfs_free_reserved_data_space_noquota(inode, len);
percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
fs_info->delalloc_batch);
@@ -2680,12 +2684,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
if (em_len > search_len)
em_len = search_len;
- ret = set_extent_bit(&inode->io_tree, search_start,
- search_start + em_len - 1,
- EXTENT_DELALLOC_NEW, cached_state);
+ ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
+ search_start + em_len - 1,
+ EXTENT_DELALLOC_NEW, cached_state);
next:
- search_start = extent_map_end(em);
- free_extent_map(em);
+ search_start = btrfs_extent_map_end(em);
+ btrfs_free_extent_map(em);
if (ret)
return ret;
}
@@ -2715,8 +2719,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
return ret;
}
- return set_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC | extra_bits, cached_state);
+ return btrfs_set_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC | extra_bits, cached_state);
}
/* see btrfs_writepage_start_hook for details on why this is required */
@@ -2737,7 +2741,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct btrfs_inode *inode = fixup->inode;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 page_start = folio_pos(folio);
- u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
+ u64 page_end = folio_end(folio) - 1;
int ret = 0;
bool free_delalloc_space = true;
@@ -2791,7 +2795,7 @@ again:
if (ret)
goto out_page;
- lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
/* already ordered? We're done */
if (folio_test_ordered(folio))
@@ -2799,8 +2803,8 @@ again:
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
if (ordered) {
- unlock_extent(&inode->io_tree, page_start, page_end,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
+ &cached_state);
folio_unlock(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
@@ -2826,7 +2830,7 @@ out_reserved:
if (free_delalloc_space)
btrfs_delalloc_release_space(inode, data_reserved, page_start,
PAGE_SIZE, true);
- unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
out_page:
if (ret) {
/*
@@ -2873,6 +2877,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
return 0;
/*
+ * For experimental build, we error out instead of EAGAIN.
+ *
+ * We should not hit such out-of-band dirty folios anymore.
+ */
+ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
+ DEBUG_WARN();
+ btrfs_err_rl(fs_info,
+ "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
+ btrfs_root_id(BTRFS_I(inode)->root),
+ btrfs_ino(BTRFS_I(inode)),
+ folio_pos(folio));
+ return -EUCLEAN;
+ }
+
+ /*
* folio_checked is set below when we create a fixup worker for this
* folio, don't try to create another one if we're already
* folio_test_checked.
@@ -2891,7 +2910,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio)
* We are already holding a reference to this inode from
* write_cache_pages. We need to hold it because the space reservation
* takes place outside of the folio lock, and we can't trust
- * page->mapping outside of the folio lock.
+ * folio->mapping outside of the folio lock.
*/
ihold(inode);
btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
@@ -2912,7 +2931,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
{
struct btrfs_root *root = inode->root;
const u64 sectorsize = root->fs_info->sectorsize;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key ins;
u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
@@ -2947,8 +2966,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
if (!drop_args.extent_inserted) {
ins.objectid = btrfs_ino(inode);
- ins.offset = file_pos;
ins.type = BTRFS_EXTENT_DATA_KEY;
+ ins.offset = file_pos;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*stack_fi));
@@ -2983,8 +3002,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
ins.objectid = disk_bytenr;
- ins.offset = disk_num_bytes;
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = disk_num_bytes;
ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
if (ret)
@@ -2994,8 +3013,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
file_pos - offset,
qgroup_reserved, &ins);
out:
- btrfs_free_path(path);
-
return ret;
}
@@ -3111,8 +3128,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
* depending on their current state).
*/
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
+ clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
+ btrfs_lock_extent_bits(io_tree, start, end,
+ EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
+ &cached_state);
}
if (freespace_inode)
@@ -3176,8 +3195,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- ret = unpin_extent_cache(inode, ordered_extent->file_offset,
- ordered_extent->num_bytes, trans->transid);
+ ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
+ ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3196,9 +3215,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
*/
if ((clear_bits & EXTENT_DELALLOC_NEW) &&
!test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
- clear_extent_bit(&inode->io_tree, start, end,
- EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end,
+ EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
+ &cached_state);
btrfs_inode_safe_disk_i_size_write(inode, 0);
ret = btrfs_update_inode_fallback(trans, inode);
@@ -3207,15 +3226,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
out:
- clear_extent_bit(&inode->io_tree, start, end, clear_bits,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
+ &cached_state);
if (trans)
btrfs_end_transaction(trans);
if (ret || truncated) {
- u64 unwritten_start = start;
-
/*
* If we failed to finish this ordered extent for any reason we
* need to make sure BTRFS_ORDERED_IOERR is set on the ordered
@@ -3227,10 +3244,6 @@ out:
if (ret)
btrfs_mark_ordered_extent_error(ordered_extent);
- if (truncated)
- unwritten_start += logical_len;
- clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
-
/*
* Drop extent maps for the part of the extent we didn't write.
*
@@ -3245,9 +3258,15 @@ out:
* we don't mess with the extent map tree in the NOCOW case, but
* for now simply skip this if we are the free space inode.
*/
- if (!btrfs_is_free_space_inode(inode))
+ if (!btrfs_is_free_space_inode(inode)) {
+ u64 unwritten_start = start;
+
+ if (truncated)
+ unwritten_start += logical_len;
+
btrfs_drop_extent_map_range(inode, unwritten_start,
end, false);
+ }
/*
* If the ordered extent had an IOERR or something else went
@@ -3274,7 +3293,7 @@ out:
NULL);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
- ordered_extent->disk_num_bytes, 1);
+ ordered_extent->disk_num_bytes, true);
/*
* Actually free the qgroup rsv which was released when
* the ordered extent was created.
@@ -3311,20 +3330,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
/*
* Verify the checksum for a single sector without any extra action that depend
* on the type of I/O.
+ *
+ * @kaddr must be a properly kmapped address.
*/
-int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
- u32 pgoff, u8 *csum, const u8 * const csum_expected)
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum,
+ const u8 * const csum_expected)
{
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- char *kaddr;
-
- ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
shash->tfm = fs_info->csum_shash;
-
- kaddr = kmap_local_page(page) + pgoff;
crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
- kunmap_local(kaddr);
if (memcmp(csum, csum_expected, fs_info->csum_size))
return -EIO;
@@ -3353,6 +3368,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
u64 end = file_offset + bv->bv_len - 1;
u8 *csum_expected;
u8 csum[BTRFS_CSUM_SIZE];
+ void *kaddr;
ASSERT(bv->bv_len == fs_info->sectorsize);
@@ -3360,19 +3376,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
return true;
if (btrfs_is_data_reloc_root(inode->root) &&
- test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
- NULL)) {
+ btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
+ NULL)) {
/* Skip the range without csum for data reloc inode */
- clear_extent_bits(&inode->io_tree, file_offset, end,
- EXTENT_NODATASUM);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
+ EXTENT_NODATASUM, NULL);
return true;
}
csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
fs_info->csum_size;
- if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
- csum_expected))
+ kaddr = bvec_kmap_local(bv);
+ if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) {
+ kunmap_local(kaddr);
goto zeroit;
+ }
+ kunmap_local(kaddr);
return true;
zeroit:
@@ -3402,6 +3421,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode)
if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
return;
+ WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
atomic_inc(&fs_info->nr_delayed_iputs);
/*
* Need to be irq safe here because we can be called from either an irq
@@ -3518,11 +3538,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
int btrfs_orphan_cleanup(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key, found_key;
struct btrfs_trans_handle *trans;
- struct inode *inode;
u64 last_objectid = 0;
int ret = 0, nr_unlink = 0;
@@ -3541,6 +3560,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -3664,10 +3685,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* deleted but wasn't. The inode number may have been reused,
* but either way, we can delete the orphan item.
*/
- if (!inode || inode->i_nlink) {
+ if (!inode || inode->vfs_inode.i_nlink) {
if (inode) {
- ret = btrfs_drop_verity_items(BTRFS_I(inode));
- iput(inode);
+ ret = btrfs_drop_verity_items(inode);
+ iput(&inode->vfs_inode);
inode = NULL;
if (ret)
goto out;
@@ -3690,7 +3711,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
nr_unlink++;
/* this will do delete_inode and everything for us */
- iput(inode);
+ iput(&inode->vfs_inode);
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3707,19 +3728,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
- btrfs_free_path(path);
return ret;
}
/*
- * very simple check to peek ahead in the leaf looking for xattrs. If we
- * don't find any xattrs, we know there can't be any acls.
+ * Look ahead in the leaf for xattrs. If we don't find any then we know there
+ * can't be any ACLs.
*
- * slot is the slot the inode is in, objectid is the objectid of the inode
+ * @leaf: the eb leaf where to search
+ * @slot: the slot the inode is in
+ * @objectid: the objectid of the inode
+ *
+ * Return true if there is xattr/ACL, false otherwise.
*/
-static noinline int acls_after_inode_item(struct extent_buffer *leaf,
- int slot, u64 objectid,
- int *first_xattr_slot)
+static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
+ int slot, u64 objectid,
+ int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
@@ -3739,45 +3763,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
- /* we found a different objectid, there must not be acls */
+ /* We found a different objectid, there must be no ACLs. */
if (found_key.objectid != objectid)
- return 0;
+ return false;
- /* we found an xattr, assume we've got an acl */
+ /* We found an xattr, assume we've got an ACL. */
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
if (found_key.offset == xattr_access ||
found_key.offset == xattr_default)
- return 1;
+ return true;
}
/*
- * we found a key greater than an xattr key, there can't
- * be any acls later on
+ * We found a key greater than an xattr key, there can't be any
+ * ACLs later on.
*/
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
- return 0;
+ return false;
slot++;
scanned++;
/*
- * it goes inode, inode backrefs, xattrs, extents,
- * so if there are a ton of hard links to an inode there can
- * be a lot of backrefs. Don't waste time searching too hard,
- * this is just an optimization
+ * The item order goes like:
+ * - inode
+ * - inode backrefs
+ * - xattrs
+ * - extents,
+ *
+ * so if there are lots of hard links to an inode there can be
+ * a lot of backrefs. Don't waste time searching too hard,
+ * this is just an optimization.
*/
if (scanned >= 8)
break;
}
- /* we hit the end of the leaf before we found an xattr or
- * something larger than an xattr. We have to assume the inode
- * has acls
+ /*
+ * We hit the end of the leaf before we found an xattr or something
+ * larger than an xattr. We have to assume the inode has ACLs.
*/
if (*first_xattr_slot == -1)
*first_xattr_slot = slot;
- return 1;
+ return true;
}
static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
@@ -3797,7 +3826,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
if (!inode->file_extent_tree)
return -ENOMEM;
- extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
+ btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
/* Lockdep class is set only for the file extent tree. */
lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
@@ -3840,12 +3870,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
*
* On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
+static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct inode *vfs_inode = &inode->vfs_inode;
struct btrfs_key location;
unsigned long ptr;
int maybe_acls;
@@ -3854,7 +3885,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
bool filled = false;
int first_xattr_slot;
- ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
+ ret = btrfs_init_file_extent_tree(inode);
if (ret)
goto out;
@@ -3864,7 +3895,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
ASSERT(path);
- btrfs_get_inode_key(BTRFS_I(inode), &location);
+ btrfs_get_inode_key(inode, &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
@@ -3884,41 +3915,42 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
- inode->i_mode = btrfs_inode_mode(leaf, inode_item);
- set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
- i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
- i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
- btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
- btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
- round_up(i_size_read(inode), fs_info->sectorsize));
-
- inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime),
+ vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
+ set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
+ i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
+ i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
+ btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
+ btrfs_inode_set_file_extent_range(inode, 0,
+ round_up(i_size_read(vfs_inode), fs_info->sectorsize));
+
+ inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
btrfs_timespec_nsec(leaf, &inode_item->atime));
- inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
+ inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
btrfs_timespec_nsec(leaf, &inode_item->mtime));
- inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
+ inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
btrfs_timespec_nsec(leaf, &inode_item->ctime));
- BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
- BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
+ inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
+ inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
- inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
- BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
- BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+ inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
+ inode->generation = btrfs_inode_generation(leaf, inode_item);
+ inode->last_trans = btrfs_inode_transid(leaf, inode_item);
- inode_set_iversion_queried(inode,
- btrfs_inode_sequence(leaf, inode_item));
- inode->i_generation = BTRFS_I(inode)->generation;
- inode->i_rdev = 0;
+ inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
+ vfs_inode->i_generation = inode->generation;
+ vfs_inode->i_rdev = 0;
rdev = btrfs_inode_rdev(leaf, inode_item);
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(vfs_inode->i_mode))
+ inode->index_cnt = (u64)-1;
btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
- &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
+ &inode->flags, &inode->ro_flags);
+ btrfs_update_inode_mapping_flags(inode);
+ btrfs_set_inode_mapping_order(inode);
cache_index:
/*
@@ -3930,9 +3962,8 @@ cache_index:
* This is required for both inode re-read from disk and delayed inode
* in the delayed_nodes xarray.
*/
- if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ if (inode->last_trans == btrfs_get_fs_generation(fs_info))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
/*
* We don't persist the id of the transaction where an unlink operation
@@ -3961,7 +3992,7 @@ cache_index:
* transaction commits on fsync if our inode is a directory, or if our
* inode is not a directory, logging its parent unnecessarily.
*/
- BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_unlink_trans = inode->last_trans;
/*
* Same logic as for last_unlink_trans. We don't persist the generation
@@ -3969,15 +4000,15 @@ cache_index:
* operation, so after eviction and reloading the inode we must be
* pessimistic and assume the last transaction that modified the inode.
*/
- BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
+ inode->last_reflink_trans = inode->last_trans;
path->slots[0]++;
- if (inode->i_nlink != 1 ||
+ if (vfs_inode->i_nlink != 1 ||
path->slots[0] >= btrfs_header_nritems(leaf))
goto cache_acl;
btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
- if (location.objectid != btrfs_ino(BTRFS_I(inode)))
+ if (location.objectid != btrfs_ino(inode))
goto cache_acl;
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3985,13 +4016,12 @@ cache_index:
struct btrfs_inode_ref *ref;
ref = (struct btrfs_inode_ref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ inode->dir_index = btrfs_inode_ref_index(leaf, ref);
} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
struct btrfs_inode_extref *extref;
extref = (struct btrfs_inode_extref *)ptr;
- BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
- extref);
+ inode->dir_index = btrfs_inode_extref_index(leaf, extref);
}
cache_acl:
/*
@@ -3999,50 +4029,49 @@ cache_acl:
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
- btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
+ btrfs_ino(inode), &first_xattr_slot);
if (first_xattr_slot != -1) {
path->slots[0] = first_xattr_slot;
ret = btrfs_load_inode_props(inode, path);
if (ret)
btrfs_err(fs_info,
"error loading props for ino %llu (root %llu): %d",
- btrfs_ino(BTRFS_I(inode)),
- btrfs_root_id(root), ret);
+ btrfs_ino(inode), btrfs_root_id(root), ret);
}
if (!maybe_acls)
- cache_no_acl(inode);
+ cache_no_acl(vfs_inode);
- switch (inode->i_mode & S_IFMT) {
+ switch (vfs_inode->i_mode & S_IFMT) {
case S_IFREG:
- inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_fop = &btrfs_file_operations;
- inode->i_op = &btrfs_file_inode_operations;
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_fop = &btrfs_file_operations;
+ vfs_inode->i_op = &btrfs_file_inode_operations;
break;
case S_IFDIR:
- inode->i_fop = &btrfs_dir_file_operations;
- inode->i_op = &btrfs_dir_inode_operations;
+ vfs_inode->i_fop = &btrfs_dir_file_operations;
+ vfs_inode->i_op = &btrfs_dir_inode_operations;
break;
case S_IFLNK:
- inode->i_op = &btrfs_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_aops;
+ vfs_inode->i_op = &btrfs_symlink_inode_operations;
+ inode_nohighmem(vfs_inode);
+ vfs_inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
- inode->i_op = &btrfs_special_inode_operations;
- init_special_inode(inode, inode->i_mode, rdev);
+ vfs_inode->i_op = &btrfs_special_inode_operations;
+ init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
break;
}
btrfs_sync_inode_flags_to_i_flags(inode);
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ ret = btrfs_add_inode_to_root(inode, true);
if (ret)
goto out;
return 0;
out:
- iget_failed(inode);
+ iget_failed(vfs_inode);
return ret;
}
@@ -4054,45 +4083,35 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *item,
struct inode *inode)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
-
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
-
- btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
- btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
-
- btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
+
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
+
+ btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
/*
@@ -4102,7 +4121,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode)
{
struct btrfs_inode_item *inode_item;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_key key;
int ret;
@@ -4116,7 +4135,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
if (ret) {
if (ret > 0)
ret = -ENOENT;
- goto failed;
+ return ret;
}
leaf = path->nodes[0];
@@ -4125,10 +4144,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
btrfs_set_inode_last_trans(trans, inode);
- ret = 0;
-failed:
- btrfs_free_path(path);
- return ret;
+ return 0;
}
/*
@@ -4173,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
+static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
+{
+ struct timespec64 now;
+
+ /*
+ * If we are replaying a log tree, we do not want to update the mtime
+ * and ctime of the parent directory with the current time, since the
+ * log replay procedure is responsible for setting them to their correct
+ * values (the ones it had when the fsync was done).
+ */
+ if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
+ return;
+
+ now = inode_set_ctime_current(&dir->vfs_inode);
+ inode_set_mtime_to_ts(&dir->vfs_inode, now);
+}
+
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It remove a link in a directory with a given name, and
@@ -4194,20 +4227,22 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
u64 dir_ino = btrfs_ino(dir);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
if (IS_ERR_OR_NULL(di)) {
- ret = di ? PTR_ERR(di) : -ENOENT;
- goto err;
+ btrfs_free_path(path);
+ return di ? PTR_ERR(di) : -ENOENT;
}
ret = btrfs_delete_one_dir_name(trans, root, path, di);
+ /*
+ * Down the call chains below we'll also need to allocate a path, so no
+ * need to hold on to this one for longer than necessary.
+ */
+ btrfs_free_path(path);
if (ret)
- goto err;
- btrfs_release_path(path);
+ return ret;
/*
* If we don't have dir index, we have to get it by looking up
@@ -4229,11 +4264,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
if (ret) {
- btrfs_info(fs_info,
- "failed to delete reference to %.*s, inode %llu parent %llu",
- name->len, name->name, ino, dir_ino);
+ btrfs_crit(fs_info,
+ "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
+ name->len, name->name, btrfs_root_id(root), ino, dir_ino);
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
skip_backref:
if (rename_ctx)
@@ -4242,7 +4277,7 @@ skip_backref:
ret = btrfs_delete_delayed_dir_index(trans, dir, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto err;
+ return ret;
}
/*
@@ -4266,19 +4301,14 @@ skip_backref:
* holding.
*/
btrfs_run_delayed_iput(fs_info, inode);
-err:
- btrfs_free_path(path);
- if (ret)
- goto out;
btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
- inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
- ret = btrfs_update_inode(trans, dir);
-out:
- return ret;
+ update_time_after_link_or_unlink(dir);
+
+ return btrfs_update_inode(trans, dir);
}
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -4462,7 +4492,7 @@ out:
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_dir_item *di;
struct btrfs_key key;
struct fscrypt_str name = FSTR_INIT("default", 7);
@@ -4484,7 +4514,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
btrfs_err(fs_info,
"deleting default subvolume %llu is not allowed",
key.objectid);
- goto out;
+ return ret;
}
btrfs_release_path(path);
}
@@ -4495,14 +4525,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 0) {
/*
* Key with offset -1 found, there would have to exist a root
* with such id, but this is out of valid range.
*/
- ret = -EUCLEAN;
- goto out;
+ return -EUCLEAN;
}
ret = 0;
@@ -4512,8 +4541,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
-out:
- btrfs_free_path(path);
+
return ret;
}
@@ -4685,68 +4713,68 @@ out_up_write:
return ret;
}
-static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_inode *dir = BTRFS_I(vfs_dir);
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
struct btrfs_trans_handle *trans;
- u64 last_unlink_trans;
struct fscrypt_name fname;
- if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
+ if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
- if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
btrfs_err(fs_info,
"extent tree v2 doesn't support snapshot deletion yet");
return -EOPNOTSUPP;
}
- return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
+ return btrfs_delete_subvolume(dir, dentry);
}
- ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
+ ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
if (ret)
return ret;
/* This needs to handle no-key deletions later on */
- trans = __unlink_start_trans(BTRFS_I(dir));
+ trans = __unlink_start_trans(dir);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_notrans;
}
- if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to its
+ * parent directory. This is to prevent an unrecoverable log tree in the
+ * case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ *
+ * This is because we can't unlink other roots when replaying the dir
+ * deletes for directory foo.
+ */
+ if (inode->last_unlink_trans >= trans->transid)
+ btrfs_record_snapshot_destroy(trans, dir);
+
+ if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
if (ret)
goto out;
- last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
-
/* now the directory is empty */
- ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
- &fname.disk_name);
- if (!ret) {
- btrfs_i_size_write(BTRFS_I(inode), 0);
- /*
- * Propagate the last_unlink_trans value of the deleted dir to
- * its parent directory. This is to prevent an unrecoverable
- * log tree in the case we do something like this:
- * 1) create dir foo
- * 2) create snapshot under dir foo
- * 3) delete the snapshot
- * 4) rmdir foo
- * 5) mkdir foo
- * 6) fsync foo or some file inside foo
- */
- if (last_unlink_trans >= trans->transid)
- BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
- }
+ ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
+ if (!ret)
+ btrfs_i_size_write(inode, 0);
out:
btrfs_end_transaction(trans);
out_notrans:
@@ -4756,20 +4784,80 @@ out_notrans:
return ret;
}
+static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
+{
+ ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
+ blockstart, blocksize);
+
+ if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1)
+ return true;
+ return false;
+}
+
+static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
+{
+ const pgoff_t index = (start >> PAGE_SHIFT);
+ struct address_space *mapping = inode->vfs_inode.i_mapping;
+ struct folio *folio;
+ u64 zero_start;
+ u64 zero_end;
+ int ret = 0;
+
+again:
+ folio = filemap_lock_folio(mapping, index);
+ /* No folio present. */
+ if (IS_ERR(folio))
+ return 0;
+
+ if (!folio_test_uptodate(folio)) {
+ ret = btrfs_read_folio(NULL, folio);
+ folio_lock(folio);
+ if (folio->mapping != mapping) {
+ folio_unlock(folio);
+ folio_put(folio);
+ goto again;
+ }
+ if (!folio_test_uptodate(folio)) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+ }
+ folio_wait_writeback(folio);
+
+ /*
+ * We do not need to lock extents nor wait for OE, as it's already
+ * beyond EOF.
+ */
+
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = folio_end(folio);
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start);
+
+out_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
/*
- * Read, zero a chunk and write a block.
+ * Handle the truncation of a fs block.
+ *
+ * @inode - inode that we're zeroing
+ * @offset - the file offset of the block to truncate
+ * The value must be inside [@start, @end], and the function will do
+ * extra checks if the block that covers @offset needs to be zeroed.
+ * @start - the start file offset of the range we want to zero
+ * @end - the end (inclusive) file offset of the range we want to zero.
*
- * @inode - inode that we're zeroing
- * @from - the offset to start zeroing
- * @len - the length to zero, 0 to zero the entire range respective to the
- * offset
- * @front - zero up to the offset instead of from the offset on
+ * If the range is not block aligned, read out the folio that covers @offset,
+ * and if needed zero blocks that are inside the folio and covered by [@start, @end).
+ * If @start or @end + 1 lands inside a block, that block will be marked dirty
+ * for writeback.
*
- * This will find the block for the "from" offset and cow the block and zero the
- * part we want to zero. This is used with truncate and hole punching.
+ * This is utilized by hole punch, zero range, file expansion.
*/
-int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
- int front)
+int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct address_space *mapping = inode->vfs_inode.i_mapping;
@@ -4779,27 +4867,66 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
struct extent_changeset *data_reserved = NULL;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (blocksize - 1);
+ pgoff_t index = (offset >> PAGE_SHIFT);
struct folio *folio;
gfp_t mask = btrfs_alloc_write_mask(mapping);
- size_t write_bytes = blocksize;
int ret = 0;
+ const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
+ blocksize);
+ const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
+ blocksize);
+ bool need_truncate_head = false;
+ bool need_truncate_tail = false;
+ u64 zero_start;
+ u64 zero_end;
u64 block_start;
u64 block_end;
- if (IS_ALIGNED(offset, blocksize) &&
- (!len || IS_ALIGNED(len, blocksize)))
+ /* @offset should be inside the range. */
+ ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
+ offset, start, end);
+
+ /* The range is aligned at both ends. */
+ if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
+ /*
+ * For block size < page size case, we may have polluted blocks
+ * beyond EOF. So we also need to zero them out.
+ */
+ if (end == (u64)-1 && blocksize < PAGE_SIZE)
+ ret = truncate_block_zero_beyond_eof(inode, start);
+ goto out;
+ }
+
+ /*
+ * @offset may not be inside the head nor tail block. In that case we
+ * don't need to do anything.
+ */
+ if (!in_head_block && !in_tail_block)
goto out;
- block_start = round_down(from, blocksize);
+ /*
+ * Skip the truncatioin if the range in the target block is already aligned.
+ * The seemingly complex check will also handle the same block case.
+ */
+ if (in_head_block && !IS_ALIGNED(start, blocksize))
+ need_truncate_head = true;
+ if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
+ need_truncate_tail = true;
+ if (!need_truncate_head && !need_truncate_tail)
+ goto out;
+
+ block_start = round_down(offset, blocksize);
block_end = block_start + blocksize - 1;
ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
blocksize, false);
if (ret < 0) {
+ size_t write_bytes = blocksize;
+
if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
- /* For nocow case, no need to reserve data space */
+ /* For nocow case, no need to reserve data space. */
+ ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
+ write_bytes, blocksize);
only_release_metadata = true;
} else {
goto out;
@@ -4816,10 +4943,13 @@ again:
folio = __filemap_get_folio(mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
if (IS_ERR(folio)) {
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(inode, blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
btrfs_delalloc_release_extents(inode, blocksize);
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out;
}
@@ -4849,11 +4979,11 @@ again:
folio_wait_writeback(folio);
- lock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode, block_start);
if (ordered) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
folio_unlock(folio);
folio_put(folio);
btrfs_start_ordered_extent(ordered);
@@ -4861,37 +4991,46 @@ again:
goto again;
}
- clear_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- &cached_state);
+ btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+ &cached_state);
ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
&cached_state);
if (ret) {
- unlock_extent(io_tree, block_start, block_end, &cached_state);
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
goto out_unlock;
}
- if (offset != blocksize) {
- if (!len)
- len = blocksize - offset;
- if (front)
- folio_zero_range(folio, block_start - folio_pos(folio),
- offset);
- else
- folio_zero_range(folio,
- (block_start - folio_pos(folio)) + offset,
- len);
+ if (end == (u64)-1) {
+ /*
+ * We're truncating beyond EOF, the remaining blocks normally are
+ * already holes thus no need to zero again, but it's possible for
+ * fs block size < page size cases to have memory mapped writes
+ * to pollute ranges beyond EOF.
+ *
+ * In that case although such polluted blocks beyond EOF will
+ * not reach disk, it still affects our page caches.
+ */
+ zero_start = max_t(u64, folio_pos(folio), start);
+ zero_end = min_t(u64, folio_end(folio) - 1, end);
+ } else {
+ zero_start = max_t(u64, block_start, start);
+ zero_end = min_t(u64, block_end, end);
}
+ folio_zero_range(folio, zero_start - folio_pos(folio),
+ zero_end - zero_start + 1);
+
btrfs_folio_clear_checked(fs_info, folio, block_start,
block_end + 1 - block_start);
btrfs_folio_set_dirty(fs_info, folio, block_start,
block_end + 1 - block_start);
- unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
- set_extent_bit(&inode->io_tree, block_start, block_end,
- EXTENT_NORESERVE, NULL);
+ btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
+ EXTENT_NORESERVE, &cached_state);
+
+ btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
out_unlock:
if (ret) {
@@ -4984,7 +5123,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
* rest of the block before we expand the i_size, otherwise we could
* expose stale data.
*/
- ret = btrfs_truncate_block(inode, oldsize, 0, 0);
+ ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
if (ret)
return ret;
@@ -5001,7 +5140,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
em = NULL;
break;
}
- last_byte = min(extent_map_end(em), block_end);
+ last_byte = min(btrfs_extent_map_end(em), block_end);
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
@@ -5017,7 +5156,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
if (ret)
break;
- hole_em = alloc_extent_map();
+ hole_em = btrfs_alloc_extent_map();
if (!hole_em) {
btrfs_drop_extent_map_range(inode, cur_offset,
cur_offset + hole_size - 1,
@@ -5034,7 +5173,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->generation = btrfs_get_fs_generation(fs_info);
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
- free_extent_map(hole_em);
+ btrfs_free_extent_map(hole_em);
} else {
ret = btrfs_inode_set_file_extent_range(inode,
cur_offset, hole_size);
@@ -5042,14 +5181,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
break;
}
next:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
- free_extent_map(em);
- unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
+ btrfs_free_extent_map(em);
+ btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
return ret;
}
@@ -5129,7 +5268,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
if (ret && inode->i_nlink) {
- int err;
+ int ret2;
/*
* Truncate failed, so fix up the in-memory size. We
@@ -5137,9 +5276,9 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* wait for disk_i_size to be stable and then update the
* in-memory size to match.
*/
- err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
- if (err)
- return err;
+ ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
+ if (ret2)
+ return ret2;
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
}
}
@@ -5152,31 +5291,31 @@ static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
- int err;
+ int ret;
if (btrfs_root_readonly(root))
return -EROFS;
- err = setattr_prepare(idmap, dentry, attr);
- if (err)
- return err;
+ ret = setattr_prepare(idmap, dentry, attr);
+ if (ret)
+ return ret;
if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
- err = btrfs_setsize(inode, attr);
- if (err)
- return err;
+ ret = btrfs_setsize(inode, attr);
+ if (ret)
+ return ret;
}
if (attr->ia_valid) {
setattr_copy(idmap, inode, attr);
inode_inc_iversion(inode);
- err = btrfs_dirty_inode(BTRFS_I(inode));
+ ret = btrfs_dirty_inode(BTRFS_I(inode));
- if (!err && attr->ia_valid & ATTR_MODE)
- err = posix_acl_chmod(idmap, dentry, inode->i_mode);
+ if (!ret && attr->ia_valid & ATTR_MODE)
+ ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
}
- return err;
+ return ret;
}
/*
@@ -5233,7 +5372,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
state_flags = state->state;
spin_unlock(&io_tree->lock);
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
/*
* If still has DELALLOC flag, the extent didn't reach disk,
@@ -5247,9 +5386,9 @@ static void evict_inode_truncate_pages(struct inode *inode)
btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
end - start + 1, NULL);
- clear_extent_bit(io_tree, start, end,
- EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
- &cached_state);
+ btrfs_clear_extent_bit(io_tree, start, end,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
+ &cached_state);
cond_resched();
spin_lock(&io_tree->lock);
@@ -5310,7 +5449,7 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_fs_info *fs_info;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *rsv = NULL;
+ struct btrfs_block_rsv rsv;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5358,11 +5497,9 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_kill_delayed_inode_items(BTRFS_I(inode));
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- goto out;
- rsv->size = btrfs_calc_metadata_size(fs_info, 1);
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = btrfs_calc_metadata_size(fs_info, 1);
+ rsv.failfast = true;
btrfs_i_size_write(BTRFS_I(inode), 0);
@@ -5374,11 +5511,11 @@ void btrfs_evict_inode(struct inode *inode)
.min_type = 0,
};
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (IS_ERR(trans))
- goto out;
+ goto out_release;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
ret = btrfs_truncate_inode_items(trans, root, &control);
trans->block_rsv = &fs_info->trans_block_rsv;
@@ -5390,7 +5527,7 @@ void btrfs_evict_inode(struct inode *inode)
*/
btrfs_btree_balance_dirty_nodelay(fs_info);
if (ret && ret != -ENOSPC && ret != -EAGAIN)
- goto out;
+ goto out_release;
else if (!ret)
break;
}
@@ -5404,16 +5541,17 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv);
+ trans = evict_refill_and_join(root, &rsv);
if (!IS_ERR(trans)) {
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
trans->block_rsv = &fs_info->trans_block_rsv;
btrfs_end_transaction(trans);
}
+out_release:
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
out:
- btrfs_free_block_rsv(fs_info, rsv);
/*
* If we didn't successfully delete, the orphan item will still be in
* the tree and we'll retry on the next mount. Again, we might also want
@@ -5435,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
struct btrfs_key *location, u8 *type)
{
struct btrfs_dir_item *di;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *root = dir->root;
int ret = 0;
struct fscrypt_name fname;
@@ -5446,7 +5584,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
if (ret < 0)
- goto out;
+ return ret;
/*
* fscrypt_setup_filename() should never return a positive value, but
* gcc on sparc/parisc thinks it can, so assert that doesn't happen.
@@ -5475,7 +5613,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
*type = btrfs_dir_ftype(path->nodes[0], di);
out:
fscrypt_free_filename(&fname);
- btrfs_free_path(path);
return ret;
}
@@ -5490,7 +5627,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
struct btrfs_root **sub_root)
{
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
@@ -5546,7 +5683,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
location->offset = 0;
err = 0;
out:
- btrfs_free_path(path);
fscrypt_free_filename(&fname);
return err;
}
@@ -5597,7 +5733,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
args->root == BTRFS_I(inode)->root;
}
-static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
@@ -5609,40 +5745,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
- return inode;
+ if (!inode)
+ return NULL;
+ return BTRFS_I(inode);
}
/*
* Get an inode object given its inode number and corresponding root. Path is
* preallocated to prevent recursing back to iget through allocator.
*/
-struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
- struct btrfs_path *path)
+struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
+ struct btrfs_path *path)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
int ret;
inode = btrfs_iget_locked(ino, root);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->vfs_inode.i_state & I_NEW))
return inode;
ret = btrfs_read_locked_inode(inode, path);
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
/*
* Get an inode object given its inode number and corresponding root.
*/
-struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
+struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_path *path;
int ret;
@@ -5650,55 +5788,60 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->vfs_inode.i_state & I_NEW))
return inode;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ iget_failed(&inode->vfs_inode);
return ERR_PTR(-ENOMEM);
+ }
ret = btrfs_read_locked_inode(inode, path);
btrfs_free_path(path);
if (ret)
return ERR_PTR(ret);
- unlock_new_inode(inode);
+ unlock_new_inode(&inode->vfs_inode);
return inode;
}
-static struct inode *new_simple_dir(struct inode *dir,
- struct btrfs_key *key,
- struct btrfs_root *root)
+static struct btrfs_inode *new_simple_dir(struct inode *dir,
+ struct btrfs_key *key,
+ struct btrfs_root *root)
{
struct timespec64 ts;
- struct inode *inode = new_inode(dir->i_sb);
+ struct inode *vfs_inode;
+ struct btrfs_inode *inode;
- if (!inode)
+ vfs_inode = new_inode(dir->i_sb);
+ if (!vfs_inode)
return ERR_PTR(-ENOMEM);
- BTRFS_I(inode)->root = btrfs_grab_root(root);
- BTRFS_I(inode)->ref_root_id = key->objectid;
- set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags);
- set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+ inode = BTRFS_I(vfs_inode);
+ inode->root = btrfs_grab_root(root);
+ inode->ref_root_id = key->objectid;
+ set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
+ set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
- btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
+ btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
/*
* We only need lookup, the rest is read-only and there's no inode
* associated with the dentry
*/
- inode->i_op = &simple_dir_inode_operations;
- inode->i_opflags &= ~IOP_XATTR;
- inode->i_fop = &simple_dir_operations;
- inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
+ vfs_inode->i_op = &simple_dir_inode_operations;
+ vfs_inode->i_opflags &= ~IOP_XATTR;
+ vfs_inode->i_fop = &simple_dir_operations;
+ vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- ts = inode_set_ctime_current(inode);
- inode_set_mtime_to_ts(inode, ts);
- inode_set_atime_to_ts(inode, inode_get_atime(dir));
- BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
- BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
+ ts = inode_set_ctime_current(vfs_inode);
+ inode_set_mtime_to_ts(vfs_inode, ts);
+ inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
+ inode->i_otime_sec = ts.tv_sec;
+ inode->i_otime_nsec = ts.tv_nsec;
- inode->i_uid = dir->i_uid;
- inode->i_gid = dir->i_gid;
+ vfs_inode->i_uid = dir->i_uid;
+ vfs_inode->i_gid = dir->i_gid;
return inode;
}
@@ -5712,15 +5855,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
-static inline u8 btrfs_inode_type(struct inode *inode)
+static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
{
- return fs_umode_to_ftype(inode->i_mode);
+ return fs_umode_to_ftype(inode->vfs_inode.i_mode);
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location = { 0 };
@@ -5737,18 +5880,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(location.objectid, root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
/* Do extra check against inode mode with di_type */
if (btrfs_inode_type(inode) != di_type) {
btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
- inode->i_mode, btrfs_inode_type(inode),
+ inode->vfs_inode.i_mode, btrfs_inode_type(inode),
di_type);
- iput(inode);
+ iput(&inode->vfs_inode);
return ERR_PTR(-EUCLEAN);
}
- return inode;
+ return &inode->vfs_inode;
}
ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
@@ -5763,19 +5906,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
btrfs_put_root(sub_root);
if (IS_ERR(inode))
- return inode;
+ return ERR_CAST(inode);
down_read(&fs_info->cleanup_work_sem);
- if (!sb_rdonly(inode->i_sb))
+ if (!sb_rdonly(inode->vfs_inode.i_sb))
ret = btrfs_orphan_cleanup(sub_root);
up_read(&fs_info->cleanup_work_sem);
if (ret) {
- iput(inode);
+ iput(&inode->vfs_inode);
inode = ERR_PTR(ret);
}
}
- return inode;
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ return &inode->vfs_inode;
}
static int btrfs_dentry_delete(const struct dentry *dentry)
@@ -5815,7 +5961,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
struct btrfs_key key, found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
int ret;
@@ -5829,15 +5975,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
- goto out;
+ return ret;
/* FIXME: we should be able to handle this */
if (ret == 0)
- goto out;
- ret = 0;
+ return ret;
if (path->slots[0] == 0) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
path->slots[0]--;
@@ -5848,13 +5993,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
if (found_key.objectid != btrfs_ino(inode) ||
found_key.type != BTRFS_DIR_INDEX_KEY) {
inode->index_cnt = BTRFS_DIR_START_INDEX;
- goto out;
+ return 0;
}
inode->index_cnt = found_key.offset + 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 0;
}
static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
@@ -5957,7 +6101,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_key found_key;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
void *addr;
LIST_HEAD(ins_list);
LIST_HEAD(del_list);
@@ -6040,8 +6184,7 @@ again:
if (ret)
goto nopos;
- ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
- if (ret)
+ if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
goto nopos;
/*
@@ -6070,7 +6213,6 @@ nopos:
err:
if (put)
btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
- btrfs_free_path(path);
return ret;
}
@@ -6248,7 +6390,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *
inode->flags |= BTRFS_INODE_NODATASUM;
}
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
}
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
@@ -6334,6 +6476,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (btrfs_test_opt(fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
+ btrfs_update_inode_mapping_flags(BTRFS_I(inode));
+ btrfs_set_inode_mapping_order(BTRFS_I(inode));
}
ret = btrfs_insert_inode_locked(inode);
@@ -6427,7 +6571,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
path = NULL;
if (args->subvol) {
- struct inode *parent;
+ struct btrfs_inode *parent;
/*
* Subvolumes inherit properties from their parent subvolume,
@@ -6437,11 +6581,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
} else {
- ret = btrfs_inode_inherit_props(trans, inode, parent);
- iput(parent);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ parent);
+ iput(&parent->vfs_inode);
}
} else {
- ret = btrfs_inode_inherit_props(trans, inode, dir);
+ ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
+ BTRFS_I(dir));
}
if (ret) {
btrfs_err(fs_info,
@@ -6475,13 +6621,17 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
if (args->orphan) {
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
} else {
ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
0, BTRFS_I(inode)->dir_index);
- }
- if (ret) {
- btrfs_abort_transaction(trans, ret);
- goto discard;
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto discard;
+ }
}
return 0;
@@ -6539,7 +6689,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
- btrfs_inode_type(&inode->vfs_inode), index);
+ btrfs_inode_type(inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
@@ -6550,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
- /*
- * If we are replaying a log tree, we do not want to update the mtime
- * and ctime of the parent directory with the current time, since the
- * log replay procedure is responsible for setting them to their correct
- * values (the ones it had when the fsync was done).
- */
- if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
- inode_set_mtime_to_ts(&parent_inode->vfs_inode,
- inode_set_ctime_current(&parent_inode->vfs_inode));
+ update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)
@@ -6568,20 +6710,18 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
fail_dir_item:
if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
u64 local_index;
- int err;
- err = btrfs_del_root_ref(trans, key.objectid,
- btrfs_root_id(root), parent_ino,
- &local_index, name);
- if (err)
- btrfs_abort_transaction(trans, err);
+ int ret2;
+
+ ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
+ parent_ino, &local_index, name);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
} else if (add_backref) {
- u64 local_index;
- int err;
+ int ret2;
- err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
- &local_index);
- if (err)
- btrfs_abort_transaction(trans, err);
+ ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
+ if (ret2)
+ btrfs_abort_transaction(trans, ret2);
}
/* Return the original error code */
@@ -6600,20 +6740,20 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
};
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
- int err;
+ int ret;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (!err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (!ret)
d_instantiate_new(dentry, inode);
btrfs_end_transaction(trans);
@@ -6621,9 +6761,9 @@ static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -6664,7 +6804,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
struct fscrypt_name fname;
u64 index;
- int err;
+ int ret;
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
@@ -6674,12 +6814,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink >= BTRFS_LINK_MAX)
return -EMLINK;
- err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
- if (err)
+ ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
+ if (ret)
goto fail;
- err = btrfs_set_inode_index(BTRFS_I(dir), &index);
- if (err)
+ ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
+ if (ret)
goto fail;
/*
@@ -6690,7 +6830,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
*/
trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
trans = NULL;
goto fail;
}
@@ -6703,24 +6843,24 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
- err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
+ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
&fname.disk_name, 1, index);
- if (err) {
+ if (ret) {
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- err = btrfs_update_inode(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (ret)
goto fail;
if (inode->i_nlink == 1) {
/*
* If new hard link count is 1, it's a file created
* with open(2) O_TMPFILE flag.
*/
- err = btrfs_orphan_del(trans, BTRFS_I(inode));
- if (err)
+ ret = btrfs_orphan_del(trans, BTRFS_I(inode));
+ if (ret)
goto fail;
}
d_instantiate(dentry, inode);
@@ -6736,21 +6876,21 @@ fail:
iput(inode);
}
btrfs_btree_balance_dirty(fs_info);
- return err;
+ return ret;
}
-static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = new_inode(dir->i_sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
inode->i_op = &btrfs_dir_inode_operations;
inode->i_fop = &btrfs_dir_file_operations;
- return btrfs_create_common(dir, dentry, inode);
+ return ERR_PTR(btrfs_create_common(dir, dentry, inode));
}
static noinline int uncompress_inline(struct btrfs_path *path,
@@ -6759,6 +6899,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
{
int ret;
struct extent_buffer *leaf = path->nodes[0];
+ const u32 blocksize = leaf->fs_info->sectorsize;
char *tmp;
size_t max_size;
unsigned long inline_size;
@@ -6775,7 +6916,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
read_extent_buffer(leaf, tmp, ptr, inline_size);
- max_size = min_t(unsigned long, PAGE_SIZE, max_size);
+ max_size = min_t(unsigned long, blocksize, max_size);
ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
max_size);
@@ -6787,14 +6928,15 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size < PAGE_SIZE)
- folio_zero_range(folio, max_size, PAGE_SIZE - max_size);
+ if (max_size < blocksize)
+ folio_zero_range(folio, max_size, blocksize - max_size);
kfree(tmp);
return ret;
}
static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
+ const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
struct btrfs_file_extent_item *fi;
void *kaddr;
size_t copy_size;
@@ -6809,14 +6951,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
return uncompress_inline(path, folio, fi);
- copy_size = min_t(u64, PAGE_SIZE,
+ copy_size = min_t(u64, blocksize,
btrfs_file_extent_ram_bytes(path->nodes[0], fi));
kaddr = kmap_local_folio(folio, 0);
read_extent_buffer(path->nodes[0], kaddr,
btrfs_file_extent_inline_start(fi), copy_size);
kunmap_local(kaddr);
- if (copy_size < PAGE_SIZE)
- folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size);
+ if (copy_size < blocksize)
+ folio_zero_range(folio, copy_size, blocksize - copy_size);
return 0;
}
@@ -6855,18 +6997,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct extent_map_tree *em_tree = &inode->extent_tree;
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
+ em = btrfs_lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
- free_extent_map(em);
+ btrfs_free_extent_map(em);
else
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
ret = -ENOMEM;
goto out;
@@ -7003,7 +7145,7 @@ not_found:
insert:
ret = 0;
btrfs_release_path(path);
- if (em->start > start || extent_map_end(em) <= start) {
+ if (em->start > start || btrfs_extent_map_end(em) <= start) {
btrfs_err(fs_info,
"bad extent! em: [%llu %llu] passed [%llu %llu]",
em->start, em->len, start, len);
@@ -7020,7 +7162,7 @@ out:
trace_btrfs_get_extent(root, inode, em);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
return em;
@@ -7057,17 +7199,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
* NOTE: This only checks the file extents, caller is responsible to wait for
* any ordered extents.
*/
-noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
struct btrfs_file_extent *file_extent,
bool nowait)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct can_nocow_file_extent_args nocow_args = { 0 };
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
int ret;
struct extent_buffer *leaf;
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
int found_type;
@@ -7077,35 +7219,34 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
return -ENOMEM;
path->nowait = nowait;
- ret = btrfs_lookup_file_extent(NULL, root, path,
- btrfs_ino(BTRFS_I(inode)), offset, 0);
+ ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
+ offset, 0);
if (ret < 0)
- goto out;
+ return ret;
if (ret == 1) {
if (path->slots[0] == 0) {
- /* can't find the item, must cow */
- ret = 0;
- goto out;
+ /* Can't find the item, must COW. */
+ return 0;
}
path->slots[0]--;
}
ret = 0;
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
+ if (key.objectid != btrfs_ino(inode) ||
key.type != BTRFS_EXTENT_DATA_KEY) {
- /* not our file or wrong item type, must cow */
- goto out;
+ /* Not our file or wrong item type, must COW. */
+ return 0;
}
if (key.offset > offset) {
- /* Wrong offset, must cow */
- goto out;
+ /* Wrong offset, must COW. */
+ return 0;
}
if (btrfs_file_extent_end(path) <= offset)
- goto out;
+ return 0;
fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
found_type = btrfs_file_extent_type(leaf, fi);
@@ -7114,43 +7255,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
nocow_args.end = offset + *len - 1;
nocow_args.free_path = true;
- ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
+ ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
/* can_nocow_file_extent() has freed the path. */
path = NULL;
if (ret != 1) {
/* Treat errors as not being able to NOCOW. */
- ret = 0;
- goto out;
+ return 0;
}
- ret = 0;
if (btrfs_extent_readonly(fs_info,
nocow_args.file_extent.disk_bytenr +
nocow_args.file_extent.offset))
- goto out;
+ return 0;
- if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+ if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
u64 range_end;
range_end = round_up(offset + nocow_args.file_extent.num_bytes,
root->fs_info->sectorsize) - 1;
- ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
- if (ret) {
- ret = -EAGAIN;
- goto out;
- }
+ ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
+ EXTENT_DELALLOC);
+ if (ret)
+ return -EAGAIN;
}
if (file_extent)
memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
*len = nocow_args.file_extent.num_bytes;
- ret = 1;
-out:
- btrfs_free_path(path);
- return ret;
+
+ return 1;
}
/* The callers of this must take lock_extent() */
@@ -7198,7 +7334,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em)
return ERR_PTR(-ENOMEM);
@@ -7211,15 +7347,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
em->offset = file_extent->offset;
em->flags |= EXTENT_FLAG_PINNED;
if (type == BTRFS_ORDERED_COMPRESSED)
- extent_map_set_compression(em, file_extent->compression);
+ btrfs_extent_map_set_compression(em, file_extent->compression);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
- free_extent_map(em);
+ btrfs_free_extent_map(em);
return ERR_PTR(ret);
}
- /* em got 2 refs now, callers needs to do free_extent_map once. */
+ /* em got 2 refs now, callers needs to do btrfs_free_extent_map once. */
return em;
}
@@ -7233,13 +7369,13 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
static void wait_subpage_spinlock(struct folio *folio)
{
struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_is_subpage(fs_info, folio))
return;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7252,14 +7388,14 @@ static void wait_subpage_spinlock(struct folio *folio)
* Here we just acquire the spinlock so that all existing callers
* should exit and we're safe to release/invalidate the page.
*/
- spin_lock_irq(&subpage->lock);
- spin_unlock_irq(&subpage->lock);
+ spin_lock_irq(&bfs->lock);
+ spin_unlock_irq(&bfs->lock);
}
static int btrfs_launder_folio(struct folio *folio)
{
return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
- PAGE_SIZE, NULL);
+ folio_size(folio), NULL);
}
static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
@@ -7346,7 +7482,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
}
if (!inode_evicting)
- lock_extent(tree, page_start, page_end, &cached_state);
+ btrfs_lock_extent(tree, page_start, page_end, &cached_state);
cur = page_start;
while (cur < page_end) {
@@ -7402,10 +7538,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
* btrfs_finish_ordered_io().
*/
if (!inode_evicting)
- clear_extent_bit(tree, cur, range_end,
- EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, &cached_state);
+ btrfs_clear_extent_bit(tree, cur, range_end,
+ EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, &cached_state);
spin_lock_irq(&inode->ordered_tree_lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
@@ -7447,12 +7583,11 @@ next:
* Since the IO will never happen for this page.
*/
btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
- if (!inode_evicting) {
- clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
- EXTENT_DELALLOC | EXTENT_UPTODATE |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
- extra_flags, &cached_state);
- }
+ if (!inode_evicting)
+ btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG | extra_flags,
+ &cached_state);
cur = range_end + 1;
}
/*
@@ -7477,7 +7612,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
};
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_block_rsv *rsv;
+ struct btrfs_block_rsv rsv;
int ret;
struct btrfs_trans_handle *trans;
u64 mask = fs_info->sectorsize - 1;
@@ -7519,11 +7654,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
* 2) fs_info->trans_block_rsv - this will have 1 items worth left for
* updating the inode.
*/
- rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
- if (!rsv)
- return -ENOMEM;
- rsv->size = min_size;
- rsv->failfast = true;
+ btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
+ rsv.size = min_size;
+ rsv.failfast = true;
/*
* 1 for the truncate slack space
@@ -7536,7 +7669,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
}
/* Migrate the slack space for the truncate to our reserve */
- ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
+ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
min_size, false);
/*
* We have reserved 2 metadata units when we started the transaction and
@@ -7548,7 +7681,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
goto out;
}
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
while (1) {
struct extent_state *cached_state = NULL;
@@ -7556,7 +7689,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
control.new_size = new_size;
- lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
/*
* We want to drop from the next block forward in case this new
* size is not block aligned since we will be keeping the last
@@ -7571,7 +7704,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
- unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
trans->block_rsv = &fs_info->trans_block_rsv;
if (ret != -ENOSPC && ret != -EAGAIN)
@@ -7591,9 +7724,9 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
break;
}
- btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
+ btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, false);
+ &rsv, min_size, false);
/*
* We have reserved 2 metadata units when we started the
* transaction and min_size matches 1 unit, so this should never
@@ -7602,7 +7735,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
if (WARN_ON(ret))
break;
- trans->block_rsv = rsv;
+ trans->block_rsv = &rsv;
}
/*
@@ -7615,7 +7748,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
- ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
+ ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
+ inode->vfs_inode.i_size, (u64)-1);
if (ret)
goto out;
trans = btrfs_start_transaction(root, 1);
@@ -7640,7 +7774,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
btrfs_btree_balance_dirty(fs_info);
}
out:
- btrfs_free_block_rsv(fs_info, rsv);
+ btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
/*
* So if we truncate and then write and fsync we normally would just
* write the extents that changed, which is a problem if we need to
@@ -7727,10 +7861,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_otime_nsec = 0;
inode = &ei->vfs_inode;
- extent_map_tree_init(&ei->extent_tree);
+ btrfs_extent_map_tree_init(&ei->extent_tree);
/* This io tree sets the valid inode. */
- extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
+ btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
ei->file_extent_tree = NULL;
@@ -7895,7 +8029,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap,
generic_fillattr(idmap, request_mask, inode, stat);
stat->dev = BTRFS_I(inode)->root->anon_dev;
- stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
+ stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
stat->result_mask |= STATX_SUBVOL;
spin_lock(&BTRFS_I(inode)->lock);
@@ -7928,6 +8062,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret;
int ret2;
bool need_abort = false;
+ bool logs_pinned = false;
struct fscrypt_name old_fname, new_fname;
struct fscrypt_str *old_name, *new_name;
@@ -8051,6 +8186,31 @@ static int btrfs_rename_exchange(struct inode *old_dir,
inode_inc_iversion(new_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
+ new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not for
+ * root entries) pin the log early to prevent any concurrent
+ * task from logging the directory after we removed the old
+ * entries and before we add the new entries, otherwise that
+ * task can sync a log without any entry for the inodes we are
+ * renaming and therefore replaying that log, if a power failure
+ * happens after syncing the log, would result in deleting the
+ * inodes.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent) {
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8122,30 +8282,23 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
/*
- * Now pin the logs of the roots. We do it to ensure that no other task
- * can sync the logs while we are in progress with the rename, because
- * that could result in an inconsistency in case any of the inodes that
- * are part of this rename operation were logged before.
+ * Do the log updates for all inodes.
+ *
+ * If either entry is for a root we don't need to update the logs since
+ * we've called btrfs_set_log_full_commit() before.
*/
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
- btrfs_pin_log_trans(dest);
-
- /* Do the log updates for all inodes. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned) {
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
old_rename_ctx.index, new_dentry->d_parent);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
new_rename_ctx.index, old_dentry->d_parent);
+ }
- /* Now unpin the logs. */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+out_fail:
+ if (logs_pinned) {
btrfs_end_log_trans(root);
- if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
btrfs_end_log_trans(dest);
-out_fail:
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8195,6 +8348,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
struct fscrypt_name old_fname, new_fname;
+ bool logs_pinned = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -8329,6 +8483,29 @@ static int btrfs_rename(struct mnt_idmap *idmap,
inode_inc_iversion(old_inode);
simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * If we are renaming in the same directory (and it's not a
+ * root entry) pin the log to prevent any concurrent task from
+ * logging the directory after we removed the old entry and
+ * before we add the new entry, otherwise that task can sync
+ * a log without any entry for the inode we are renaming and
+ * therefore replaying that log, if a power failure happens
+ * after syncing the log, would result in deleting the inode.
+ *
+ * If the rename affects two different directories, we want to
+ * make sure the that there's no log commit that contains
+ * updates for only one of the directories but not for the
+ * other.
+ *
+ * If we are renaming an entry for a root, we don't care about
+ * log updates since we called btrfs_set_log_full_commit().
+ */
+ btrfs_pin_log_trans(root);
+ btrfs_pin_log_trans(dest);
+ logs_pinned = true;
+ }
+
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
BTRFS_I(old_inode), true);
@@ -8393,7 +8570,7 @@ static int btrfs_rename(struct mnt_idmap *idmap,
if (old_inode->i_nlink == 1)
BTRFS_I(old_inode)->dir_index = index;
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
+ if (logs_pinned)
btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
rename_ctx.index, new_dentry->d_parent);
@@ -8409,6 +8586,10 @@ static int btrfs_rename(struct mnt_idmap *idmap,
}
}
out_fail:
+ if (logs_pinned) {
+ btrfs_end_log_trans(root);
+ btrfs_end_log_trans(dest);
+ }
ret2 = btrfs_end_transaction(trans);
ret = ret ? ret : ret2;
out_notrans:
@@ -8494,8 +8675,6 @@ static int start_delalloc_inodes(struct btrfs_root *root,
struct writeback_control *wbc, bool snapshot,
bool in_reclaim_context)
{
- struct btrfs_inode *binode;
- struct inode *inode;
struct btrfs_delalloc_work *work, *next;
LIST_HEAD(works);
LIST_HEAD(splice);
@@ -8506,30 +8685,30 @@ static int start_delalloc_inodes(struct btrfs_root *root,
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
- binode = list_entry(splice.next, struct btrfs_inode,
- delalloc_inodes);
+ struct btrfs_inode *inode;
+ struct inode *tmp_inode;
+
+ inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
- list_move_tail(&binode->delalloc_inodes,
- &root->delalloc_inodes);
+ list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
if (in_reclaim_context &&
- test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+ test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
continue;
- inode = igrab(&binode->vfs_inode);
- if (!inode) {
+ tmp_inode = igrab(&inode->vfs_inode);
+ if (!tmp_inode) {
cond_resched_lock(&root->delalloc_lock);
continue;
}
spin_unlock(&root->delalloc_lock);
if (snapshot)
- set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
- &binode->runtime_flags);
+ set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
if (full_flush) {
- work = btrfs_alloc_delalloc_work(inode);
+ work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
if (!work) {
- iput(inode);
+ iput(&inode->vfs_inode);
ret = -ENOMEM;
goto out;
}
@@ -8537,8 +8716,8 @@ static int start_delalloc_inodes(struct btrfs_root *root,
btrfs_queue_work(root->fs_info->flush_workers,
&work->work);
} else {
- ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
+ btrfs_add_delayed_iput(inode);
if (ret || wbc->nr_to_write <= 0)
goto out;
}
@@ -8647,7 +8826,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
.dentry = dentry,
};
unsigned int trans_num_items;
- int err;
+ int ret;
int name_len;
int datasize;
unsigned long ptr;
@@ -8655,7 +8834,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct extent_buffer *leaf;
name_len = strlen(symname);
- if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
+ /*
+ * Symlinks utilize uncompressed inline extent data, which should not
+ * reach block size.
+ */
+ if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
+ name_len >= fs_info->sectorsize)
return -ENAMETOOLONG;
inode = new_inode(dir->i_sb);
@@ -8669,38 +8853,37 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
inode_set_bytes(inode, name_len);
new_inode_args.inode = inode;
- err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
- if (err)
+ ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
+ if (ret)
goto out_inode;
/* 1 additional item for the inline extent */
trans_num_items++;
trans = btrfs_start_transaction(root, trans_num_items);
if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ ret = PTR_ERR(trans);
goto out_new_inode_args;
}
- err = btrfs_create_new_inode(trans, &new_inode_args);
- if (err)
+ ret = btrfs_create_new_inode(trans, &new_inode_args);
+ if (ret)
goto out;
path = btrfs_alloc_path();
if (!path) {
- err = -ENOMEM;
- btrfs_abort_transaction(trans, err);
+ ret = -ENOMEM;
+ btrfs_abort_transaction(trans, ret);
discard_new_inode(inode);
inode = NULL;
goto out;
}
key.objectid = btrfs_ino(BTRFS_I(inode));
- key.offset = 0;
key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
datasize = btrfs_file_extent_calc_inline_size(name_len);
- err = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (err) {
- btrfs_abort_transaction(trans, err);
+ ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
discard_new_inode(inode);
inode = NULL;
@@ -8722,16 +8905,16 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
btrfs_free_path(path);
d_instantiate_new(dentry, inode);
- err = 0;
+ ret = 0;
out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
- if (err)
+ if (ret)
iput(inode);
- return err;
+ return ret;
}
static struct btrfs_trans_handle *insert_prealloc_file_extent(
@@ -8868,11 +9051,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_free_reserved_extent(fs_info, ins.objectid,
- ins.offset, 0);
+ ins.offset, false);
break;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
cur_offset + ins.offset - 1, false);
@@ -8890,7 +9073,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
next:
num_bytes -= ins.offset;
cur_offset += ins.offset;
@@ -9062,7 +9245,7 @@ static ssize_t btrfs_encoded_read_inline(
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
- struct btrfs_path *path;
+ BTRFS_PATH_AUTO_FREE(path);
struct extent_buffer *leaf;
struct btrfs_file_extent_item *item;
u64 ram_bytes;
@@ -9072,10 +9255,8 @@ static ssize_t btrfs_encoded_read_inline(
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!path)
+ return -ENOMEM;
path->nowait = nowait;
@@ -9084,9 +9265,9 @@ static ssize_t btrfs_encoded_read_inline(
if (ret) {
if (ret > 0) {
/* The extent item disappeared? */
- ret = -EIO;
+ return -EIO;
}
- goto out;
+ return ret;
}
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -9099,17 +9280,16 @@ static ssize_t btrfs_encoded_read_inline(
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, item));
if (ret < 0)
- goto out;
+ return ret;
encoded->compression = ret;
if (encoded->compression) {
size_t inline_size;
inline_size = btrfs_file_extent_inline_item_len(leaf,
path->slots[0]);
- if (inline_size > count) {
- ret = -ENOBUFS;
- goto out;
- }
+ if (inline_size > count)
+ return -ENOBUFS;
+
count = inline_size;
encoded->unencoded_len = ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - extent_start;
@@ -9121,13 +9301,12 @@ static ssize_t btrfs_encoded_read_inline(
}
tmp = kmalloc(count, GFP_NOFS);
- if (!tmp) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!tmp)
+ return -ENOMEM;
+
read_extent_buffer(leaf, tmp, ptr, count);
btrfs_release_path(path);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -9135,13 +9314,12 @@ static ssize_t btrfs_encoded_read_inline(
if (ret != count)
ret = -EFAULT;
kfree(tmp);
-out:
- btrfs_free_path(path);
+
return ret;
}
struct btrfs_encoded_read_private {
- struct completion done;
+ struct completion *sync_reads;
void *uring_ctx;
refcount_t pending_refs;
blk_status_t status;
@@ -9153,11 +9331,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
if (bbio->bio.bi_status) {
/*
- * The memory barrier implied by the atomic_dec_return() here
- * pairs with the memory barrier implied by the
- * atomic_dec_return() or io_wait_event() in
- * btrfs_encoded_read_regular_fill_pages() to ensure that this
- * write is observed before the load of status in
+ * The memory barrier implied by the refcount_dec_and_test() here
+ * pairs with the memory barrier implied by the refcount_dec_and_test()
+ * in btrfs_encoded_read_regular_fill_pages() to ensure that
+ * this write is observed before the load of status in
* btrfs_encoded_read_regular_fill_pages().
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
@@ -9169,7 +9346,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
btrfs_uring_read_extent_endio(priv->uring_ctx, err);
kfree(priv);
} else {
- complete(&priv->done);
+ complete(priv->sync_reads);
}
}
bio_put(&bbio->bio);
@@ -9180,16 +9357,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private *priv;
+ struct btrfs_encoded_read_private *priv, sync_priv;
+ struct completion sync_reads;
unsigned long i = 0;
struct btrfs_bio *bbio;
int ret;
- priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
- if (!priv)
- return -ENOMEM;
+ /*
+ * Fast path for synchronous reads which completes in this call, io_uring
+ * needs longer time span.
+ */
+ if (uring_ctx) {
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
+ } else {
+ priv = &sync_priv;
+ init_completion(&sync_reads);
+ priv->sync_reads = &sync_reads;
+ }
- init_completion(&priv->done);
refcount_set(&priv->pending_refs, 1);
priv->status = 0;
priv->uring_ctx = uring_ctx;
@@ -9232,11 +9419,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
return -EIOCBQUEUED;
} else {
if (!refcount_dec_and_test(&priv->pending_refs))
- wait_for_completion_io(&priv->done);
+ wait_for_completion_io(&sync_reads);
/* See btrfs_encoded_read_endio() for ordering. */
- ret = blk_status_to_errno(READ_ONCE(priv->status));
- kfree(priv);
- return ret;
+ return blk_status_to_errno(READ_ONCE(priv->status));
}
}
@@ -9269,7 +9454,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out;
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
*unlocked = true;
@@ -9346,7 +9531,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_unlock_inode;
}
- if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
ret = -EAGAIN;
goto out_unlock_inode;
}
@@ -9355,7 +9540,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
lockend - start + 1);
if (ordered) {
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
ret = -EAGAIN;
goto out_unlock_inode;
}
@@ -9368,13 +9553,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
if (ret)
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, cached_state);
+ btrfs_lock_extent(io_tree, start, lockend, cached_state);
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
if (!ordered)
break;
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
cond_resched();
}
}
@@ -9392,7 +9577,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
* For inline extents we get everything we need out of the
* extent item.
*/
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
cached_state, extent_start,
@@ -9404,7 +9589,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
* We only want to return up to EOF even if the extent extends beyond
* that.
*/
- encoded->len = min_t(u64, extent_map_end(em),
+ encoded->len = min_t(u64, btrfs_extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
@@ -9412,7 +9597,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (extent_map_is_compressed(em)) {
+ } else if (btrfs_extent_map_is_compressed(em)) {
*disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -9427,12 +9612,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- extent_map_compression(em));
+ btrfs_extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
} else {
- *disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
@@ -9445,11 +9630,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = count;
*disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = NULL;
if (*disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
@@ -9461,11 +9646,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
}
out_em:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out_unlock_extent:
/* Leave inode and extent locked if we need to do a read. */
if (!unlocked && ret != -EIOCBQUEUED)
- unlock_extent(io_tree, start, lockend, cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9612,14 +9797,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
end >> PAGE_SHIFT);
if (ret)
goto out_folios;
- lock_extent(io_tree, start, end, &cached_state);
+ btrfs_lock_extent(io_tree, start, end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
if (!ordered &&
!filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
break;
if (ordered)
btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
cond_resched();
}
@@ -9669,11 +9854,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = PTR_ERR(em);
goto out_free_reserved;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
- (1 << BTRFS_ORDERED_ENCODED) |
- (1 << BTRFS_ORDERED_COMPRESSED));
+ (1U << BTRFS_ORDERED_ENCODED) |
+ (1U << BTRFS_ORDERED_COMPRESSED));
if (IS_ERR(ordered)) {
btrfs_drop_extent_map_range(inode, start, end, false);
ret = PTR_ERR(ordered);
@@ -9684,7 +9869,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (start + encoded->len > inode->vfs_inode.i_size)
i_size_write(&inode->vfs_inode, start + encoded->len);
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
btrfs_delalloc_release_extents(inode, num_bytes);
@@ -9694,7 +9879,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
out_free_reserved:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
- btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
+ btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
out_delalloc_release:
btrfs_delalloc_release_extents(inode, num_bytes);
btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
@@ -9707,9 +9892,9 @@ out_free_data_space:
* bytes_may_use.
*/
if (!extent_reserved)
- btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
+ btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
out_unlock:
- unlock_extent(io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(io_tree, start, end, &cached_state);
out_folios:
for (i = 0; i < nr_folios; i++) {
if (folios[i])
@@ -9974,7 +10159,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
- lock_extent(io_tree, 0, isize - 1, &cached_state);
+ btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
while (prev_extent_end < isize) {
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -10152,7 +10337,7 @@ out:
if (!IS_ERR_OR_NULL(map))
btrfs_free_chunk_map(map);
- unlock_extent(io_tree, 0, isize - 1, &cached_state);
+ btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
if (ret)
btrfs_swap_deactivate(file);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6c18bad53cd3..7e13de2bdcbf 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 {
#endif
/* Mask out flags that are inappropriate for the given type of inode. */
-static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
- unsigned int flags)
+static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode,
+ unsigned int flags)
{
if (S_ISDIR(inode->i_mode))
return flags;
@@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
-static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
+static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode)
{
unsigned int iflags = 0;
- u32 flags = binode->flags;
- u32 ro_flags = binode->ro_flags;
+ u32 flags = inode->flags;
+ u32 ro_flags = inode->ro_flags;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
@@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode)
{
- struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
- if (binode->flags & BTRFS_INODE_SYNC)
+ if (inode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
- if (binode->flags & BTRFS_INODE_IMMUTABLE)
+ if (inode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
- if (binode->flags & BTRFS_INODE_APPEND)
+ if (inode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
- if (binode->flags & BTRFS_INODE_NOATIME)
+ if (inode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
- if (binode->flags & BTRFS_INODE_DIRSYNC)
+ if (inode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
- if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
+ if (inode->ro_flags & BTRFS_INODE_RO_VERITY)
new_fl |= S_VERITY;
- set_mask_bits(&inode->i_flags,
+ set_mask_bits(&inode->vfs_inode.i_flags,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
S_VERITY, new_fl);
}
@@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags)
return 0;
}
-static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
+static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info,
unsigned int flags)
{
if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
@@ -246,26 +245,25 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_
* Set flags/xflags from the internal inode flags. The remaining items of
* fsxattr are zeroed.
*/
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
- struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+ const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
- fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode));
return 0;
}
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
- struct inode *inode = d_inode(dentry);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags;
+ u32 inode_flags;
if (btrfs_root_readonly(root))
return -EROFS;
@@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (fileattr_has_fsx(fa))
return -EOPNOTSUPP;
- fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
- old_fsflags = btrfs_inode_flags_to_fsflags(binode);
+ fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags);
+ old_fsflags = btrfs_inode_flags_to_fsflags(inode);
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
return ret;
@@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
if (ret)
return ret;
- binode_flags = binode->flags;
+ inode_flags = inode->flags;
if (fsflags & FS_SYNC_FL)
- binode_flags |= BTRFS_INODE_SYNC;
+ inode_flags |= BTRFS_INODE_SYNC;
else
- binode_flags &= ~BTRFS_INODE_SYNC;
+ inode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
- binode_flags |= BTRFS_INODE_IMMUTABLE;
+ inode_flags |= BTRFS_INODE_IMMUTABLE;
else
- binode_flags &= ~BTRFS_INODE_IMMUTABLE;
+ inode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
- binode_flags |= BTRFS_INODE_APPEND;
+ inode_flags |= BTRFS_INODE_APPEND;
else
- binode_flags &= ~BTRFS_INODE_APPEND;
+ inode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
- binode_flags |= BTRFS_INODE_NODUMP;
+ inode_flags |= BTRFS_INODE_NODUMP;
else
- binode_flags &= ~BTRFS_INODE_NODUMP;
+ inode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
- binode_flags |= BTRFS_INODE_NOATIME;
+ inode_flags |= BTRFS_INODE_NOATIME;
else
- binode_flags &= ~BTRFS_INODE_NOATIME;
+ inode_flags &= ~BTRFS_INODE_NOATIME;
/* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
if (!fa->flags_valid) {
@@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
if (fsflags & FS_DIRSYNC_FL)
- binode_flags |= BTRFS_INODE_DIRSYNC;
+ inode_flags |= BTRFS_INODE_DIRSYNC;
else
- binode_flags &= ~BTRFS_INODE_DIRSYNC;
+ inode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
- if (S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
/*
* It's safe to turn csums off here, no extents exist.
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it.
*/
- if (inode->i_size == 0)
- binode_flags |= BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM;
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags |= BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM;
} else {
- binode_flags |= BTRFS_INODE_NODATACOW;
+ inode_flags |= BTRFS_INODE_NODATACOW;
}
} else {
/*
* Revert back under same assumptions as above
*/
- if (S_ISREG(inode->i_mode)) {
- if (inode->i_size == 0)
- binode_flags &= ~(BTRFS_INODE_NODATACOW |
- BTRFS_INODE_NODATASUM);
+ if (S_ISREG(inode->vfs_inode.i_mode)) {
+ if (inode->vfs_inode.i_size == 0)
+ inode_flags &= ~(BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_NODATASUM);
} else {
- binode_flags &= ~BTRFS_INODE_NODATACOW;
+ inode_flags &= ~BTRFS_INODE_NODATACOW;
}
}
@@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
* things smaller.
*/
if (fsflags & FS_NOCOMP_FL) {
- binode_flags &= ~BTRFS_INODE_COMPRESS;
- binode_flags |= BTRFS_INODE_NOCOMPRESS;
+ inode_flags &= ~BTRFS_INODE_COMPRESS;
+ inode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode))
+ if (IS_SWAPFILE(&inode->vfs_inode))
return -ETXTBSY;
- binode_flags |= BTRFS_INODE_COMPRESS;
- binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode_flags |= BTRFS_INODE_COMPRESS;
+ inode_flags &= ~BTRFS_INODE_NOCOMPRESS;
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
} else {
- binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
+ inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
/*
@@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
return PTR_ERR(trans);
if (comp) {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression",
comp, strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
- ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression",
- NULL, 0, 0);
+ ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
@@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap,
}
update_flags:
- binode->flags = binode_flags;
+ inode->flags = inode_flags;
+ btrfs_update_inode_mapping_flags(inode);
btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode_set_ctime_current(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inode_inc_iversion(&inode->vfs_inode);
+ inode_set_ctime_current(&inode->vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
out_end_trans:
btrfs_end_transaction(trans);
return ret;
}
-static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg)
+static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg)
{
return put_user(inode->i_generation, arg);
}
@@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
* Calculate the number of transaction items to reserve for creating a subvolume
* or snapshot, not including the inode, directory entries, or parent directory.
*/
-static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
+static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit)
{
/*
* 1 to add root block
@@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
key.objectid = objectid;
- key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = 0;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret) {
@@ -668,14 +666,14 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
goto out;
}
+ btrfs_record_new_subvolume(trans, BTRFS_I(dir));
+
ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
- btrfs_record_new_subvolume(trans, BTRFS_I(dir));
-
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
@@ -843,7 +841,7 @@ free_pending:
static int btrfs_may_delete(struct mnt_idmap *idmap,
struct inode *dir, struct dentry *victim, int isdir)
{
- int error;
+ int ret;
if (d_really_is_negative(victim))
return -ENOENT;
@@ -853,9 +851,9 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
return -EINVAL;
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
- error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
- if (error)
- return error;
+ ret = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
+ if (ret)
+ return ret;
if (IS_APPEND(dir))
return -EPERM;
if (check_sticky(idmap, dir, d_inode(victim)) ||
@@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap,
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *child)
+ struct inode *dir, const struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
@@ -894,39 +892,37 @@ static inline int btrfs_may_create(struct mnt_idmap *idmap,
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
-static noinline int btrfs_mksubvol(const struct path *parent,
+static noinline int btrfs_mksubvol(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
- struct btrfs_root *snap_src,
+ struct qstr *qname, struct btrfs_root *snap_src,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- struct inode *dir = d_inode(parent->dentry);
+ struct inode *dir = d_inode(parent);
struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
struct dentry *dentry;
- struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
- int error;
+ struct fscrypt_str name_str = FSTR_INIT((char *)qname->name, qname->len);
+ int ret;
- error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
- if (error == -EINTR)
- return error;
+ ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
+ if (ret == -EINTR)
+ return ret;
- dentry = lookup_one(idmap, name, parent->dentry, namelen);
- error = PTR_ERR(dentry);
+ dentry = lookup_one(idmap, qname, parent);
+ ret = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
- error = btrfs_may_create(idmap, dir, dentry);
- if (error)
+ ret = btrfs_may_create(idmap, dir, dentry);
+ if (ret)
goto out_dput;
/*
* even if this name doesn't exist, we may get hash collisions.
* check for them now when we can safely fail
*/
- error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
- dir->i_ino, &name_str);
- if (error)
+ ret = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, dir->i_ino, &name_str);
+ if (ret)
goto out_dput;
down_read(&fs_info->subvol_sem);
@@ -935,11 +931,11 @@ static noinline int btrfs_mksubvol(const struct path *parent,
goto out_up_read;
if (snap_src)
- error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
+ ret = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
- error = create_subvol(idmap, dir, dentry, inherit);
+ ret = create_subvol(idmap, dir, dentry, inherit);
- if (!error)
+ if (!ret)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&fs_info->subvol_sem);
@@ -947,12 +943,12 @@ out_dput:
dput(dentry);
out_unlock:
btrfs_inode_unlock(BTRFS_I(dir), 0);
- return error;
+ return ret;
}
-static noinline int btrfs_mksnapshot(const struct path *parent,
+static noinline int btrfs_mksnapshot(struct dentry *parent,
struct mnt_idmap *idmap,
- const char *name, int namelen,
+ struct qstr *qname,
struct btrfs_root *root,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
@@ -979,8 +975,8 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
- ret = btrfs_mksubvol(parent, idmap, name, namelen,
- root, readonly, inherit);
+ ret = btrfs_mksubvol(parent, idmap, qname, root, readonly, inherit);
+
atomic_dec(&root->snapshot_force_cow);
out:
btrfs_drew_read_unlock(&root->snapshot_lock);
@@ -1033,17 +1029,14 @@ static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
- struct inode *inode = file_inode(file);
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 new_size;
u64 old_size;
u64 devid = 1;
- struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
- struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
- char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1111,6 +1104,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
if (!strcmp(sizestr, "max"))
new_size = bdev_nr_bytes(device->bdev);
else {
+ char *retptr;
+
if (sizestr[0] == '-') {
mod = -1;
sizestr++;
@@ -1158,6 +1153,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = round_down(new_size, fs_info->sectorsize);
if (new_size > old_size) {
+ struct btrfs_trans_handle *trans;
+
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -1170,7 +1167,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
} /* equal, nothing need to do */
if (ret == 0 && new_size != old_size)
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"resize device %s (devid %llu) from %llu to %llu",
btrfs_dev_name(device), device->devid,
old_size, new_size);
@@ -1185,12 +1182,12 @@ out_drop:
static noinline int __btrfs_ioctl_snap_create(struct file *file,
struct mnt_idmap *idmap,
- const char *name, unsigned long fd, int subvol,
+ const char *name, unsigned long fd, bool subvol,
bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
- int namelen;
int ret = 0;
+ struct qstr qname = QSTR_INIT(name, strlen(name));
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
@@ -1199,21 +1196,20 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
if (ret)
goto out;
- namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
goto out_drop_write;
}
- if (name[0] == '.' &&
- (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+ if (qname.name[0] == '.' &&
+ (qname.len == 1 || (qname.name[1] == '.' && qname.len == 2))) {
ret = -EEXIST;
goto out_drop_write;
}
if (subvol) {
- ret = btrfs_mksubvol(&file->f_path, idmap, name,
- namelen, NULL, readonly, inherit);
+ ret = btrfs_mksubvol(file_dentry(file), idmap, &qname, NULL,
+ readonly, inherit);
} else {
CLASS(fd, src)(fd);
struct inode *src_inode;
@@ -1243,8 +1239,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
*/
ret = -EINVAL;
} else {
- ret = btrfs_mksnapshot(&file->f_path, idmap,
- name, namelen,
+ ret = btrfs_mksnapshot(file_dentry(file), idmap, &qname,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
@@ -1281,7 +1276,7 @@ out:
}
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
- void __user *arg, int subvol)
+ void __user *arg, bool subvol)
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
@@ -1336,15 +1331,15 @@ free_args:
return ret;
}
-static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode,
+static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
void __user *arg)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
u64 flags = 0;
- if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
+ if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&fs_info->subvol_sem);
@@ -1447,8 +1442,8 @@ out:
return ret;
}
-static noinline int key_in_sk(struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk)
+static noinline bool key_in_sk(const struct btrfs_key *key,
+ const struct btrfs_ioctl_search_key *sk)
{
struct btrfs_key test;
int ret;
@@ -1459,7 +1454,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
ret = btrfs_comp_cpu_keys(key, &test);
if (ret < 0)
- return 0;
+ return false;
test.objectid = sk->max_objectid;
test.type = sk->max_type;
@@ -1467,13 +1462,13 @@ static noinline int key_in_sk(struct btrfs_key *key,
ret = btrfs_comp_cpu_keys(key, &test);
if (ret > 0)
- return 0;
- return 1;
+ return false;
+ return true;
}
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
- struct btrfs_ioctl_search_key *sk,
+ const struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
@@ -1530,8 +1525,8 @@ static noinline int copy_to_sk(struct btrfs_path *path,
}
sh.objectid = key->objectid;
- sh.offset = key->offset;
sh.type = key->type;
+ sh.offset = key->offset;
sh.len = item_len;
sh.transid = found_transid;
@@ -1604,13 +1599,12 @@ out:
return ret;
}
-static noinline int search_ioctl(struct inode *inode,
+static noinline int search_ioctl(struct btrfs_root *root,
struct btrfs_ioctl_search_key *sk,
u64 *buf_size,
char __user *ubuf)
{
- struct btrfs_fs_info *info = inode_to_fs_info(inode);
- struct btrfs_root *root;
+ struct btrfs_fs_info *info = root->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
int ret;
@@ -1627,9 +1621,10 @@ static noinline int search_ioctl(struct inode *inode,
return -ENOMEM;
if (sk->tree_id == 0) {
- /* search the root of the inode that was passed */
- root = btrfs_grab_root(BTRFS_I(inode)->root);
+ /* Search the root that we got passed. */
+ root = btrfs_grab_root(root);
} else {
+ /* Look up the root from the arguments. */
root = btrfs_get_fs_root(info, sk->tree_id, true);
if (IS_ERR(root)) {
btrfs_free_path(path);
@@ -1642,21 +1637,19 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
- ret = -EFAULT;
/*
* Ensure that the whole user buffer is faulted in at sub-page
* granularity, otherwise the loop may live-lock.
*/
- if (fault_in_subpage_writeable(ubuf + sk_offset,
- *buf_size - sk_offset))
+ if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) {
+ ret = -EFAULT;
break;
+ }
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
- if (ret != 0) {
- if (ret > 0)
- ret = 0;
- goto err;
- }
+ if (ret)
+ break;
+
ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
@@ -1664,16 +1657,17 @@ static noinline int search_ioctl(struct inode *inode,
break;
}
+ /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */
if (ret > 0)
ret = 0;
-err:
+
sk->nr_items = num_found;
btrfs_put_root(root);
btrfs_free_path(path);
return ret;
}
-static noinline int btrfs_ioctl_tree_search(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs = argp;
@@ -1689,7 +1683,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
buf_size = sizeof(uargs->buf);
- ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+ ret = search_ioctl(root, &sk, &buf_size, uargs->buf);
/*
* In the origin implementation an overflow is handled by returning a
@@ -1703,7 +1697,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
return ret;
}
-static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
+static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
@@ -1725,7 +1719,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
if (buf_size > buf_limit)
buf_size = buf_limit;
- ret = search_ioctl(inode, &args.key, &buf_size,
+ ret = search_ioctl(root, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
ret = -EFAULT;
@@ -1833,7 +1827,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
- struct inode *temp_inode;
char *ptr;
int slot;
int len;
@@ -1861,6 +1854,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *temp_inode;
+
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
goto out_put;
@@ -1915,9 +1910,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
ret = PTR_ERR(temp_inode);
goto out_put;
}
- ret = inode_permission(idmap, temp_inode,
+ ret = inode_permission(idmap, &temp_inode->vfs_inode,
MAY_READ | MAY_EXEC);
- iput(temp_inode);
+ iput(&temp_inode->vfs_inode);
if (ret) {
ret = -EACCES;
goto out_put;
@@ -2289,7 +2284,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
struct mnt_idmap *idmap = file_mnt_idmap(file);
char *subvol_name, *subvol_name_ptr = NULL;
- int subvol_namelen;
int ret = 0;
bool destroy_parent = false;
@@ -2412,10 +2406,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out;
}
- subvol_namelen = strlen(subvol_name);
-
if (strchr(subvol_name, '/') ||
- strncmp(subvol_name, "..", subvol_namelen) == 0) {
+ strcmp(subvol_name, "..") == 0) {
ret = -EINVAL;
goto free_subvol_name;
}
@@ -2428,7 +2420,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (ret == -EINTR)
goto free_subvol_name;
- dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen);
+ dentry = lookup_one(idmap, &QSTR(subvol_name), parent);
if (IS_ERR(dentry)) {
ret = PTR_ERR(dentry);
goto out_unlock_dir;
@@ -2562,8 +2554,14 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EOPNOTSUPP;
goto out;
}
- /* compression requires us to start the IO */
- if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) &&
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Compression or no-compression require to start the IO. */
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS) ||
+ (range.flags & BTRFS_DEFRAG_RANGE_NOCOMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
range.extent_thresh = (u32)-1;
}
@@ -2571,7 +2569,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
/* the rest are all set to zero by kzalloc */
range.len = (u64)-1;
}
- ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
+ ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra,
&range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
@@ -2704,7 +2702,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
err_drop:
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
kfree(vol_args);
@@ -2755,7 +2753,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
mnt_drop_write_file(file);
if (bdev_file)
- fput(bdev_file);
+ bdev_fput(bdev_file);
out:
btrfs_put_dev_args_from_path(&args);
out_free:
@@ -2763,7 +2761,7 @@ out_free:
return ret;
}
-static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
@@ -2817,7 +2815,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
return ret;
}
-static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
+static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
void __user *arg)
{
BTRFS_DEV_LOOKUP_ARGS(args);
@@ -2894,7 +2892,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(btrfs_root_id(new_root))) {
+ if (!btrfs_is_fstree(btrfs_root_id(new_root))) {
ret = -ENOENT;
goto out_free;
}
@@ -3143,7 +3141,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
return -EPERM;
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
- btrfs_err(fs_info, "scrub is not supported on extent tree v2 yet");
+ btrfs_err(fs_info, "scrub: extent tree v2 not yet supported");
return -EINVAL;
}
@@ -3361,7 +3359,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
int size;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
- struct btrfs_path *path = NULL;
bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
@@ -3395,14 +3392,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
goto out_loi;
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
- ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
- inodes, ignore_offset);
- btrfs_free_path(path);
+ ret = iterate_inodes_from_logical(loi->logical, fs_info, inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -3719,22 +3709,6 @@ drop_write:
return ret;
}
-/*
- * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
- * done before any operations.
- */
-static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
-{
- bool ret = true;
-
- mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!fs_info->quota_root)
- ret = false;
- mutex_unlock(&fs_info->qgroup_ioctl_lock);
-
- return ret;
-}
-
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
@@ -3749,7 +3723,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3819,7 +3793,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3837,7 +3811,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
- if (sa->create && is_fstree(sa->qgroupid)) {
+ if (sa->create && btrfs_is_fstree(sa->qgroupid)) {
ret = -EINVAL;
goto out;
}
@@ -3878,7 +3852,7 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(root->fs_info))
+ if (!btrfs_qgroup_enabled(root->fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -3926,7 +3900,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!qgroup_enabled(fs_info))
+ if (!btrfs_qgroup_enabled(fs_info))
return -ENOTCONN;
ret = mnt_want_write_file(file);
@@ -4204,7 +4178,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
}
spin_lock(&fs_info->super_lock);
- strcpy(super_block->label, label);
+ strscpy(super_block->label, label);
spin_unlock(&fs_info->super_lock);
ret = btrfs_commit_transaction(trans);
@@ -4248,7 +4222,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
return 0;
}
-static int check_feature_bits(struct btrfs_fs_info *fs_info,
+static int check_feature_bits(const struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
@@ -4384,7 +4358,7 @@ out_drop_write:
return ret;
}
-static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat)
+static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
@@ -4415,7 +4389,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool
if (IS_ERR(arg))
return PTR_ERR(arg);
}
- ret = btrfs_ioctl_send(inode, arg);
+ ret = btrfs_ioctl_send(root, arg);
kfree(arg);
return ret;
}
@@ -4511,7 +4485,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
args.compression, &unlocked);
if (!unlocked) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
}
@@ -4633,6 +4607,13 @@ out_acct:
return ret;
}
+struct btrfs_uring_encoded_data {
+ struct btrfs_ioctl_encoded_io_args args;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov;
+ struct iov_iter iter;
+};
+
/*
* Context that's attached to an encoded read io_uring command, in cmd->pdu. It
* contains the fields in btrfs_uring_read_extent that are necessary to finish
@@ -4654,6 +4635,7 @@ struct btrfs_uring_priv {
};
struct io_btrfs_cmd {
+ struct btrfs_uring_encoded_data *data;
struct btrfs_uring_priv *priv;
};
@@ -4663,7 +4645,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
struct btrfs_uring_priv *priv = bc->priv;
struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
- unsigned long index;
+ pgoff_t index;
u64 cur;
size_t page_offset;
ssize_t ret;
@@ -4700,7 +4682,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
ret = priv->count;
out:
- unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
io_uring_cmd_done(cmd, ret, 0, issue_flags);
@@ -4712,6 +4694,7 @@ out:
kfree(priv->pages);
kfree(priv->iov);
kfree(priv);
+ kfree(bc->data);
}
void btrfs_uring_read_extent_endio(void *ctx, int err)
@@ -4789,19 +4772,12 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
return -EIOCBQUEUED;
out_fail:
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
kfree(priv);
return ret;
}
-struct btrfs_uring_encoded_data {
- struct btrfs_ioctl_encoded_io_args args;
- struct iovec iovstack[UIO_FASTIOV];
- struct iovec *iov;
- struct iov_iter iter;
-};
-
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
@@ -4817,7 +4793,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
struct extent_state *cached_state = NULL;
u64 start, lockend;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4833,7 +4813,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
#else
- return -ENOTTY;
+ ret = -ENOTTY;
+ goto out_acct;
#endif
} else {
copy_end = copy_end_kernel;
@@ -4846,7 +4827,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -4903,6 +4884,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
&disk_bytenr, &disk_io_size);
+ if (ret == -EAGAIN)
+ goto out_acct;
if (ret < 0 && ret != -EIOCBQUEUED)
goto out_free;
@@ -4912,7 +4895,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
(const char *)&data->args + copy_end_kernel,
sizeof(data->args) - copy_end_kernel)) {
if (ret == -EIOCBQUEUED) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
}
ret = -EFAULT;
@@ -4942,6 +4925,9 @@ out_acct:
add_rchar(current, ret);
inc_syscr(current);
+ if (ret != -EIOCBQUEUED && ret != -EAGAIN)
+ kfree(data);
+
return ret;
}
@@ -4952,7 +4938,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
struct file *file;
ssize_t ret;
void __user *sqe_addr;
- struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_encoded_data *data = NULL;
+
+ if (cmd->flags & IORING_URING_CMD_REISSUE)
+ data = bc->data;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4974,7 +4964,7 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
goto out_acct;
}
- io_uring_cmd_get_async_data(cmd)->op_data = data;
+ bc->data = data;
if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
@@ -5064,6 +5054,9 @@ out_acct:
if (ret > 0)
add_wchar(current, ret);
inc_syscw(current);
+
+ if (ret != -EAGAIN)
+ kfree(data);
return ret;
}
@@ -5242,7 +5235,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_SNAP_DESTROY_V2:
return btrfs_ioctl_snap_destroy(file, argp, true);
case BTRFS_IOC_SUBVOL_GETFLAGS:
- return btrfs_ioctl_subvol_getflags(inode, argp);
+ return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
@@ -5264,9 +5257,9 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_TREE_SEARCH:
- return btrfs_ioctl_tree_search(inode, argp);
+ return btrfs_ioctl_tree_search(root, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
- return btrfs_ioctl_tree_search_v2(inode, argp);
+ return btrfs_ioctl_tree_search_v2(root, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(root, argp);
case BTRFS_IOC_INO_PATHS:
@@ -5314,10 +5307,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, false);
+ return _btrfs_ioctl_send(root, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
- return _btrfs_ioctl_send(BTRFS_I(inode), argp, true);
+ return _btrfs_ioctl_send(root, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ce915fcda43b..ccf6bed9cc24 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -8,17 +8,19 @@
struct file;
struct dentry;
struct mnt_idmap;
-struct fileattr;
+struct file_kattr;
+struct io_uring_cmd;
+struct btrfs_inode;
struct btrfs_fs_info;
struct btrfs_ioctl_balance_args;
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int btrfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
-void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
+void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 9a7a7b723305..a3e6d9616e60 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
-#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
@@ -150,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti
/*
* Try-lock for read.
*
- * Return 1 if the rwlock has been taken, 0 otherwise
+ * Return true if the rwlock has been taken, false otherwise
*/
-int btrfs_try_tree_read_lock(struct extent_buffer *eb)
+bool btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
if (down_read_trylock(&eb->lock)) {
trace_btrfs_try_tree_read_lock(eb);
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index c69e57ff804b..af29df98ac14 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -189,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
}
void btrfs_tree_read_unlock(struct extent_buffer *eb);
-int btrfs_try_tree_read_lock(struct extent_buffer *eb);
+bool btrfs_try_tree_read_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index a45bc11f8665..d403641889ca 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -252,9 +252,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Compress at most one sector of data each time */
in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off);
ASSERT(in_len);
- data_in = kmap_local_folio(folio_in, 0);
- ret = lzo1x_1_compress(data_in +
- offset_in_page(cur_in), in_len,
+ data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in));
+ ret = lzo1x_1_compress(data_in, in_len,
workspace->cbuf, &out_len,
workspace->mem);
kunmap_local(data_in);
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 08a9272399d2..022ebc89af85 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -4,6 +4,7 @@
#define BTRFS_MESSAGES_H
#include <linux/types.h>
+#include <linux/types.h>
#include <linux/printk.h>
#include <linux/bug.h>
@@ -36,106 +37,46 @@ void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
btrfs_no_printk(fs_info, fmt, ##args)
#endif
-#define btrfs_emerg(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_INFO fmt, ##args)
-
/*
- * Wrappers that use printk_in_rcu
+ * Print a message with filesystem info, enclosed in RCU protection.
*/
-#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+#define btrfs_crit(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+#define btrfs_err(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+#define btrfs_warn(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
/*
- * Wrappers that use a ratelimited printk_in_rcu
- */
-#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
-#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
-#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
-#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
-#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
-
-/*
* Wrappers that use a ratelimited printk
*/
-#define btrfs_emerg_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
-#define btrfs_alert_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
-#define btrfs_notice_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_info_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk, \
- fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
- fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
+ _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#else
-#define btrfs_debug(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
-#define btrfs_debug_rl(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+/* When printk() is no_printk(), expand to no-op. */
+#define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0)
+#define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0)
#endif
#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
@@ -145,40 +86,97 @@ do { \
rcu_read_unlock(); \
} while (0)
-#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_no_printk(fs_info, fmt, ##args); \
- rcu_read_unlock(); \
-} while (0)
-
-#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
+ \
+ rcu_read_lock(); \
if (__ratelimit(&_rs)) \
btrfs_printk(fs_info, fmt, ##args); \
-} while (0)
-
-#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
-do { \
- rcu_read_lock(); \
- btrfs_printk_ratelimited(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
#ifdef CONFIG_BTRFS_ASSERT
-#define btrfs_assertfail(expr, file, line) ({ \
- pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \
- BUG(); \
-})
+__printf(1, 2)
+static inline void verify_assert_printk_format(const char *fmt, ...) {
+ /* Stub to verify the assertion format string. */
+}
+
+/* Take the first token if any. */
+#define __FIRST_ARG(_, ...) _
+/*
+ * Skip the first token and return the rest, if it's empty the comma is dropped.
+ * As ##__VA_ARGS__ cannot be at the beginning of the macro the __VA_OPT__ is needed
+ * and supported since GCC 8 and Clang 12.
+ */
+#define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__
+
+#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000
+/*
+ * Assertion with optional printk() format.
+ *
+ * Accepted syntax:
+ * ASSERT(condition);
+ * ASSERT(condition, "string");
+ * ASSERT(condition, "variable=%d", variable);
+ *
+ * How it works:
+ * - if there's no format string, ""[0] evaluates at compile time to 0 and the
+ * true branch is executed
+ * - any non-empty format string with the "" prefix evaluates to != 0 at
+ * compile time and the false branch is executed
+ * - stringified condition is printed as %s so we don't accidentally mix format
+ * strings (the % operator)
+ * - there can be only one printk() call, so the format strings and arguments are
+ * spliced together:
+ * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS]
+ * - comma between DEFAULT_ARGS and USER_ARGS is handled by preprocessor
+ * (requires __VA_OPT__ support)
+ * - otherwise we could use __VA_OPT(,) __VA_ARGS__ for the 2nd+ argument of args,
+ */
+#define ASSERT(cond, args...) \
+do { \
+ verify_assert_printk_format("check the format string" args); \
+ if (!likely(cond)) { \
+ if (("" __FIRST_ARG(args) [0]) == 0) { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
+ #cond, (long)(cond), __FILE__, __LINE__); \
+ } else { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \
+ #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \
+ } \
+ BUG(); \
+ } \
+} while(0)
+
+#else
+
+/* For GCC < 8.x only the simple output. */
+
+#define ASSERT(cond, args...) \
+do { \
+ verify_assert_printk_format("check the format string" args); \
+ if (!likely(cond)) { \
+ pr_err("assertion failed: %s :: %ld, in %s:%d\n", \
+ #cond, (long)(cond), __FILE__, __LINE__); \
+ BUG(); \
+ } \
+} while(0)
+
+#endif
+
+#else
+#define ASSERT(cond, args...) (void)(cond)
+#endif
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
+#ifdef CONFIG_BTRFS_DEBUG
+/* Verbose warning only under debug build. */
+#define DEBUG_WARN(args...) WARN(1, KERN_ERR args)
#else
-#define ASSERT(expr) (void)(expr)
+#define DEBUG_WARN(...) do {} while(0)
#endif
__printf(5, 6)
diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h
index 0d599fd847c9..ff5eac84d819 100644
--- a/fs/btrfs/misc.h
+++ b/fs/btrfs/misc.h
@@ -7,6 +7,8 @@
#include <linux/bitmap.h>
#include <linux/sched.h>
#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/math64.h>
#include <linux/rbtree.h>
@@ -119,28 +121,23 @@ static inline struct rb_node *rb_simple_search_first(const struct rb_root *root,
return ret;
}
-static inline struct rb_node *rb_simple_insert(struct rb_root *root, u64 bytenr,
- struct rb_node *node)
+static int rb_simple_node_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct rb_simple_node *entry;
+ struct rb_simple_node *new_entry = rb_entry(new, struct rb_simple_node, rb_node);
+ struct rb_simple_node *existing_entry = rb_entry(existing, struct rb_simple_node, rb_node);
- while (*p) {
- parent = *p;
- entry = rb_entry(parent, struct rb_simple_node, rb_node);
+ if (new_entry->bytenr < existing_entry->bytenr)
+ return -1;
+ else if (new_entry->bytenr > existing_entry->bytenr)
+ return 1;
- if (bytenr < entry->bytenr)
- p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
- p = &(*p)->rb_right;
- else
- return parent;
- }
+ return 0;
+}
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+static inline struct rb_node *rb_simple_insert(struct rb_root *root,
+ struct rb_simple_node *simple_node)
+{
+ return rb_find_add(&simple_node->rb_node, root, rb_simple_node_bytenr_cmp);
}
static inline bool bitmap_test_range_all_set(const unsigned long *addr,
@@ -163,4 +160,9 @@ static inline bool bitmap_test_range_all_zero(const unsigned long *addr,
return (found_set == start + nbits);
}
+static inline u64 folio_end(struct folio *folio)
+{
+ return folio_pos(folio) + folio_size(folio);
+}
+
#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 30eceaf829a7..2829f20d7bb5 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
struct btrfs_ordered_extent *entry;
int ret;
u64 qgroup_rsv = 0;
+ const bool is_nocow = (flags &
+ ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC)));
- if (flags &
- ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
- /* For nocow write, we can release the qgroup rsv right now */
+ /*
+ * For a NOCOW write we can free the qgroup reserve right now. For a COW
+ * one we transfer the reserved space from the inode's iotree into the
+ * ordered extent by calling btrfs_qgroup_release_data() and tracking
+ * the qgroup reserved amount in the ordered extent, so that later after
+ * completing the ordered extent, when running the data delayed ref it
+ * creates, we free the reserved data with btrfs_qgroup_free_refroot().
+ */
+ if (is_nocow)
ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- } else {
- /*
- * The ordered extent has reserved qgroup space, release now
- * and pass the reserved number for qgroup_record to free.
- */
+ else
ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
- if (ret < 0)
- return ERR_PTR(ret);
- }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+
entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
- if (!entry)
- return ERR_PTR(-ENOMEM);
+ if (!entry) {
+ entry = ERR_PTR(-ENOMEM);
+ goto out;
+ }
entry->file_offset = file_offset;
entry->num_bytes = num_bytes;
@@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
entry->disk_num_bytes = disk_num_bytes;
entry->offset = offset;
entry->bytes_left = num_bytes;
- entry->inode = BTRFS_I(igrab(&inode->vfs_inode));
+ if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) {
+ kmem_cache_free(btrfs_ordered_extent_cache, entry);
+ entry = ERR_PTR(-ESTALE);
+ goto out;
+ }
+ entry->inode = inode;
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
entry->qgroup_rsv = qgroup_rsv;
@@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
btrfs_mod_outstanding_extents(inode, 1);
spin_unlock(&inode->lock);
+out:
+ if (IS_ERR(entry) && !is_nocow)
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ btrfs_root_id(inode->root),
+ qgroup_rsv, BTRFS_QGROUP_RSV_DATA);
+
return entry;
}
@@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry)
* @disk_bytenr: Offset of extent on disk.
* @disk_num_bytes: Size of extent on disk.
* @offset: Offset into unencoded data where file data starts.
- * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*).
+ * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*).
* @compress_type: Compression algorithm used for data.
*
* Most of these parameters correspond to &struct btrfs_file_extent_item. The
@@ -343,7 +359,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
if (folio) {
ASSERT(folio->mapping);
ASSERT(folio_pos(folio) <= file_offset);
- ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
+ ASSERT(file_offset + len <= folio_end(folio));
/*
* Ordered flag indicates whether we still have
@@ -607,23 +623,18 @@ out:
*/
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
{
- struct list_head *cur;
- struct btrfs_ordered_sum *sum;
-
trace_btrfs_ordered_extent_put(entry->inode, entry);
if (refcount_dec_and_test(&entry->refs)) {
+ struct btrfs_ordered_sum *sum;
+ struct btrfs_ordered_sum *tmp;
+
ASSERT(list_empty(&entry->root_extent_list));
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
- if (entry->inode)
- btrfs_add_delayed_iput(entry->inode);
- while (!list_empty(&entry->list)) {
- cur = entry->list.next;
- sum = list_entry(cur, struct btrfs_ordered_sum, list);
- list_del(&sum->list);
+ btrfs_add_delayed_iput(entry->inode);
+ list_for_each_entry_safe(sum, tmp, &entry->list, list)
kvfree(sum);
- }
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
}
@@ -842,10 +853,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
/*
* Start IO and wait for a given ordered extent to finish.
*
- * Wait on page writeback for all the pages in the extent and the IO completion
- * code to insert metadata into the btree corresponding to the extent.
+ * Wait on page writeback for all the pages in the extent but not in
+ * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the
+ * IO completion code to insert metadata into the btree corresponding to the extent.
*/
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len)
{
u64 start = entry->file_offset;
u64 end = start + entry->num_bytes - 1;
@@ -865,8 +878,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
* start IO on any dirty ones so the wait doesn't stall waiting
* for the flusher thread to find them
*/
- if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
- filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) {
+ if (!nowriteback_len) {
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
+ } else {
+ if (start < nowriteback_start)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start,
+ nowriteback_start - 1);
+ if (nowriteback_start + nowriteback_len < end)
+ filemap_fdatawrite_range(inode->vfs_inode.i_mapping,
+ nowriteback_start + nowriteback_len,
+ end);
+ }
+ }
if (!freespace_inode)
btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent);
@@ -1160,7 +1184,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
cachedp = cached_state;
while (1) {
- lock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_lock_extent(&inode->io_tree, start, end, cachedp);
ordered = btrfs_lookup_ordered_range(inode, start,
end - start + 1);
if (!ordered) {
@@ -1173,7 +1197,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
refcount_dec(&cache->refs);
break;
}
- unlock_extent(&inode->io_tree, start, end, cachedp);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cachedp);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -1191,7 +1215,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
{
struct btrfs_ordered_extent *ordered;
- if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
+ if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state))
return false;
ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@@ -1199,7 +1223,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
return true;
btrfs_put_ordered_extent(ordered);
- unlock_extent(&inode->io_tree, start, end, cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, cached_state);
return false;
}
@@ -1229,6 +1253,18 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent(
*/
if (WARN_ON_ONCE(len >= ordered->num_bytes))
return ERR_PTR(-EINVAL);
+ /*
+ * If our ordered extent had an error there's no point in continuing.
+ * The error may have come from a transaction abort done either by this
+ * task or some other concurrent task, and the transaction abort path
+ * iterates over all existing ordered extents and sets the flag
+ * BTRFS_ORDERED_IOERR on them.
+ */
+ if (unlikely(flags & (1U << BTRFS_ORDERED_IOERR))) {
+ const int fs_error = BTRFS_FS_ERROR(fs_info);
+
+ return fs_error ? ERR_PTR(fs_error) : ERR_PTR(-EIO);
+ }
/* We cannot split partially completed ordered extents. */
if (ordered->bytes_left) {
ASSERT(!(flags & ~BTRFS_ORDERED_TYPE_FLAGS));
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4e152736d06c..1e6b0b182b29 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -17,6 +17,7 @@
struct inode;
struct page;
struct extent_state;
+struct btrfs_block_group;
struct btrfs_inode;
struct btrfs_root;
struct btrfs_fs_info;
@@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
struct btrfs_ordered_sum *sum);
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
u64 file_offset);
-void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry,
+ u64 nowriteback_start, u32 nowriteback_len);
+static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+ return btrfs_start_ordered_extent_nowriteback(entry, 0, 0);
+}
+
int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fc821aa446f0..74e38da9bd39 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -190,7 +190,7 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset,
u32 item_size)
{
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("BTRFS: uuid item with illegal size %lu!\n",
+ btrfs_warn(l->fs_info, "uuid item with illegal size %lu",
(unsigned long)item_size);
return;
}
@@ -223,7 +223,7 @@ static void print_eb_refs_lock(const struct extent_buffer *eb)
{
#ifdef CONFIG_BTRFS_DEBUG
btrfs_info(eb->fs_info, "refs %u lock_owner %u current %u",
- atomic_read(&eb->refs), eb->lock_owner, current->pid);
+ refcount_read(&eb->refs), eb->lock_owner, current->pid);
#endif
}
diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
index 8504bf1702c7..d0e620bf5f5a 100644
--- a/fs/btrfs/print-tree.h
+++ b/fs/btrfs/print-tree.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_PRINT_TREE_H
#define BTRFS_PRINT_TREE_H
+#include <linux/types.h>
+
/* Buffer size to contain tree name and possibly additional data (offset) */
#define BTRFS_ROOT_NAME_BUF_LEN 48
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index b8fa34e16abb..adc956432d2f 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -26,8 +26,8 @@ struct prop_handler {
const char *xattr_name;
int (*validate)(const struct btrfs_inode *inode, const char *value,
size_t len);
- int (*apply)(struct inode *inode, const char *value, size_t len);
- const char *(*extract)(const struct inode *inode);
+ int (*apply)(struct btrfs_inode *inode, const char *value, size_t len);
+ const char *(*extract)(const struct btrfs_inode *inode);
bool (*ignore)(const struct btrfs_inode *inode);
int inheritable;
};
@@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, NULL, 0);
+ ret = handler->apply(inode, NULL, 0);
ASSERT(ret == 0);
return ret;
@@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode,
value_len, flags);
if (ret)
return ret;
- ret = handler->apply(&inode->vfs_inode, value, value_len);
+ ret = handler->apply(inode, value, value_len);
if (ret) {
btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL,
0, flags);
@@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx,
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
- ret = handler->apply(inode, value, len);
+ ret = handler->apply(BTRFS_I(inode), value, len);
if (unlikely(ret))
btrfs_warn(root->fs_info,
"error applying prop %s to ino %llu (root %llu): %d",
@@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx,
set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
}
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ struct btrfs_root *root = inode->root;
+ u64 ino = btrfs_ino(inode);
- return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+ return iterate_object_props(root, path, ino, inode_prop_iterator,
+ &inode->vfs_inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
@@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode,
return -EINVAL;
}
-static int prop_compression_apply(struct inode *inode, const char *value,
+static int prop_compression_apply(struct btrfs_inode *inode, const char *value,
size_t len)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
int type;
/* Reset to defaults */
if (len == 0) {
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
/* Set NOCOMPRESS flag */
if ((len == 2 && strncmp("no", value, 2) == 0) ||
(len == 4 && strncmp("none", value, 4) == 0)) {
- BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE;
+ inode->flags |= BTRFS_INODE_NOCOMPRESS;
+ inode->flags &= ~BTRFS_INODE_COMPRESS;
+ inode->prop_compress = BTRFS_COMPRESS_NONE;
return 0;
}
@@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value,
return -EINVAL;
}
- BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
- BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
- BTRFS_I(inode)->prop_compress = type;
+ inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ inode->flags |= BTRFS_INODE_COMPRESS;
+ inode->prop_compress = type;
return 0;
}
@@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode)
return false;
}
-static const char *prop_compression_extract(const struct inode *inode)
+static const char *prop_compression_extract(const struct btrfs_inode *inode)
{
- switch (BTRFS_I(inode)->prop_compress) {
+ switch (inode->prop_compress) {
case BTRFS_COMPRESS_ZLIB:
case BTRFS_COMPRESS_LZO:
case BTRFS_COMPRESS_ZSTD:
- return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress);
+ return btrfs_compress_type2str(inode->prop_compress);
default:
break;
}
@@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = {
};
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode, const struct inode *parent)
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *parent)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
int i;
bool need_reserve = false;
- if (!test_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(parent)->runtime_flags))
+ if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags))
return 0;
for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
@@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
if (!h->inheritable)
continue;
- if (h->ignore(BTRFS_I(inode)))
+ if (h->ignore(inode))
continue;
value = h->extract(parent);
@@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
* This is not strictly necessary as the property should be
* valid, but in case it isn't, don't propagate it further.
*/
- ret = h->validate(BTRFS_I(inode), value, strlen(value));
+ ret = h->validate(inode, value, strlen(value));
if (ret)
continue;
@@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_setxattr(trans, inode, h->xattr_name, value,
+ ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value,
strlen(value), 0);
if (!ret) {
ret = h->apply(inode, value, strlen(value));
if (ret)
- btrfs_setxattr(trans, inode, h->xattr_name,
+ btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name,
NULL, 0, 0);
else
- set_bit(BTRFS_INODE_HAS_PROPS,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags);
}
if (need_reserve) {
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
index 63546d0a9444..15d9a025c923 100644
--- a/fs/btrfs/props.h
+++ b/fs/btrfs/props.h
@@ -6,9 +6,9 @@
#ifndef BTRFS_PROPS_H
#define BTRFS_PROPS_H
+#include <linux/types.h>
#include <linux/compiler_types.h>
-struct inode;
struct btrfs_inode;
struct btrfs_path;
struct btrfs_trans_handle;
@@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name,
const char *value, size_t value_len);
bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name);
-int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path);
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
- struct inode *inode,
- const struct inode *dir);
+ struct btrfs_inode *inode,
+ const struct btrfs_inode *dir);
#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b90fabe302e6..ccaa9a3cf1ce 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
+ trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
qgroup->rsv.values[type] += num_bytes;
}
@@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
+ trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
if (qgroup->rsv.values[type] >= num_bytes) {
qgroup->rsv.values[type] -= num_bytes;
return;
@@ -160,23 +160,34 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);
+static int btrfs_qgroup_qgroupid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *qgroupid = key;
+ const struct btrfs_qgroup *qgroup = rb_entry(node, struct btrfs_qgroup, node);
+
+ if (qgroup->qgroupid < *qgroupid)
+ return -1;
+ else if (qgroup->qgroupid > *qgroupid)
+ return 1;
+
+ return 0;
+}
+
/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(const struct btrfs_fs_info *fs_info,
u64 qgroupid)
{
- struct rb_node *n = fs_info->qgroup_tree.rb_node;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
- while (n) {
- qgroup = rb_entry(n, struct btrfs_qgroup, node);
- if (qgroup->qgroupid < qgroupid)
- n = n->rb_left;
- else if (qgroup->qgroupid > qgroupid)
- n = n->rb_right;
- else
- return qgroup;
- }
- return NULL;
+ node = rb_find(&qgroupid, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_key_cmp);
+ return rb_entry_safe(node, struct btrfs_qgroup, node);
+}
+
+static int btrfs_qgroup_qgroupid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup *new_qgroup = rb_entry(new, struct btrfs_qgroup, node);
+
+ return btrfs_qgroup_qgroupid_key_cmp(&new_qgroup->qgroupid, existing);
}
/*
@@ -191,39 +202,25 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *prealloc,
u64 qgroupid)
{
- struct rb_node **p = &fs_info->qgroup_tree.rb_node;
- struct rb_node *parent = NULL;
- struct btrfs_qgroup *qgroup;
+ struct rb_node *node;
/* Caller must have pre-allocated @prealloc. */
ASSERT(prealloc);
- while (*p) {
- parent = *p;
- qgroup = rb_entry(parent, struct btrfs_qgroup, node);
-
- if (qgroup->qgroupid < qgroupid) {
- p = &(*p)->rb_left;
- } else if (qgroup->qgroupid > qgroupid) {
- p = &(*p)->rb_right;
- } else {
- kfree(prealloc);
- return qgroup;
- }
+ prealloc->qgroupid = qgroupid;
+ node = rb_find_add(&prealloc->node, &fs_info->qgroup_tree, btrfs_qgroup_qgroupid_cmp);
+ if (node) {
+ kfree(prealloc);
+ return rb_entry(node, struct btrfs_qgroup, node);
}
- qgroup = prealloc;
- qgroup->qgroupid = qgroupid;
- INIT_LIST_HEAD(&qgroup->groups);
- INIT_LIST_HEAD(&qgroup->members);
- INIT_LIST_HEAD(&qgroup->dirty);
- INIT_LIST_HEAD(&qgroup->iterator);
- INIT_LIST_HEAD(&qgroup->nested_iterator);
+ INIT_LIST_HEAD(&prealloc->groups);
+ INIT_LIST_HEAD(&prealloc->members);
+ INIT_LIST_HEAD(&prealloc->dirty);
+ INIT_LIST_HEAD(&prealloc->iterator);
+ INIT_LIST_HEAD(&prealloc->nested_iterator);
- rb_link_node(&qgroup->node, parent, p);
- rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);
-
- return qgroup;
+ return prealloc;
}
static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
@@ -349,13 +346,27 @@ int btrfs_verify_qgroup_counts(const struct btrfs_fs_info *fs_info, u64 qgroupid
}
#endif
-static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info)
+__printf(2, 3)
+static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
+ const u64 old_flags = fs_info->qgroup_flags;
+
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE)
return;
fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT |
BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN |
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING);
+ if (!(old_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ btrfs_warn_rl(fs_info, "qgroup marked inconsistent, %pV", &vaf);
+ va_end(args);
+ }
}
static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info,
@@ -386,12 +397,6 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if (!fs_info->quota_root)
return 0;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
@@ -434,13 +439,10 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
goto out;
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) {
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE)
qgroup_read_enable_gen(fs_info, l, slot, ptr);
- } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_err(fs_info,
- "qgroup generation mismatch, marked as inconsistent");
- }
+ else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation)
+ qgroup_mark_inconsistent(fs_info, "qgroup generation mismatch");
rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
goto next1;
}
@@ -451,10 +453,8 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
- (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- btrfs_err(fs_info, "inconsistent qgroup config");
- qgroup_mark_inconsistent(fs_info);
- }
+ (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY))
+ qgroup_mark_inconsistent(fs_info, "inconsistent qgroup config");
if (!qgroup) {
struct btrfs_qgroup *prealloc;
struct btrfs_root *tree_root = fs_info->tree_root;
@@ -476,7 +476,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
* during mount before we start doing things like creating
* subvolumes.
*/
- if (is_fstree(qgroup->qgroupid) &&
+ if (btrfs_is_fstree(qgroup->qgroupid) &&
qgroup->qgroupid > tree_root->free_objectid)
/*
* Don't need to check against BTRFS_LAST_FREE_OBJECTID,
@@ -581,8 +581,6 @@ out:
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
} else {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -630,29 +628,30 @@ bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info)
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
- * first two are in single-threaded paths.And for the third one, we have set
- * quota_root to be null with qgroup_lock held before, so it is safe to clean
- * up the in-memory structures without qgroup_lock held.
+ * first two are in single-threaded paths.
*/
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
struct rb_node *n;
struct btrfs_qgroup *qgroup;
+ /*
+ * btrfs_quota_disable() can be called concurrently with
+ * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
+ * lock.
+ */
+ spin_lock(&fs_info->qgroup_lock);
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(qgroup);
+ spin_unlock(&fs_info->qgroup_lock);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
+ spin_lock(&fs_info->qgroup_lock);
}
- /*
- * We call btrfs_free_qgroup_config() when unmounting
- * filesystem and disabling quota, so we set qgroup_ulist
- * to be null here to avoid double free.
- */
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ spin_unlock(&fs_info->qgroup_lock);
+
btrfs_sysfs_del_qgroups(fs_info);
}
@@ -956,8 +955,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = 0;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
while (1) {
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
@@ -998,7 +997,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_qgroup *prealloc = NULL;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *ulist = NULL;
const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA);
int ret = 0;
int slot;
@@ -1021,12 +1019,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- ulist = ulist_alloc(GFP_KERNEL);
- if (!ulist) {
- ret = -ENOMEM;
- goto out;
- }
-
ret = btrfs_sysfs_add_qgroups(fs_info);
if (ret < 0)
goto out;
@@ -1066,9 +1058,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist;
- ulist = NULL;
-
/*
* initially create the quota tree
*/
@@ -1155,11 +1144,6 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info,
qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset);
prealloc = NULL;
- if (IS_ERR(qgroup)) {
- ret = PTR_ERR(qgroup);
- btrfs_abort_transaction(trans, ret);
- goto out_free_path;
- }
ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -1272,17 +1256,13 @@ out_free_root:
if (ret)
btrfs_put_root(quota_root);
out:
- if (ret) {
- ulist_free(fs_info->qgroup_ulist);
- fs_info->qgroup_ulist = NULL;
+ if (ret)
btrfs_sysfs_del_qgroups(fs_info);
- }
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (ret && trans)
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
- ulist_free(ulist);
kfree(prealloc);
return ret;
}
@@ -1354,11 +1334,14 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
/*
* We have nothing held here and no trans handle, just return the error
- * if there is one.
+ * if there is one and set back the quota enabled bit since we didn't
+ * actually disable quotas.
*/
ret = flush_reservations(fs_info);
- if (ret)
+ if (ret) {
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
return ret;
+ }
/*
* 1 For the root item
@@ -1470,7 +1453,6 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
struct btrfs_qgroup *src, int sign)
{
struct btrfs_qgroup *qgroup;
- struct btrfs_qgroup *cur;
LIST_HEAD(qgroup_list);
u64 num_bytes = src->excl;
int ret = 0;
@@ -1480,7 +1462,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root,
goto out;
qgroup_iterator_add(&qgroup_list, qgroup);
- list_for_each_entry(cur, &qgroup_list, iterator) {
+ list_for_each_entry(qgroup, &qgroup_list, iterator) {
struct btrfs_qgroup_list *glist;
qgroup->rfer += sign * num_bytes;
@@ -1679,9 +1661,6 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
struct btrfs_qgroup *prealloc = NULL;
int ret = 0;
- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
- return 0;
-
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root) {
ret = -ENOTCONN;
@@ -1823,7 +1802,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] ||
qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
btrfs_warn_rl(fs_info,
"to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu",
btrfs_qgroup_level(qgroup->qgroupid),
@@ -1843,14 +1822,13 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) {
if (qgroup->rfer || qgroup->excl ||
qgroup->rfer_cmpr || qgroup->excl_cmpr) {
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- btrfs_warn_rl(fs_info,
-"to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
- btrfs_qgroup_level(qgroup->qgroupid),
- btrfs_qgroup_subvolid(qgroup->qgroupid),
- qgroup->rfer, qgroup->rfer_cmpr,
- qgroup->excl, qgroup->excl_cmpr);
- qgroup_mark_inconsistent(fs_info);
+ DEBUG_WARN();
+ qgroup_mark_inconsistent(fs_info,
+ "to be deleted qgroup %u/%llu has non-zero numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ qgroup->rfer, qgroup->rfer_cmpr,
+ qgroup->excl, qgroup->excl_cmpr);
}
}
del_qgroup_rb(fs_info, qgroupid);
@@ -1873,18 +1851,15 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
struct btrfs_trans_handle *trans;
int ret;
- if (!is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) || !fs_info->quota_root)
+ if (!btrfs_is_fstree(subvolid) || !btrfs_qgroup_enabled(fs_info) ||
+ !fs_info->quota_root)
return 0;
/*
* Commit current transaction to make sure all the rfer/excl numbers
* get updated.
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
-
- ret = btrfs_commit_transaction(trans);
+ ret = btrfs_commit_current_transaction(fs_info->quota_root);
if (ret < 0)
return ret;
@@ -1897,8 +1872,11 @@ int btrfs_qgroup_cleanup_dropped_subvolume(struct btrfs_fs_info *fs_info, u64 su
/*
* It's squota and the subvolume still has numbers needed for future
* accounting, in this case we can not delete it. Just skip it.
+ *
+ * Or the qgroup is already removed by a qgroup rescan. For both cases we're
+ * safe to ignore them.
*/
- if (ret == -EBUSY)
+ if (ret == -EBUSY || ret == -ENOENT)
ret = 0;
return ret;
}
@@ -1969,11 +1947,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_limit_item(trans, qgroup);
- if (ret) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_info(fs_info, "unable to update quota limit for %llu",
- qgroupid);
- }
+ if (ret)
+ qgroup_mark_inconsistent(fs_info, "qgroup item update error %d", ret);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -2028,7 +2003,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);
if (xa_is_err(ret)) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "xarray insert error: %d", xa_err(ret));
return xa_err(ret);
}
@@ -2095,10 +2070,8 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(fs_info);
- btrfs_warn(fs_info,
-"error accounting new delayed refs extent (err code: %d), quota inconsistent",
- ret);
+ qgroup_mark_inconsistent(fs_info,
+ "error accounting new delayed refs extent: %d", ret);
return 0;
}
@@ -2342,7 +2315,7 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
/* For src_path */
- atomic_inc(&src_eb->refs);
+ refcount_inc(&src_eb->refs);
src_path->nodes[root_level] = src_eb;
src_path->slots[root_level] = dst_path->slots[root_level];
src_path->locks[root_level] = 0;
@@ -2575,7 +2548,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
goto out;
}
/* For dst_path */
- atomic_inc(&dst_eb->refs);
+ refcount_inc(&dst_eb->refs);
dst_path->nodes[level] = dst_eb;
dst_path->slots[level] = 0;
dst_path->locks[level] = 0;
@@ -2590,7 +2563,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(dst_path);
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -2634,7 +2607,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* mark qgroup inconsistent.
*/
if (root_level >= drop_subptree_thres) {
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "subtree level reached threshold");
return 0;
}
@@ -2667,7 +2640,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
* walk back up the tree (adjusting slot pointers as we go)
* and restart the search process.
*/
- atomic_inc(&root_eb->refs); /* For path */
+ refcount_inc(&root_eb->refs); /* For path */
path->nodes[root_level] = root_eb;
path->slots[root_level] = 0;
path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
@@ -2838,8 +2811,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
- trace_qgroup_update_counters(fs_info, qg, cur_old_count,
- cur_new_count);
+ trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count,
+ cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
@@ -2933,7 +2906,7 @@ static int maybe_fs_roots(struct ulist *roots)
* trees.
* If it contains a non-fs tree, it won't be shared with fs/subvol trees.
*/
- return is_fstree(unode->val);
+ return btrfs_is_fstree(unode->val);
}
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
@@ -3101,8 +3074,7 @@ cleanup:
kfree(record);
}
- trace_qgroup_num_dirty_extents(fs_info, trans->transid,
- num_dirty_extents);
+ trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents);
return ret;
}
@@ -3135,10 +3107,12 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->qgroup_lock);
ret = update_qgroup_info_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup info item update error %d", ret);
ret = update_qgroup_limit_item(trans, qgroup);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup limit item update error %d", ret);
spin_lock(&fs_info->qgroup_lock);
}
if (btrfs_qgroup_enabled(fs_info))
@@ -3149,7 +3123,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
ret = update_qgroup_status_item(trans);
if (ret)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "qgroup status item update error %d", ret);
return ret;
}
@@ -3331,6 +3306,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
u32 level_size = 0;
u64 nums;
+ if (!btrfs_qgroup_enabled(fs_info))
+ return 0;
+
prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS);
if (!prealloc)
return -ENOMEM;
@@ -3354,8 +3332,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
if (!committing)
mutex_lock(&fs_info->qgroup_ioctl_lock);
- if (!btrfs_qgroup_enabled(fs_info))
- goto out;
quota_root = fs_info->quota_root;
if (!quota_root) {
@@ -3556,7 +3532,7 @@ out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (need_rescan)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "qgroup inherit needs a rescan");
if (qlist_prealloc) {
for (int i = 0; i < inherit->num_qgroups; i++)
kfree(qlist_prealloc[i]);
@@ -3590,7 +3566,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
int ret = 0;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return 0;
if (num_bytes == 0)
@@ -3650,7 +3626,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
struct btrfs_qgroup *qgroup;
LIST_HEAD(qgroup_list);
- if (!is_fstree(ref_root))
+ if (!btrfs_is_fstree(ref_root))
return;
if (num_bytes == 0)
@@ -4038,12 +4014,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = true;
- btrfs_queue_work(fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ /*
+ * The rescan worker is only for full accounting qgroups, check if it's
+ * enabled as it is pointless to queue it otherwise. A concurrent quota
+ * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
+ */
+ if (btrfs_qgroup_full_accounting(fs_info)) {
+ fs_info->qgroup_rescan_running = true;
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
+ } else {
+ ret = -ENOTCONN;
+ }
mutex_unlock(&fs_info->qgroup_rescan_lock);
- return 0;
+ return ret;
}
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
@@ -4130,8 +4115,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode,
* Now the entry is in [start, start + len), revert the
* EXTENT_QGROUP_RESERVED bit.
*/
- clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
- entry_end, EXTENT_QGROUP_RESERVED);
+ clear_ret = btrfs_clear_extent_bit(&inode->io_tree, entry_start, entry_end,
+ EXTENT_QGROUP_RESERVED, NULL);
if (!ret && clear_ret < 0)
ret = clear_ret;
@@ -4218,7 +4203,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || len == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -4233,8 +4218,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&inode->io_tree, start,
- start + len -1, EXTENT_QGROUP_RESERVED, reserved);
+ ret = btrfs_set_record_extent_bits(&inode->io_tree, start,
+ start + len - 1, EXTENT_QGROUP_RESERVED,
+ reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
@@ -4327,9 +4313,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&inode->io_tree, free_start,
- free_start + free_len - 1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
+ EXTENT_QGROUP_RESERVED,
+ &changeset);
if (ret < 0)
goto out;
freed += changeset.bytes_changed;
@@ -4353,9 +4340,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
int ret;
if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) {
- return clear_record_extent_bits(&inode->io_tree, start,
- start + len - 1,
- EXTENT_QGROUP_RESERVED, NULL);
+ return btrfs_clear_record_extent_bits(&inode->io_tree, start,
+ start + len - 1,
+ EXTENT_QGROUP_RESERVED, NULL);
}
/* In release case, we shouldn't have @reserved */
@@ -4363,8 +4350,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len, released);
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1,
+ EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -4469,11 +4456,11 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)) || num_bytes == 0)
+ !btrfs_is_fstree(btrfs_root_id(root)) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
+ trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type);
ret = qgroup_reserve(root, num_bytes, enforce, type);
if (ret < 0)
return ret;
@@ -4514,11 +4501,11 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* TODO: Update trace point to handle such free */
- trace_qgroup_meta_free_all_pertrans(root);
+ trace_btrfs_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
@@ -4530,7 +4517,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/*
@@ -4540,7 +4527,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
*/
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
- trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
+ trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type);
btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type);
}
@@ -4589,12 +4576,12 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED ||
- !is_fstree(btrfs_root_id(root)))
+ !btrfs_is_fstree(btrfs_root_id(root)))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
- trace_qgroup_meta_convert(root, num_bytes);
+ trace_btrfs_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes);
if (!sb_rdonly(fs_info->sb))
add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
@@ -4612,8 +4599,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
- EXTENT_QGROUP_RESERVED, &changeset);
+ ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
@@ -4673,6 +4660,28 @@ out:
spin_unlock(&swapped_blocks->lock);
}
+static int qgroup_swapped_block_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct btrfs_qgroup_swapped_block *block = rb_entry(node,
+ struct btrfs_qgroup_swapped_block, node);
+
+ if (block->subvol_bytenr < *bytenr)
+ return -1;
+ else if (block->subvol_bytenr > *bytenr)
+ return 1;
+
+ return 0;
+}
+
+static int qgroup_swapped_block_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct btrfs_qgroup_swapped_block *new_block = rb_entry(new,
+ struct btrfs_qgroup_swapped_block, node);
+
+ return qgroup_swapped_block_bytenr_key_cmp(&new_block->subvol_bytenr, existing);
+}
+
/*
* Add subtree roots record into @subvol_root.
*
@@ -4692,8 +4701,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_fs_info *fs_info = subvol_root->fs_info;
struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
- struct rb_node **cur;
- struct rb_node *parent = NULL;
+ struct rb_node *node;
int level = btrfs_header_level(subvol_parent) - 1;
int ret = 0;
@@ -4742,46 +4750,32 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
/* Insert @block into @blocks */
spin_lock(&blocks->lock);
- cur = &blocks->blocks[level].rb_node;
- while (*cur) {
+ node = rb_find_add(&block->node, &blocks->blocks[level], qgroup_swapped_block_bytenr_cmp);
+ if (node) {
struct btrfs_qgroup_swapped_block *entry;
- parent = *cur;
- entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
- node);
+ entry = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (entry->subvol_bytenr < block->subvol_bytenr) {
- cur = &(*cur)->rb_left;
- } else if (entry->subvol_bytenr > block->subvol_bytenr) {
- cur = &(*cur)->rb_right;
- } else {
- if (entry->subvol_generation !=
- block->subvol_generation ||
- entry->reloc_bytenr != block->reloc_bytenr ||
- entry->reloc_generation !=
- block->reloc_generation) {
- /*
- * Duplicated but mismatch entry found.
- * Shouldn't happen.
- *
- * Marking qgroup inconsistent should be enough
- * for end users.
- */
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- ret = -EEXIST;
- }
- kfree(block);
- goto out_unlock;
+ if (entry->subvol_generation != block->subvol_generation ||
+ entry->reloc_bytenr != block->reloc_bytenr ||
+ entry->reloc_generation != block->reloc_generation) {
+ /*
+ * Duplicated but mismatch entry found. Shouldn't happen.
+ * Marking qgroup inconsistent should be enough for end
+ * users.
+ */
+ DEBUG_WARN("duplicated but mismatched entry found");
+ ret = -EEXIST;
}
+ kfree(block);
+ goto out_unlock;
}
- rb_link_node(&block->node, parent, cur);
- rb_insert_color(&block->node, &blocks->blocks[level]);
blocks->swapped = true;
out_unlock:
spin_unlock(&blocks->lock);
out:
if (ret < 0)
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info, "%s error: %d", __func__, ret);
return ret;
}
@@ -4801,7 +4795,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
struct rb_node *node;
- bool found = false;
bool swapped = false;
int level = btrfs_header_level(subvol_eb);
int ret = 0;
@@ -4809,7 +4802,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
- if (!is_fstree(btrfs_root_id(root)) || !root->reloc_root)
+ if (!btrfs_is_fstree(btrfs_root_id(root)) || !root->reloc_root)
return 0;
spin_lock(&blocks->lock);
@@ -4817,23 +4810,14 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
spin_unlock(&blocks->lock);
return 0;
}
- node = blocks->blocks[level].rb_node;
-
- while (node) {
- block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
- if (block->subvol_bytenr < subvol_eb->start) {
- node = node->rb_left;
- } else if (block->subvol_bytenr > subvol_eb->start) {
- node = node->rb_right;
- } else {
- found = true;
- break;
- }
- }
- if (!found) {
+ node = rb_find(&subvol_eb->start, &blocks->blocks[level],
+ qgroup_swapped_block_bytenr_key_cmp);
+ if (!node) {
spin_unlock(&blocks->lock);
goto out;
}
+ block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
+
/* Found one, remove it from @blocks first and update blocks->swapped */
rb_erase(&block->node, &blocks->blocks[level]);
for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
@@ -4869,10 +4853,9 @@ free_out:
free_extent_buffer(reloc_eb);
out:
if (ret < 0) {
- btrfs_err_rl(fs_info,
- "failed to account subtree at bytenr %llu: %d",
- subvol_eb->start, ret);
- qgroup_mark_inconsistent(fs_info);
+ qgroup_mark_inconsistent(fs_info,
+ "failed to account subtree at bytenr %llu: %d",
+ subvol_eb->start, ret);
}
return ret;
}
@@ -4903,7 +4886,7 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
return 0;
- if (!is_fstree(root))
+ if (!btrfs_is_fstree(root))
return 0;
/* If the extent predates enabling quotas, don't count it. */
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index e233cc79af18..a979fd59a4da 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args;
struct btrfs_trans_handle;
struct btrfs_delayed_ref_root;
struct btrfs_inode;
+struct btrfs_transaction;
+struct btrfs_block_group;
+struct btrfs_qgroup_swapped_blocks;
/*
* Btrfs qgroup overview
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 1834011ccc49..cab0b291088c 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -329,11 +329,14 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
item_size);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
item_size);
- if (ret)
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ } else if (ret) {
btrfs_abort_transaction(trans, ret);
+ }
kfree(stripe_extent);
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 541836421778..69942ad43140 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <uapi/linux/btrfs_tree.h>
#include "fs.h"
+#include "accessors.h"
#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID1_MASK | \
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index cdd373c27784..3ff2bedfb3a4 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -134,14 +134,17 @@ struct btrfs_stripe_hash_table {
};
/*
- * A bvec like structure to present a sector inside a page.
- *
- * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
+ * A structure to present a sector inside a page, the length is fixed to
+ * sectorsize;
*/
struct sector_ptr {
- struct page *page;
- unsigned int pgoff:24;
- unsigned int uptodate:8;
+ /*
+ * Blocks from the bio list can still be highmem.
+ * So here we use physical address to present a page and the offset inside it.
+ */
+ phys_addr_t paddr;
+ bool has_paddr;
+ bool uptodate;
};
static void rmw_rbio_work(struct work_struct *work);
@@ -200,8 +203,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
struct btrfs_stripe_hash_table *x;
struct btrfs_stripe_hash *cur;
struct btrfs_stripe_hash *h;
- int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
- int i;
+ unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS;
if (info->stripe_hash_table)
return 0;
@@ -222,7 +224,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
h = table->table;
- for (i = 0; i < num_entries; i++) {
+ for (unsigned int i = 0; i < num_entries; i++) {
cur = h + i;
INIT_LIST_HEAD(&cur->hash_list);
spin_lock_init(&cur->lock);
@@ -233,6 +235,14 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
return 0;
}
+static void memcpy_sectors(const struct sector_ptr *dst,
+ const struct sector_ptr *src, u32 blocksize)
+{
+ memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr),
+ phys_to_page(src->paddr), offset_in_page(src->paddr),
+ blocksize);
+}
+
/*
* caching an rbio means to copy anything from the
* bio_sectors array into the stripe_pages array. We
@@ -253,7 +263,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
for (i = 0; i < rbio->nr_sectors; i++) {
/* Some range not covered by bio (partial write), skip it */
- if (!rbio->bio_sectors[i].page) {
+ if (!rbio->bio_sectors[i].has_paddr) {
/*
* Even if the sector is not covered by bio, if it is
* a data sector it should still be uptodate as it is
@@ -264,12 +274,8 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
continue;
}
- ASSERT(rbio->stripe_sectors[i].page);
- memcpy_page(rbio->stripe_sectors[i].page,
- rbio->stripe_sectors[i].pgoff,
- rbio->bio_sectors[i].page,
- rbio->bio_sectors[i].pgoff,
- rbio->bioc->fs_info->sectorsize);
+ memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i],
+ rbio->bioc->fs_info->sectorsize);
rbio->stripe_sectors[i].uptodate = 1;
}
set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -326,8 +332,13 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
int page_index = offset >> PAGE_SHIFT;
ASSERT(page_index < rbio->nr_pages);
- rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
- rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
+ if (!rbio->stripe_pages[page_index])
+ continue;
+
+ rbio->stripe_sectors[i].has_paddr = true;
+ rbio->stripe_sectors[i].paddr =
+ page_to_phys(rbio->stripe_pages[page_index]) +
+ offset_in_page(offset);
}
}
@@ -507,9 +518,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
spin_lock(&table->cache_lock);
while (!list_empty(&table->stripe_cache)) {
- rbio = list_entry(table->stripe_cache.next,
- struct btrfs_raid_bio,
- stripe_cache);
+ rbio = list_first_entry(&table->stripe_cache,
+ struct btrfs_raid_bio, stripe_cache);
__remove_rbio_from_cache(rbio);
}
spin_unlock(&table->cache_lock);
@@ -567,9 +577,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
if (table->cache_size > RBIO_CACHE_SIZE) {
struct btrfs_raid_bio *found;
- found = list_entry(table->stripe_cache.prev,
- struct btrfs_raid_bio,
- stripe_cache);
+ found = list_last_entry(&table->stripe_cache,
+ struct btrfs_raid_bio,
+ stripe_cache);
if (found != rbio)
__remove_rbio_from_cache(found);
@@ -882,14 +892,14 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
+static void rbio_endio_bio_list(struct bio *cur, blk_status_t status)
{
struct bio *next;
while (cur) {
next = cur->bi_next;
cur->bi_next = NULL;
- cur->bi_status = err;
+ cur->bi_status = status;
bio_endio(cur);
cur = next;
}
@@ -899,7 +909,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
* this frees the rbio and runs through all the bios in the
* bio_list and calls end_io on them
*/
-static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
+static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status)
{
struct bio *cur = bio_list_get(&rbio->bio_list);
struct bio *extra;
@@ -928,9 +938,9 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
extra = bio_list_get(&rbio->bio_list);
free_raid_bio(rbio);
- rbio_endio_bio_list(cur, err);
+ rbio_endio_bio_list(cur, status);
if (extra)
- rbio_endio_bio_list(extra, err);
+ rbio_endio_bio_list(extra, status);
}
/*
@@ -962,9 +972,9 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
spin_lock(&rbio->bio_list_lock);
sector = &rbio->bio_sectors[index];
- if (sector->page || bio_list_only) {
+ if (sector->has_paddr || bio_list_only) {
/* Don't return sector without a valid page pointer */
- if (!sector->page)
+ if (!sector->has_paddr)
sector = NULL;
spin_unlock(&rbio->bio_list_lock);
return sector;
@@ -1142,7 +1152,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
rbio, stripe_nr);
ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors,
rbio, sector_nr);
- ASSERT(sector->page);
+ ASSERT(sector->has_paddr);
stripe = &rbio->bioc->stripes[stripe_nr];
disk_start = stripe->physical + sector_nr * sectorsize;
@@ -1173,8 +1183,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
*/
if (last_end == disk_start && !last->bi_status &&
last->bi_bdev == stripe->dev->bdev) {
- ret = bio_add_page(last, sector->page, sectorsize,
- sector->pgoff);
+ ret = bio_add_page(last, phys_to_page(sector->paddr),
+ sectorsize, offset_in_page(sector->paddr));
if (ret == sectorsize)
return 0;
}
@@ -1187,7 +1197,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
bio->bi_private = rbio;
- __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
+ __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize,
+ offset_in_page(sector->paddr));
bio_list_add(bio_list, bio);
return 0;
}
@@ -1195,23 +1206,20 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
- struct bio_vec bvec;
- struct bvec_iter iter;
+ const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits;
+ struct bvec_iter iter = bio->bi_iter;
u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
rbio->bioc->full_stripe_logical;
- bio_for_each_segment(bvec, bio, iter) {
- u32 bvec_offset;
-
- for (bvec_offset = 0; bvec_offset < bvec.bv_len;
- bvec_offset += sectorsize, offset += sectorsize) {
- int index = offset / sectorsize;
- struct sector_ptr *sector = &rbio->bio_sectors[index];
+ while (iter.bi_size) {
+ unsigned int index = (offset >> sectorsize_bits);
+ struct sector_ptr *sector = &rbio->bio_sectors[index];
+ struct bio_vec bv = bio_iter_iovec(bio, iter);
- sector->page = bvec.bv_page;
- sector->pgoff = bvec.bv_offset + bvec_offset;
- ASSERT(sector->pgoff < PAGE_SIZE);
- }
+ sector->has_paddr = true;
+ sector->paddr = bvec_phys(&bv);
+ bio_advance_iter_single(bio, &iter, sectorsize);
+ offset += sectorsize;
}
}
@@ -1289,6 +1297,15 @@ static void assert_rbio(struct btrfs_raid_bio *rbio)
ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio);
}
+static inline void *kmap_local_sector(const struct sector_ptr *sector)
+{
+ /* The sector pointer must have a page mapped to it. */
+ ASSERT(sector->has_paddr);
+
+ return kmap_local_page(phys_to_page(sector->paddr)) +
+ offset_in_page(sector->paddr);
+}
+
/* Generate PQ for one vertical stripe. */
static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
{
@@ -1301,14 +1318,13 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
/* First collect one sector from each data stripe */
for (stripe = 0; stripe < rbio->nr_data; stripe++) {
sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe] = kmap_local_sector(sector);
}
/* Then add the parity stripe */
sector = rbio_pstripe_sector(rbio, sectornr);
sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+ pointers[stripe++] = kmap_local_sector(sector);
if (has_qstripe) {
/*
@@ -1317,8 +1333,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
*/
sector = rbio_qstripe_sector(rbio, sectornr);
sector->uptodate = 1;
- pointers[stripe++] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe++] = kmap_local_sector(sector);
assert_rbio(rbio);
raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
@@ -1477,15 +1492,14 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
* stripe_pages[], thus we need to locate the sector.
*/
static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
- struct page *page,
- unsigned int pgoff)
+ phys_addr_t paddr)
{
int i;
for (i = 0; i < rbio->nr_sectors; i++) {
struct sector_ptr *sector = &rbio->stripe_sectors[i];
- if (sector->page == page && sector->pgoff == pgoff)
+ if (sector->has_paddr && sector->paddr == paddr)
return sector;
}
return NULL;
@@ -1505,11 +1519,10 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
bio_for_each_segment_all(bvec, bio, iter_all) {
struct sector_ptr *sector;
- int pgoff;
+ phys_addr_t paddr = bvec_phys(bvec);
- for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
- pgoff += sectorsize) {
- sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
+ for (u32 off = 0; off < bvec->bv_len; off += sectorsize) {
+ sector = find_stripe_sector(rbio, paddr + off);
ASSERT(sector);
if (sector)
sector->uptodate = 1;
@@ -1519,17 +1532,14 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
{
- struct bio_vec *bv = bio_first_bvec_all(bio);
+ phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio));
int i;
for (i = 0; i < rbio->nr_sectors; i++) {
- struct sector_ptr *sector;
-
- sector = &rbio->stripe_sectors[i];
- if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ if (rbio->stripe_sectors[i].paddr == bvec_paddr)
break;
- sector = &rbio->bio_sectors[i];
- if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
+ if (rbio->bio_sectors[i].has_paddr &&
+ rbio->bio_sectors[i].paddr == bvec_paddr)
break;
}
ASSERT(i < rbio->nr_sectors);
@@ -1575,11 +1585,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
return;
bio_for_each_segment_all(bvec, bio, iter_all) {
- int bv_offset;
+ void *kaddr;
- for (bv_offset = bvec->bv_offset;
- bv_offset < bvec->bv_offset + bvec->bv_len;
- bv_offset += fs_info->sectorsize, total_sector_nr++) {
+ kaddr = bvec_kmap_local(bvec);
+ for (u32 off = 0; off < bvec->bv_len;
+ off += fs_info->sectorsize, total_sector_nr++) {
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *expected_csum = rbio->csum_buf +
total_sector_nr * fs_info->csum_size;
@@ -1589,11 +1599,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
if (!test_bit(total_sector_nr, rbio->csum_bitmap))
continue;
- ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
- bv_offset, csum_buf, expected_csum);
+ ret = btrfs_check_sector_csum(fs_info, kaddr + off,
+ csum_buf, expected_csum);
if (ret < 0)
set_bit(total_sector_nr, rbio->error_bitmap);
}
+ kunmap_local(kaddr);
}
}
@@ -1689,8 +1700,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
list_sort(NULL, &plug->rbio_list, plug_cmp);
while (!list_empty(&plug->rbio_list)) {
- cur = list_entry(plug->rbio_list.next,
- struct btrfs_raid_bio, plug_list);
+ cur = list_first_entry(&plug->rbio_list,
+ struct btrfs_raid_bio, plug_list);
list_del_init(&cur->plug_list);
if (rbio_is_full(cur)) {
@@ -1791,6 +1802,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
struct sector_ptr *sector;
u8 csum_buf[BTRFS_CSUM_SIZE];
u8 *csum_expected;
+ void *kaddr;
int ret;
if (!rbio->csum_bitmap || !rbio->csum_buf)
@@ -1809,13 +1821,12 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio,
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
- ASSERT(sector->page);
-
csum_expected = rbio->csum_buf +
(stripe_nr * rbio->stripe_nsectors + sector_nr) *
fs_info->csum_size;
- ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
- csum_buf, csum_expected);
+ kaddr = kmap_local_sector(sector);
+ ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected);
+ kunmap_local(kaddr);
return ret;
}
@@ -1872,9 +1883,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
} else {
sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
}
- ASSERT(sector->page);
- pointers[stripe_nr] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe_nr] = kmap_local_sector(sector);
unmap_array[stripe_nr] = pointers[stripe_nr];
}
@@ -2282,9 +2291,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
static void raid_wait_write_end_io(struct bio *bio)
{
struct btrfs_raid_bio *rbio = bio->bi_private;
- blk_status_t err = bio->bi_status;
- if (err)
+ if (bio->bi_status)
rbio_update_error_bitmap(rbio, bio);
bio_put(bio);
if (atomic_dec_and_test(&rbio->stripes_pending))
@@ -2326,7 +2334,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
* thus this rbio can not be cached one, as cached one must
* have all its data sectors present and uptodate.
*/
- if (!sector->page || !sector->uptodate)
+ if (!sector->has_paddr || !sector->uptodate)
return true;
}
return false;
@@ -2516,6 +2524,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
int stripe;
int sectornr;
bool has_qstripe;
+ struct page *page;
struct sector_ptr p_sector = { 0 };
struct sector_ptr q_sector = { 0 };
struct bio_list bio_list;
@@ -2547,29 +2556,33 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
*/
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
- p_sector.page = alloc_page(GFP_NOFS);
- if (!p_sector.page)
+ page = alloc_page(GFP_NOFS);
+ if (!page)
return -ENOMEM;
- p_sector.pgoff = 0;
+ p_sector.has_paddr = true;
+ p_sector.paddr = page_to_phys(page);
p_sector.uptodate = 1;
+ page = NULL;
if (has_qstripe) {
/* RAID6, allocate and map temp space for the Q stripe */
- q_sector.page = alloc_page(GFP_NOFS);
- if (!q_sector.page) {
- __free_page(p_sector.page);
- p_sector.page = NULL;
+ page = alloc_page(GFP_NOFS);
+ if (!page) {
+ __free_page(phys_to_page(p_sector.paddr));
+ p_sector.has_paddr = false;
return -ENOMEM;
}
- q_sector.pgoff = 0;
+ q_sector.has_paddr = true;
+ q_sector.paddr = page_to_phys(page);
q_sector.uptodate = 1;
- pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
+ page = NULL;
+ pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector);
}
bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap_local_page(p_sector.page);
+ pointers[nr_data] = kmap_local_sector(&p_sector);
for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
struct sector_ptr *sector;
@@ -2578,8 +2591,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
sector = sector_in_rbio(rbio, stripe, sectornr, 0);
- pointers[stripe] = kmap_local_page(sector->page) +
- sector->pgoff;
+ pointers[stripe] = kmap_local_sector(sector);
}
if (has_qstripe) {
@@ -2595,7 +2607,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
/* Check scrubbing parity and repair it */
sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
- parity = kmap_local_page(sector->page) + sector->pgoff;
+ parity = kmap_local_sector(sector);
if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
memcpy(parity, pointers[rbio->scrubp], sectorsize);
else
@@ -2608,12 +2620,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
}
kunmap_local(pointers[nr_data]);
- __free_page(p_sector.page);
- p_sector.page = NULL;
- if (q_sector.page) {
- kunmap_local(pointers[rbio->real_stripes - 1]);
- __free_page(q_sector.page);
- q_sector.page = NULL;
+ __free_page(phys_to_page(p_sector.paddr));
+ p_sector.has_paddr = false;
+ if (q_sector.has_paddr) {
+ __free_page(phys_to_page(q_sector.paddr));
+ q_sector.has_paddr = false;
}
/*
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
deleted file mode 100644
index 1c2d7cb1fe6f..000000000000
--- a/fs/btrfs/rcu-string.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 Red Hat. All rights reserved.
- */
-
-#ifndef BTRFS_RCU_STRING_H
-#define BTRFS_RCU_STRING_H
-
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/rcupdate.h>
-#include <linux/printk.h>
-
-struct rcu_string {
- struct rcu_head rcu;
- char str[];
-};
-
-static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
-{
- size_t len = strlen(src) + 1;
- struct rcu_string *ret = kzalloc(sizeof(struct rcu_string) +
- (len * sizeof(char)), mask);
- if (!ret)
- return ret;
- /* Warn if the source got unexpectedly truncated. */
- if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
- kfree(ret);
- return NULL;
- }
- return ret;
-}
-
-static inline void rcu_string_free(struct rcu_string *str)
-{
- if (str)
- kfree_rcu(str, rcu);
-}
-
-#define printk_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define printk_ratelimited_in_rcu(fmt, ...) do { \
- rcu_read_lock(); \
- printk_ratelimited(fmt, __VA_ARGS__); \
- rcu_read_unlock(); \
-} while (0)
-
-#define rcu_str_deref(rcu_str) ({ \
- struct rcu_string *__str = rcu_dereference(rcu_str); \
- __str->str; \
-})
-
-#endif
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 2928abf7eb82..3871c3a6c743 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -75,69 +75,70 @@ struct block_entry {
struct list_head actions;
};
+static int block_entry_bytenr_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *bytenr = key;
+ const struct block_entry *entry = rb_entry(node, struct block_entry, node);
+
+ if (entry->bytenr < *bytenr)
+ return 1;
+ else if (entry->bytenr > *bytenr)
+ return -1;
+
+ return 0;
+}
+
+static int block_entry_bytenr_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct block_entry *new_entry = rb_entry(new, struct block_entry, node);
+
+ return block_entry_bytenr_key_cmp(&new_entry->bytenr, existing);
+}
+
static struct block_entry *insert_block_entry(struct rb_root *root,
struct block_entry *be)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct block_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct block_entry, node);
- if (entry->bytenr > be->bytenr)
- p = &(*p)->rb_left;
- else if (entry->bytenr < be->bytenr)
- p = &(*p)->rb_right;
- else
- return entry;
- }
+ struct rb_node *node;
- rb_link_node(&be->node, parent_node, p);
- rb_insert_color(&be->node, root);
- return NULL;
+ node = rb_find_add(&be->node, root, block_entry_bytenr_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
}
static struct block_entry *lookup_block_entry(struct rb_root *root, u64 bytenr)
{
- struct rb_node *n;
- struct block_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct block_entry, node);
- if (entry->bytenr < bytenr)
- n = n->rb_right;
- else if (entry->bytenr > bytenr)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&bytenr, root, block_entry_bytenr_key_cmp);
+ return rb_entry_safe(node, struct block_entry, node);
+}
+
+static int root_entry_root_objectid_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *objectid = key;
+ const struct root_entry *entry = rb_entry(node, struct root_entry, node);
+
+ if (entry->root_objectid < *objectid)
+ return 1;
+ else if (entry->root_objectid > *objectid)
+ return -1;
+
+ return 0;
+}
+
+static int root_entry_root_objectid_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct root_entry *new_entry = rb_entry(new, struct root_entry, node);
+
+ return root_entry_root_objectid_key_cmp(&new_entry->root_objectid, existing);
}
static struct root_entry *insert_root_entry(struct rb_root *root,
struct root_entry *re)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct root_entry *entry;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct root_entry, node);
- if (entry->root_objectid > re->root_objectid)
- p = &(*p)->rb_left;
- else if (entry->root_objectid < re->root_objectid)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&re->node, parent_node, p);
- rb_insert_color(&re->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&re->node, root, root_entry_root_objectid_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
@@ -161,48 +162,29 @@ static int comp_refs(struct ref_entry *ref1, struct ref_entry *ref2)
return 0;
}
+static int ref_entry_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ struct ref_entry *new_entry = rb_entry(new, struct ref_entry, node);
+ struct ref_entry *existing_entry = rb_entry(existing, struct ref_entry, node);
+
+ return comp_refs(new_entry, existing_entry);
+}
+
static struct ref_entry *insert_ref_entry(struct rb_root *root,
struct ref_entry *ref)
{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent_node = NULL;
- struct ref_entry *entry;
- int cmp;
-
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct ref_entry, node);
- cmp = comp_refs(entry, ref);
- if (cmp > 0)
- p = &(*p)->rb_left;
- else if (cmp < 0)
- p = &(*p)->rb_right;
- else
- return entry;
- }
-
- rb_link_node(&ref->node, parent_node, p);
- rb_insert_color(&ref->node, root);
- return NULL;
+ struct rb_node *node;
+ node = rb_find_add(&ref->node, root, ref_entry_cmp);
+ return rb_entry_safe(node, struct ref_entry, node);
}
static struct root_entry *lookup_root_entry(struct rb_root *root, u64 objectid)
{
- struct rb_node *n;
- struct root_entry *entry = NULL;
+ struct rb_node *node;
- n = root->rb_node;
- while (n) {
- entry = rb_entry(n, struct root_entry, node);
- if (entry->root_objectid < objectid)
- n = n->rb_right;
- else if (entry->root_objectid > objectid)
- n = n->rb_left;
- else
- return entry;
- }
- return NULL;
+ node = rb_find(&objectid, root, root_entry_root_objectid_key_cmp);
+ return rb_entry_safe(node, struct root_entry, node);
}
#ifdef CONFIG_STACKTRACE
@@ -668,7 +650,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info,
* our sanity checks pass as they are no longer needed.
*/
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
struct ref_entry *ref = NULL, *exist;
struct ref_action *ra = NULL;
diff --git a/fs/btrfs/ref-verify.h b/fs/btrfs/ref-verify.h
index 3511e1a5c96b..559bd25a2b7a 100644
--- a/fs/btrfs/ref-verify.h
+++ b/fs/btrfs/ref-verify.h
@@ -19,7 +19,7 @@ struct btrfs_ref;
int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info);
void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info);
int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref);
+ const struct btrfs_ref *generic_ref);
void btrfs_free_ref_tree_range(struct btrfs_fs_info *fs_info, u64 start,
u64 len);
@@ -39,7 +39,7 @@ static inline void btrfs_free_ref_cache(struct btrfs_fs_info *fs_info)
}
static inline int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
- struct btrfs_ref *generic_ref)
+ const struct btrfs_ref *generic_ref)
{
return 0;
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f0824c948cb7..ce25ab7f0e99 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -46,11 +46,9 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
- goto out;
+ return ret;
}
- ret = btrfs_end_transaction(trans);
-out:
- return ret;
+ return btrfs_end_transaction(trans);
}
static int copy_inline_to_page(struct btrfs_inode *inode,
@@ -87,7 +85,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
btrfs_alloc_write_mask(mapping));
if (IS_ERR(folio)) {
- ret = -ENOMEM;
+ ret = PTR_ERR(folio);
goto out_unlock;
}
@@ -95,9 +93,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
- clear_extent_bit(&inode->io_tree, file_offset, range_end,
- EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- NULL);
+ btrfs_clear_extent_bit(&inode->io_tree, file_offset, range_end,
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, NULL);
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
if (ret)
goto out_unlock;
@@ -165,7 +162,7 @@ out:
* the source inode to destination inode when possible. When not possible we
* copy the inline extent's data into the respective page of the inode.
*/
-static int clone_copy_inline_extent(struct inode *dst,
+static int clone_copy_inline_extent(struct btrfs_inode *inode,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
@@ -175,8 +172,8 @@ static int clone_copy_inline_extent(struct inode *dst,
char *inline_data,
struct btrfs_trans_handle **trans_out)
{
- struct btrfs_fs_info *fs_info = inode_to_fs_info(dst);
- struct btrfs_root *root = BTRFS_I(dst)->root;
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
@@ -185,12 +182,12 @@ static int clone_copy_inline_extent(struct inode *dst,
struct btrfs_key key;
if (new_key->offset > 0) {
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
- key.objectid = btrfs_ino(BTRFS_I(dst));
+ key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -205,7 +202,7 @@ static int clone_copy_inline_extent(struct inode *dst,
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
- if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
+ if (key.objectid == btrfs_ino(inode) &&
key.type == BTRFS_EXTENT_DATA_KEY) {
/*
* There's an implicit hole at file offset 0, copy the
@@ -214,7 +211,7 @@ static int clone_copy_inline_extent(struct inode *dst,
ASSERT(key.offset > 0);
goto copy_to_page;
}
- } else if (i_size_read(dst) <= datal) {
+ } else if (i_size_read(&inode->vfs_inode) <= datal) {
struct btrfs_file_extent_item *ei;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -236,7 +233,7 @@ copy_inline_extent:
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
*/
- if (i_size_read(dst) > datal) {
+ if (i_size_read(&inode->vfs_inode) > datal) {
/*
* At the destination offset 0 we have either a hole, a regular
* extent or an inline extent larger then the one we want to
@@ -270,20 +267,26 @@ copy_inline_extent:
drop_args.start = drop_start;
drop_args.end = aligned_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
- if (ret)
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
goto out;
+ }
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
- btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
- btrfs_set_inode_full_sync(BTRFS_I(dst));
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
+ btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found);
+ btrfs_set_inode_full_sync(inode);
+ ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
out:
if (!ret && !trans) {
/*
@@ -298,10 +301,8 @@ out:
trans = NULL;
}
}
- if (ret && trans) {
- btrfs_abort_transaction(trans, ret);
+ if (ret && trans)
btrfs_end_transaction(trans);
- }
if (!ret)
*trans_out = trans;
@@ -318,7 +319,7 @@ copy_to_page:
*/
btrfs_release_path(path);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ ret = copy_inline_to_page(inode, new_key->offset,
inline_data, size, datal, comp_type);
goto out;
}
@@ -526,7 +527,7 @@ process_slot:
goto out;
}
- ret = clone_copy_inline_extent(inode, path, &new_key,
+ ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key,
drop_start, datal, size,
comp, buf, &trans);
if (ret)
@@ -617,26 +618,26 @@ out:
return ret;
}
-static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
if (inode1 < inode2)
swap(inode1, inode2);
- down_write(&BTRFS_I(inode1)->i_mmap_lock);
- down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+ down_write(&inode1->i_mmap_lock);
+ down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING);
}
-static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2)
{
- up_write(&BTRFS_I(inode1)->i_mmap_lock);
- up_write(&BTRFS_I(inode2)->i_mmap_lock);
+ up_write(&inode1->i_mmap_lock);
+ up_write(&inode2->i_mmap_lock);
}
-static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
- struct inode *dst, u64 dst_loff)
+static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len,
+ struct btrfs_inode *dst, u64 dst_loff)
{
const u64 end = dst_loff + len - 1;
struct extent_state *cached_state = NULL;
- struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info;
+ struct btrfs_fs_info *fs_info = src->root->fs_info;
const u64 bs = fs_info->sectorsize;
int ret;
@@ -646,9 +647,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
* because we have already locked the inode's i_mmap_lock in exclusive
* mode.
*/
- lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
- ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
- unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state);
+ btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state);
+ ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len,
+ ALIGN(len, bs), dst_loff, 1);
+ btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state);
btrfs_btree_balance_dirty(fs_info);
@@ -678,8 +680,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
- ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
- dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN,
+ BTRFS_I(dst), dst_loff);
if (ret)
goto out;
@@ -688,7 +690,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
}
if (tail_len > 0)
- ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
+ ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len,
+ BTRFS_I(dst), dst_loff);
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
@@ -747,9 +750,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
* mode.
*/
end = destoff + len - 1;
- lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
- unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state);
/*
* We may have copied an inline extent into a page of the destination
@@ -775,24 +778,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
- struct inode *inode_in = file_inode(file_in);
- struct inode *inode_out = file_inode(file_out);
- u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize;
+ struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in));
+ struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out));
+ u64 bs = inode_out->root->fs_info->sectorsize;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
- struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
+ struct btrfs_root *root_out = inode_out->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
- ASSERT(inode_in->i_sb == inode_out->i_sb);
+ ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb);
}
/* Don't make the dst file partly checksummed */
- if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
- (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
+ if ((inode_in->flags & BTRFS_INODE_NODATASUM) !=
+ (inode_out->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
@@ -811,7 +814,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
- wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
+ wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
@@ -832,16 +835,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
* Also we don't need to check ASYNC_EXTENT, as async extent will be
* CoWed anyway, not affecting nocow part.
*/
- ret = filemap_flush(inode_in->i_mapping);
+ ret = filemap_flush(inode_in->vfs_inode.i_mapping);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len);
if (ret < 0)
return ret;
- ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs),
- wb_len);
+ ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len);
if (ret < 0)
return ret;
@@ -863,8 +864,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
- struct inode *src_inode = file_inode(src_file);
- struct inode *dst_inode = file_inode(dst_file);
+ struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file));
+ struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file));
bool same_inode = dst_inode == src_inode;
int ret;
@@ -872,9 +873,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
return -EINVAL;
if (same_inode) {
- btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
} else {
- lock_two_nondirectories(src_inode, dst_inode);
+ lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
}
@@ -884,16 +885,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
goto out_unlock;
if (remap_flags & REMAP_FILE_DEDUP)
- ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
+ ret = btrfs_extent_same(&src_inode->vfs_inode, off, len,
+ &dst_inode->vfs_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
if (same_inode) {
- btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
- unlock_two_nondirectories(src_inode, dst_inode);
+ unlock_two_nondirectories(&src_inode->vfs_inode,
+ &dst_inode->vfs_inode);
}
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index af0969b70b53..7256f6748c8f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -90,10 +90,15 @@
* map address of tree root to tree
*/
struct mapping_node {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simle_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
void *data;
};
@@ -106,10 +111,15 @@ struct mapping_tree {
* present a tree block to process
*/
struct tree_block {
- struct {
- struct rb_node rb_node;
- u64 bytenr;
- }; /* Use rb_simple_node for search/insert */
+ union {
+ /* Use rb_simple_node for search/insert */
+ struct {
+ struct rb_node rb_node;
+ u64 bytenr;
+ };
+
+ struct rb_simple_node simple_node;
+ };
u64 owner;
struct btrfs_key key;
u8 level;
@@ -178,8 +188,9 @@ static void mark_block_processed(struct reloc_control *rc,
in_range(node->bytenr, rc->block_group->start,
rc->block_group->length)) {
blocksize = rc->extent_root->fs_info->nodesize;
- set_extent_bit(&rc->processed_blocks, node->bytenr,
- node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
+ btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr,
+ node->bytenr + blocksize - 1, EXTENT_DIRTY,
+ NULL);
}
node->processed = 1;
}
@@ -195,8 +206,8 @@ static struct btrfs_backref_node *walk_up_backref(
int idx = *index;
while (!list_empty(&node->upper)) {
- edge = list_entry(node->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&node->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[idx++] = edge;
node = edge->node[UPPER];
}
@@ -222,8 +233,8 @@ static struct btrfs_backref_node *walk_down_backref(
idx--;
continue;
}
- edge = list_entry(edge->list[LOWER].next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge,
+ list[LOWER]);
edges[idx - 1] = edge;
*index = idx;
return edge->node[UPPER];
@@ -347,8 +358,8 @@ static bool handle_useless_nodes(struct reloc_control *rc,
struct btrfs_backref_edge *edge;
struct btrfs_backref_node *lower;
- edge = list_entry(cur->lower.next,
- struct btrfs_backref_edge, list[UPPER]);
+ edge = list_first_entry(&cur->lower, struct btrfs_backref_edge,
+ list[UPPER]);
list_del(&edge->list[UPPER]);
list_del(&edge->list[LOWER]);
lower = edge->node[LOWER];
@@ -479,8 +490,7 @@ static int __add_reloc_root(struct btrfs_root *root)
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
btrfs_err(fs_info,
@@ -563,8 +573,7 @@ static int __update_reloc_root(struct btrfs_root *root)
spin_lock(&rc->reloc_root_tree.lock);
node->bytenr = root->node->start;
- rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
+ rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root, &node->simple_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node)
btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
@@ -593,6 +602,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
if (btrfs_root_id(root) == objectid) {
u64 commit_root_gen;
+ /*
+ * Relocation will wait for cleaner thread, and any half-dropped
+ * subvolume will be fully cleaned up at mount time.
+ * So here we shouldn't hit a subvolume with non-zero drop_progress.
+ *
+ * If this isn't the case, error out since it can make us attempt to
+ * drop references for extents that were already dropped before.
+ */
+ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
+ struct btrfs_key cpu_key;
+
+ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
+ btrfs_err(fs_info,
+ "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
+ objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
+ ret = -EUCLEAN;
+ goto fail;
+ }
+
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
@@ -910,16 +938,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
/* Take mmap lock to serialize with reflinks. */
if (!down_read_trylock(&inode->i_mmap_lock))
continue;
- ret = try_lock_extent(&inode->io_tree, key.offset,
- end, &cached_state);
+ ret = btrfs_try_lock_extent(&inode->io_tree, key.offset,
+ end, &cached_state);
if (!ret) {
up_read(&inode->i_mmap_lock);
continue;
}
btrfs_drop_extent_map_range(inode, key.offset, end, true);
- unlock_extent(&inode->io_tree, key.offset, end,
- &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, key.offset, end,
+ &cached_state);
up_read(&inode->i_mmap_lock);
}
}
@@ -1378,9 +1406,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
btrfs_drop_extent_map_range(inode, start, end, true);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
}
return 0;
}
@@ -1515,7 +1543,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
level = btrfs_root_level(root_item);
- atomic_inc(&reloc_root->node->refs);
+ refcount_inc(&reloc_root->node->refs);
path->nodes[level] = reloc_root->node;
path->slots[level] = 0;
} else {
@@ -1697,8 +1725,8 @@ again:
rc->merge_reloc_tree = true;
while (!list_empty(&rc->reloc_roots)) {
- reloc_root = list_entry(rc->reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&rc->reloc_roots,
+ struct btrfs_root, root_list);
list_del_init(&reloc_root->root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
@@ -1813,8 +1841,7 @@ again:
while (!list_empty(&reloc_roots)) {
found = 1;
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
@@ -1930,11 +1957,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
* reloc root without a corresponding root this could return ENOENT.
*/
if (IS_ERR(root)) {
- ASSERT(0);
+ DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root));
return PTR_ERR(root);
}
if (root->reloc_root != reloc_root) {
- ASSERT(0);
+ DEBUG_WARN("unexpected reloc root found");
btrfs_err(fs_info,
"root %llu has two reloc roots associated with it",
reloc_root->root_key.offset);
@@ -2109,8 +2136,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc,
if (list_empty(&next->upper))
break;
- edge = list_entry(next->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2356,8 +2383,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans,
for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
while (!list_empty(&cache->pending[level])) {
- node = list_entry(cache->pending[level].next,
- struct btrfs_backref_node, list);
+ node = list_first_entry(&cache->pending[level],
+ struct btrfs_backref_node, list);
list_move_tail(&node->list, &list);
BUG_ON(!node->pending);
@@ -2395,8 +2422,8 @@ static void update_processed_blocks(struct reloc_control *rc,
if (list_empty(&next->upper))
break;
- edge = list_entry(next->upper.next,
- struct btrfs_backref_edge, list[LOWER]);
+ edge = list_first_entry(&next->upper, struct btrfs_backref_edge,
+ list[LOWER]);
edges[index++] = edge;
next = edge->node[UPPER];
}
@@ -2408,8 +2435,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
u32 blocksize = rc->extent_root->fs_info->nodesize;
- if (test_range_bit(&rc->processed_blocks, bytenr,
- bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
+ if (btrfs_test_range_bit(&rc->processed_blocks, bytenr,
+ bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
return 1;
return 0;
}
@@ -2617,7 +2644,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
* tree.
*/
if (block->owner &&
- (!is_fstree(block->owner) ||
+ (!btrfs_is_fstree(block->owner) ||
block->owner == BTRFS_DATA_RELOC_TREE_OBJECTID)) {
ret = relocate_cowonly_block(trans, rc, block, path);
if (ret)
@@ -2658,69 +2685,24 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
u64 num_bytes;
int nr;
int ret = 0;
- u64 i_size = i_size_read(&inode->vfs_inode);
u64 prealloc_start = cluster->start - offset;
u64 prealloc_end = cluster->end - offset;
u64 cur_offset = prealloc_start;
/*
- * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
- * This means the range [i_size, PAGE_END + 1) is filled with zeros by
- * btrfs_do_readpage() call of previously relocated file cluster.
+ * For blocksize < folio size case (either bs < page size or large folios),
+ * beyond i_size, all blocks are filled with zero.
*
- * If the current cluster starts in the above range, btrfs_do_readpage()
+ * If the current cluster covers the above range, btrfs_do_readpage()
* will skip the read, and relocate_one_folio() will later writeback
* the padding zeros as new data, causing data corruption.
*
- * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
+ * Here we have to invalidate the cache covering our cluster.
*/
- if (!PAGE_ALIGNED(i_size)) {
- struct address_space *mapping = inode->vfs_inode.i_mapping;
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- const u32 sectorsize = fs_info->sectorsize;
- struct folio *folio;
-
- ASSERT(sectorsize < PAGE_SIZE);
- ASSERT(IS_ALIGNED(i_size, sectorsize));
-
- /*
- * Subpage can't handle page with DIRTY but without UPTODATE
- * bit as it can lead to the following deadlock:
- *
- * btrfs_read_folio()
- * | Page already *locked*
- * |- btrfs_lock_and_flush_ordered_range()
- * |- btrfs_start_ordered_extent()
- * |- extent_write_cache_pages()
- * |- lock_page()
- * We try to lock the page we already hold.
- *
- * Here we just writeback the whole data reloc inode, so that
- * we will be ensured to have no dirty range in the page, and
- * are safe to clear the uptodate bits.
- *
- * This shouldn't cause too much overhead, as we need to write
- * the data back anyway.
- */
- ret = filemap_write_and_wait(mapping);
- if (ret < 0)
- return ret;
-
- clear_extent_bits(&inode->io_tree, i_size,
- round_up(i_size, PAGE_SIZE) - 1,
- EXTENT_UPTODATE);
- folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT);
- /*
- * If page is freed we don't need to do anything then, as we
- * will re-read the whole page anyway.
- */
- if (!IS_ERR(folio)) {
- btrfs_subpage_clear_uptodate(fs_info, folio, i_size,
- round_up(i_size, PAGE_SIZE) - i_size);
- folio_unlock(folio);
- folio_put(folio);
- }
- }
+ ret = filemap_invalidate_inode(&inode->vfs_inode, true, prealloc_start,
+ prealloc_end);
+ if (ret < 0)
+ return ret;
BUG_ON(cluster->start != cluster->boundary[0]);
ret = btrfs_alloc_data_chunk_ondemand(inode,
@@ -2738,21 +2720,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control
else
end = cluster->end - offset;
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
- unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
if (ret)
break;
}
btrfs_inode_unlock(inode, 0);
if (cur_offset < prealloc_end)
- btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
- prealloc_end + 1 - cur_offset);
+ btrfs_free_reserved_data_space_noquota(inode,
+ prealloc_end + 1 - cur_offset);
return ret;
}
@@ -2766,7 +2748,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr
u64 end = rc->cluster.end - offset;
int ret = 0;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em)
return -ENOMEM;
@@ -2777,10 +2759,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr
em->ram_bytes = em->len;
em->flags |= EXTENT_FLAG_PINNED;
- lock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(inode, em, false);
- unlock_extent(&inode->io_tree, start, end, &cached_state);
- free_extent_map(em);
+ btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state);
+ btrfs_free_extent_map(em);
return ret;
}
@@ -2809,13 +2791,15 @@ static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
static int relocate_one_folio(struct reloc_control *rc,
struct file_ra_state *ra,
- int *cluster_nr, unsigned long index)
+ int *cluster_nr, u64 *file_offset_ret)
{
const struct file_extent_cluster *cluster = &rc->cluster;
struct inode *inode = rc->data_inode;
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
+ const u64 orig_file_offset = *file_offset_ret;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t last_index = (cluster->end - offset) >> PAGE_SHIFT;
+ const pgoff_t index = orig_file_offset >> PAGE_SHIFT;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
struct folio *folio;
u64 folio_start;
@@ -2848,8 +2832,6 @@ again:
return PTR_ERR(folio);
}
- WARN_ON(folio_order(folio));
-
if (folio_test_readahead(folio) && !use_rst)
page_cache_async_readahead(inode->i_mapping, ra, NULL,
folio, last_index + 1 - index);
@@ -2878,7 +2860,7 @@ again:
goto release_folio;
folio_start = folio_pos(folio);
- folio_end = folio_start + PAGE_SIZE - 1;
+ folio_end = folio_start + folio_size(folio) - 1;
/*
* Start from the cluster, as for subpage case, the cluster can start
@@ -2902,15 +2884,15 @@ again:
goto release_folio;
/* Mark the range delalloc and dirty for later writeback */
- lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
- &cached_state);
+ btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start,
+ clamped_end, &cached_state);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, &cached_state);
if (ret) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree,
- clamped_start, clamped_end,
- EXTENT_LOCKED | EXTENT_BOUNDARY,
- &cached_state);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ clamped_start, clamped_end,
+ EXTENT_LOCKED | EXTENT_BOUNDARY,
+ &cached_state);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
@@ -2926,18 +2908,19 @@ again:
* EXTENT_BOUNDARY bit prevents current extent from being merged
* with previous extent.
*/
- if (in_range(cluster->boundary[*cluster_nr] - offset, folio_start, PAGE_SIZE)) {
+ if (in_range(cluster->boundary[*cluster_nr] - offset,
+ folio_start, folio_size(folio))) {
u64 boundary_start = cluster->boundary[*cluster_nr] -
offset;
u64 boundary_end = boundary_start +
fs_info->sectorsize - 1;
- set_extent_bit(&BTRFS_I(inode)->io_tree,
- boundary_start, boundary_end,
- EXTENT_BOUNDARY, NULL);
+ btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree,
+ boundary_start, boundary_end,
+ EXTENT_BOUNDARY, NULL);
}
- unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
- &cached_state);
+ btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
+ &cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@@ -2956,6 +2939,7 @@ again:
btrfs_throttle(fs_info);
if (btrfs_should_cancel_balance(fs_info))
ret = -ECANCELED;
+ *file_offset_ret = folio_end + 1;
return ret;
release_folio:
@@ -2969,8 +2953,7 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
struct inode *inode = rc->data_inode;
const struct file_extent_cluster *cluster = &rc->cluster;
u64 offset = BTRFS_I(inode)->reloc_block_group_start;
- unsigned long index;
- unsigned long last_index;
+ u64 cur_file_offset = cluster->start - offset;
struct file_ra_state *ra;
int cluster_nr = 0;
int ret = 0;
@@ -2992,10 +2975,11 @@ static int relocate_file_extent_cluster(struct reloc_control *rc)
if (ret)
goto out;
- last_index = (cluster->end - offset) >> PAGE_SHIFT;
- for (index = (cluster->start - offset) >> PAGE_SHIFT;
- index <= last_index && !ret; index++)
- ret = relocate_one_folio(rc, ra, &cluster_nr, index);
+ while (cur_file_offset < cluster->end - offset) {
+ ret = relocate_one_folio(rc, ra, &cluster_nr, &cur_file_offset);
+ if (ret)
+ break;
+ }
if (ret == 0)
WARN_ON(cluster_nr != cluster->nr);
out:
@@ -3158,7 +3142,7 @@ static int add_tree_block(struct reloc_control *rc,
block->key_ready = false;
block->owner = owner;
- rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
+ rb_node = rb_simple_insert(blocks, &block->simple_node);
if (rb_node)
btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
-EEXIST);
@@ -3239,21 +3223,23 @@ out:
return ret;
}
-static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
- struct btrfs_block_group *block_group,
+static int delete_block_group_cache(struct btrfs_block_group *block_group,
struct inode *inode,
u64 ino)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
+ struct btrfs_inode *btrfs_inode;
int ret = 0;
if (inode)
goto truncate;
- inode = btrfs_iget(ino, root);
- if (IS_ERR(inode))
+ btrfs_inode = btrfs_iget(ino, root);
+ if (IS_ERR(btrfs_inode))
return -ENOENT;
+ inode = &btrfs_inode->vfs_inode;
truncate:
ret = btrfs_check_trunc_cache_free_space(fs_info,
@@ -3313,8 +3299,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf,
}
if (!found)
return -ENOENT;
- ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
- space_cache_ino);
+ ret = delete_block_group_cache(block_group, NULL, space_cache_ino);
return ret;
}
@@ -3434,9 +3419,9 @@ next:
goto next;
}
- block_found = find_first_extent_bit(&rc->processed_blocks,
- key.objectid, &start, &end,
- EXTENT_DIRTY, NULL);
+ block_found = btrfs_find_first_extent_bit(&rc->processed_blocks,
+ key.objectid, &start, &end,
+ EXTENT_DIRTY, NULL);
if (block_found && start <= key.objectid) {
btrfs_release_path(path);
@@ -3645,7 +3630,7 @@ restart:
}
btrfs_release_path(path);
- clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
+ btrfs_clear_extent_bit(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, NULL);
if (trans) {
btrfs_end_transaction_throttle(trans);
@@ -3761,10 +3746,10 @@ out:
* the inode is in data relocation tree and its link count is 0
*/
static noinline_for_stack struct inode *create_reloc_inode(
- struct btrfs_fs_info *fs_info,
const struct btrfs_block_group *group)
{
- struct inode *inode = NULL;
+ struct btrfs_fs_info *fs_info = group->fs_info;
+ struct btrfs_inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
u64 objectid;
@@ -3792,18 +3777,19 @@ static noinline_for_stack struct inode *create_reloc_inode(
inode = NULL;
goto out;
}
- BTRFS_I(inode)->reloc_block_group_start = group->start;
+ inode->reloc_block_group_start = group->start;
- ret = btrfs_orphan_add(trans, BTRFS_I(inode));
+ ret = btrfs_orphan_add(trans, inode);
out:
btrfs_put_root(root);
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (ret) {
- iput(inode);
- inode = ERR_PTR(ret);
+ if (inode)
+ iput(&inode->vfs_inode);
+ return ERR_PTR(ret);
}
- return inode;
+ return &inode->vfs_inode;
}
/*
@@ -3860,7 +3846,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
rc->reloc_root_tree.rb_root = RB_ROOT;
spin_lock_init(&rc->reloc_root_tree.lock);
- extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
+ btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@@ -3881,7 +3867,7 @@ static void free_reloc_control(struct reloc_control *rc)
*/
static void describe_relocation(struct btrfs_block_group *block_group)
{
- char buf[128] = {'\0'};
+ char buf[128] = "NONE";
btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
@@ -3901,7 +3887,8 @@ static const char *stage_to_string(enum reloc_stage stage)
/*
* function to relocate all extents in a block group.
*/
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose)
{
struct btrfs_block_group *bg;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
@@ -3977,7 +3964,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
btrfs_free_path(path);
if (!IS_ERR(inode))
- ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
+ ret = delete_block_group_cache(rc->block_group, inode, 0);
else
ret = PTR_ERR(inode);
@@ -3986,14 +3973,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
goto out;
}
- rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+ rc->data_inode = create_reloc_inode(rc->block_group);
if (IS_ERR(rc->data_inode)) {
err = PTR_ERR(rc->data_inode);
rc->data_inode = NULL;
goto out;
}
- describe_relocation(rc->block_group);
+ if (verbose)
+ describe_relocation(rc->block_group);
btrfs_wait_block_group_reservations(rc->block_group);
btrfs_wait_nocow_writers(rc->block_group);
@@ -4037,8 +4025,10 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (rc->extents_found == 0)
break;
- btrfs_info(fs_info, "found %llu extents, stage: %s",
- rc->extents_found, stage_to_string(finishes_stage));
+ if (verbose)
+ btrfs_info(fs_info, "found %llu extents, stage: %s",
+ rc->extents_found,
+ stage_to_string(finishes_stage));
}
WARN_ON(rc->block_group->pinned > 0);
@@ -4183,8 +4173,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
rc->merge_reloc_tree = true;
while (!list_empty(&reloc_roots)) {
- reloc_root = list_entry(reloc_roots.next,
- struct btrfs_root, root_list);
+ reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list);
list_del(&reloc_root->root_list);
if (btrfs_root_refs(&reloc_root->root_item) == 0) {
@@ -4277,7 +4266,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
while (!list_empty(&list)) {
struct btrfs_ordered_sum *sums =
- list_entry(list.next, struct btrfs_ordered_sum, list);
+ list_first_entry(&list, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
@@ -4341,7 +4330,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
}
btrfs_backref_drop_node_buffer(node);
- atomic_inc(&cow->refs);
+ refcount_inc(&cow->refs);
node->eb = cow;
node->new_bytenr = cow->start;
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 788c86d8633a..5c36b3f84b57 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -12,7 +12,8 @@ struct btrfs_trans_handle;
struct btrfs_ordered_extent;
struct btrfs_pending_snapshot;
-int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
+int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
+ bool verbose);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 531312efee8d..6776e6ab8d10 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,8 +66,6 @@ struct scrub_ctx;
/* Represent one sector and its needed info to verify the content. */
struct scrub_sector_verification {
- bool is_metadata;
-
union {
/*
* Csum pointer for data csum verification. Should point to a
@@ -100,6 +98,38 @@ enum scrub_stripe_flags {
SCRUB_STRIPE_FLAG_NO_REPORT,
};
+/*
+ * We have multiple bitmaps for one scrub_stripe.
+ * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits,
+ * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64).
+ *
+ * So to reduce memory usage for each scrub_stripe, we pack those bitmaps
+ * into a larger one.
+ *
+ * These enum records where the sub-bitmap are inside the larger one.
+ * Each subbitmap starts at scrub_bitmap_nr_##name * nr_sectors bit.
+ */
+enum {
+ /* Which blocks are covered by extent items. */
+ scrub_bitmap_nr_has_extent = 0,
+
+ /* Which blocks are meteadata. */
+ scrub_bitmap_nr_is_metadata,
+
+ /*
+ * Which blocks have errors, including IO, csum, and metadata
+ * errors.
+ * This sub-bitmap is the OR results of the next few error related
+ * sub-bitmaps.
+ */
+ scrub_bitmap_nr_error,
+ scrub_bitmap_nr_io_error,
+ scrub_bitmap_nr_csum_error,
+ scrub_bitmap_nr_meta_error,
+ scrub_bitmap_nr_meta_gen_error,
+ scrub_bitmap_nr_last,
+};
+
#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE)
/*
@@ -138,36 +168,15 @@ struct scrub_stripe {
*/
unsigned long state;
- /* Indicate which sectors are covered by extent items. */
- unsigned long extent_sector_bitmap;
-
- /*
- * The errors hit during the initial read of the stripe.
- *
- * Would be utilized for error reporting and repair.
- *
- * The remaining init_nr_* records the number of errors hit, only used
- * by error reporting.
- */
- unsigned long init_error_bitmap;
- unsigned int init_nr_io_errors;
- unsigned int init_nr_csum_errors;
- unsigned int init_nr_meta_errors;
+ /* The large bitmap contains all the sub-bitmaps. */
+ unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last *
+ (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))];
/*
- * The following error bitmaps are all for the current status.
- * Every time we submit a new read, these bitmaps may be updated.
- *
- * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap;
- *
- * IO and csum errors can happen for both metadata and data.
+ * For writeback (repair or replace) error reporting.
+ * This one is protected by a spinlock, thus can not be packed into
+ * the larger bitmap.
*/
- unsigned long error_bitmap;
- unsigned long io_error_bitmap;
- unsigned long csum_error_bitmap;
- unsigned long meta_error_bitmap;
-
- /* For writeback (repair or replace) error reporting. */
unsigned long write_error_bitmap;
/* Writeback can be concurrent, thus we need to protect the bitmap. */
@@ -219,6 +228,90 @@ struct scrub_ctx {
refcount_t refs;
};
+#define scrub_calc_start_bit(stripe, name, block_nr) \
+({ \
+ unsigned int __start_bit; \
+ \
+ ASSERT(block_nr < stripe->nr_sectors, \
+ "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \
+ __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \
+ __start_bit; \
+})
+
+#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \
+static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr, \
+ unsigned int nr_blocks) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, \
+ name, block_nr); \
+ \
+ bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \
+} \
+static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr, \
+ unsigned int nr_blocks) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \
+} \
+static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ return test_bit(start_bit, stripe->bitmaps); \
+} \
+static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ set_bit(start_bit, stripe->bitmaps); \
+} \
+static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \
+ unsigned int block_nr) \
+{ \
+ const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \
+ block_nr); \
+ \
+ clear_bit(start_bit, stripe->bitmaps); \
+} \
+static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \
+{ \
+ const unsigned int nr_blocks = stripe->nr_sectors; \
+ \
+ ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \
+ "nr_blocks=%u BITS_PER_LONG=%u", \
+ nr_blocks, BITS_PER_LONG); \
+ \
+ return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \
+ stripe->nr_sectors); \
+} \
+static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \
+{ \
+ unsigned long bitmap = scrub_bitmap_read_##name(stripe); \
+ \
+ return bitmap_empty(&bitmap, stripe->nr_sectors); \
+} \
+static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \
+{ \
+ unsigned long bitmap = scrub_bitmap_read_##name(stripe); \
+ \
+ return bitmap_weight(&bitmap, stripe->nr_sectors); \
+}
+IMPLEMENT_SCRUB_BITMAP_OPS(has_extent);
+IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata);
+IMPLEMENT_SCRUB_BITMAP_OPS(error);
+IMPLEMENT_SCRUB_BITMAP_OPS(io_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(csum_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(meta_error);
+IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error);
+
struct scrub_warning {
struct btrfs_path *path;
u64 extent_item_size;
@@ -228,6 +321,19 @@ struct scrub_warning {
struct btrfs_device *dev;
};
+struct scrub_error_records {
+ /*
+ * Bitmap recording which blocks hit errors (IO/csum/...) during the
+ * initial read.
+ */
+ unsigned long init_error_bitmap;
+
+ unsigned int nr_io_errors;
+ unsigned int nr_csum_errors;
+ unsigned int nr_meta_errors;
+ unsigned int nr_meta_gen_errors;
+};
+
static void release_scrub_stripe(struct scrub_stripe *stripe)
{
if (!stripe)
@@ -450,8 +556,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu length %u links %u (path: %s)",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -464,8 +570,8 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
return 0;
err:
- btrfs_warn_in_rcu(fs_info,
- "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
+ btrfs_warn(fs_info,
+ "scrub: %s at logical %llu on dev %s, physical %llu root %llu inode %llu offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
btrfs_dev_name(swarn->dev),
swarn->physical,
@@ -490,7 +596,7 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
/* Super block error, no need to search extent tree. */
if (is_super) {
- btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
+ btrfs_warn(fs_info, "scrub: %s on device %s, physical %llu",
errstr, btrfs_dev_name(dev), physical);
return;
}
@@ -525,14 +631,14 @@ static void scrub_print_common_warning(const char *errstr, struct btrfs_device *
&ref_level);
if (ret < 0) {
btrfs_warn(fs_info,
- "failed to resolve tree backref for logical %llu: %d",
- swarn.logical, ret);
+ "scrub: failed to resolve tree backref for logical %llu: %d",
+ swarn.logical, ret);
break;
}
if (ret > 0)
break;
- btrfs_warn_in_rcu(fs_info,
-"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
+ btrfs_warn(fs_info,
+"scrub: %s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical, btrfs_dev_name(dev),
swarn.physical, (ref_level ? "node" : "leaf"),
ref_level, ref_root);
@@ -579,20 +685,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
return ret;
}
-static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr)
-{
- struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
- int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT;
-
- return stripe->pages[page_index];
-}
-
-static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe,
- int sector_nr)
+static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr)
{
- struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
+ u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits);
+ const struct page *page = stripe->pages[offset >> PAGE_SHIFT];
- return offset_in_page(sector_nr << fs_info->sectorsize_bits);
+ /* stripe->pages[] is allocated by us and no highmem is allowed. */
+ ASSERT(page);
+ ASSERT(!PageHighMem(page));
+ return page_address(page) + offset_in_page(offset);
}
static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr)
@@ -600,46 +701,44 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits);
- const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr);
- const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr);
+ void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ struct btrfs_header *header = first_kaddr;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
u8 on_disk_csum[BTRFS_CSUM_SIZE];
u8 calculated_csum[BTRFS_CSUM_SIZE];
- struct btrfs_header *header;
/*
* Here we don't have a good way to attach the pages (and subpages)
* to a dummy extent buffer, thus we have to directly grab the members
* from pages.
*/
- header = (struct btrfs_header *)(page_address(first_page) + first_off);
memcpy(on_disk_csum, header->csum, fs_info->csum_size);
if (logical != btrfs_stack_header_bytenr(header)) {
- bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad bytenr, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad bytenr, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_bytenr(header), logical);
return;
}
if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid,
BTRFS_FSID_SIZE) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad fsid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad fsid, has %pU want %pU",
logical, stripe->mirror_num,
header->fsid, fs_info->fs_devices->fsid);
return;
}
if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
+ "scrub: tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU",
logical, stripe->mirror_num,
header->chunk_tree_uuid, fs_info->chunk_tree_uuid);
return;
@@ -648,23 +747,20 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
/* Now check tree block csum. */
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- crypto_shash_update(shash, page_address(first_page) + first_off +
- BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE);
+ crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE,
+ fs_info->sectorsize - BTRFS_CSUM_SIZE);
for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) {
- struct page *page = scrub_stripe_get_page(stripe, i);
- unsigned int page_off = scrub_stripe_get_page_offset(stripe, i);
-
- crypto_shash_update(shash, page_address(page) + page_off,
+ crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i),
fs_info->sectorsize);
}
crypto_shash_final(shash, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
+"scrub: tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT,
logical, stripe->mirror_num,
CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum),
CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum));
@@ -672,18 +768,19 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr
}
if (stripe->sectors[sector_nr].generation !=
btrfs_stack_header_generation(header)) {
- bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree);
btrfs_warn_rl(fs_info,
- "tree block %llu mirror %u has bad generation, has %llu want %llu",
+ "scrub: tree block %llu mirror %u has bad generation, has %llu want %llu",
logical, stripe->mirror_num,
btrfs_stack_header_generation(header),
stripe->sectors[sector_nr].generation);
return;
}
- bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree);
- bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree);
- bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree);
+ scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree);
}
static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
@@ -691,23 +788,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct scrub_sector_verification *sector = &stripe->sectors[sector_nr];
const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits;
- struct page *page = scrub_stripe_get_page(stripe, sector_nr);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
+ void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
u8 csum_buf[BTRFS_CSUM_SIZE];
int ret;
ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors);
/* Sector not utilized, skip it. */
- if (!test_bit(sector_nr, &stripe->extent_sector_bitmap))
+ if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr))
return;
/* IO error, no need to check. */
- if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
return;
/* Metadata, verify the full tree block. */
- if (sector->is_metadata) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
/*
* Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
@@ -718,7 +814,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
*/
if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) {
btrfs_warn_rl(fs_info,
- "tree block at %llu crosses stripe boundary %llu",
+ "scrub: tree block at %llu crosses stripe boundary %llu",
stripe->logical +
(sector_nr << fs_info->sectorsize_bits),
stripe->logical);
@@ -733,17 +829,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
* cases without csum, we have no other choice but to trust it.
*/
if (!sector->csum) {
- clear_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_clear_bit_error(stripe, sector_nr);
return;
}
- ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum);
+ ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum);
if (ret < 0) {
- set_bit(sector_nr, &stripe->csum_error_bitmap);
- set_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_set_bit_csum_error(stripe, sector_nr);
+ scrub_bitmap_set_bit_error(stripe, sector_nr);
} else {
- clear_bit(sector_nr, &stripe->csum_error_bitmap);
- clear_bit(sector_nr, &stripe->error_bitmap);
+ scrub_bitmap_clear_bit_csum_error(stripe, sector_nr);
+ scrub_bitmap_clear_bit_error(stripe, sector_nr);
}
}
@@ -756,7 +852,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b
for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) {
scrub_verify_one_sector(stripe, sector_nr);
- if (stripe->sectors[sector_nr].is_metadata)
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr))
sector_nr += sectors_per_tree - 1;
}
}
@@ -766,8 +862,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first
int i;
for (i = 0; i < stripe->nr_sectors; i++) {
- if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page &&
- scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset)
+ if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec))
break;
}
ASSERT(i < stripe->nr_sectors);
@@ -795,13 +890,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio)
bio_size += bvec->bv_len;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
- bitmap_set(&stripe->error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_set_io_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_set_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
} else {
- bitmap_clear(&stripe->io_error_bitmap, sector_nr,
- bio_size >> fs_info->sectorsize_bits);
+ scrub_bitmap_clear_io_error(stripe, sector_nr,
+ bio_size >> fs_info->sectorsize_bits);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io))
@@ -814,27 +909,39 @@ static int calc_next_mirror(int mirror, int num_copies)
return (mirror + 1 > num_copies) ? 1 : mirror + 1;
}
+static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe,
+ int sector_nr)
+{
+ void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr);
+ int ret;
+
+ ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize,
+ offset_in_page(kaddr));
+ /*
+ * Caller should ensure the bbio has enough size.
+ * And we cannot use __bio_add_page(), which doesn't do any merge.
+ *
+ * Meanwhile for scrub_submit_initial_read() we fully rely on the merge
+ * to create the minimal amount of bio vectors, for fs block size < page
+ * size cases.
+ */
+ ASSERT(ret == bbio->fs_info->sectorsize);
+}
+
static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
int mirror, int blocksize, bool wait)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
int i;
ASSERT(stripe->mirror_num >= 1);
ASSERT(atomic_read(&stripe->pending_io) == 0);
for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) {
- struct page *page;
- int pgoff;
- int ret;
-
- page = scrub_stripe_get_page(stripe, i);
- pgoff = scrub_stripe_get_page_offset(stripe, i);
-
/* The current sector cannot be merged, submit the bio. */
- if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) ||
+ if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) ||
bbio->bio.bi_iter.bi_size >= blocksize)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
@@ -851,8 +958,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
(i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT;
}
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- ASSERT(ret == fs_info->sectorsize);
+ scrub_bio_add_sector(bbio, stripe, i);
}
if (bbio) {
ASSERT(bbio->bio.bi_iter.bi_size);
@@ -864,12 +970,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe,
}
static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
- struct scrub_stripe *stripe)
+ struct scrub_stripe *stripe,
+ const struct scrub_error_records *errors)
{
static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_device *dev = NULL;
+ const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe);
+ const unsigned long error_bitmap = scrub_bitmap_read_error(stripe);
u64 physical = 0;
int nr_data_sectors = 0;
int nr_meta_sectors = 0;
@@ -886,7 +995,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
* Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
- if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
+ if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) {
u64 mapped_len = fs_info->sectorsize;
struct btrfs_io_context *bioc = NULL;
int stripe_index = stripe->mirror_num - 1;
@@ -909,10 +1018,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
}
skip:
- for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
+ for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) {
bool repaired = false;
- if (stripe->sectors[sector_nr].is_metadata) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) {
nr_meta_sectors++;
} else {
nr_data_sectors++;
@@ -920,14 +1029,14 @@ skip:
nr_nodatacsum_sectors++;
}
- if (test_bit(sector_nr, &stripe->init_error_bitmap) &&
- !test_bit(sector_nr, &stripe->error_bitmap)) {
+ if (test_bit(sector_nr, &errors->init_error_bitmap) &&
+ !test_bit(sector_nr, &error_bitmap)) {
nr_repaired_sectors++;
repaired = true;
}
/* Good sector from the beginning, nothing need to be done. */
- if (!test_bit(sector_nr, &stripe->init_error_bitmap))
+ if (!test_bit(sector_nr, &errors->init_error_bitmap))
continue;
/*
@@ -936,13 +1045,13 @@ skip:
*/
if (repaired) {
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "fixed up error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: fixed up error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
continue;
@@ -950,41 +1059,56 @@ skip:
/* The remaining are all for unrepaired. */
if (dev) {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on dev %s physical %llu",
+ btrfs_err_rl(fs_info,
+"scrub: unable to fixup (regular) error at logical %llu on dev %s physical %llu",
stripe->logical, btrfs_dev_name(dev),
physical);
} else {
- btrfs_err_rl_in_rcu(fs_info,
- "unable to fixup (regular) error at logical %llu on mirror %u",
+ btrfs_err_rl(fs_info,
+ "scrub: unable to fixup (regular) error at logical %llu on mirror %u",
stripe->logical, stripe->mirror_num);
}
- if (test_bit(sector_nr, &stripe->io_error_bitmap))
+ if (scrub_bitmap_test_bit_io_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("i/o error", dev, false,
stripe->logical, physical);
- if (test_bit(sector_nr, &stripe->csum_error_bitmap))
+ if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("checksum error", dev, false,
stripe->logical, physical);
- if (test_bit(sector_nr, &stripe->meta_error_bitmap))
+ if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr))
if (__ratelimit(&rs) && dev)
scrub_print_common_warning("header error", dev, false,
stripe->logical, physical);
+ if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr))
+ if (__ratelimit(&rs) && dev)
+ scrub_print_common_warning("generation error", dev, false,
+ stripe->logical, physical);
}
+ /* Update the device stats. */
+ for (int i = 0; i < errors->nr_io_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS);
+ for (int i = 0; i < errors->nr_csum_errors; i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
+ /* Generation mismatch error is based on each metadata, not each block. */
+ for (int i = 0; i < errors->nr_meta_gen_errors;
+ i += (fs_info->nodesize >> fs_info->sectorsize_bits))
+ btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS);
+
spin_lock(&sctx->stat_lock);
sctx->stat.data_extents_scrubbed += stripe->nr_data_extents;
sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents;
sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits;
sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits;
sctx->stat.no_csum += nr_nodatacsum_sectors;
- sctx->stat.read_errors += stripe->init_nr_io_errors;
- sctx->stat.csum_errors += stripe->init_nr_csum_errors;
- sctx->stat.verify_errors += stripe->init_nr_meta_errors;
+ sctx->stat.read_errors += errors->nr_io_errors;
+ sctx->stat.csum_errors += errors->nr_csum_errors;
+ sctx->stat.verify_errors += errors->nr_meta_errors +
+ errors->nr_meta_gen_errors;
sctx->stat.uncorrectable_errors +=
- bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors);
+ bitmap_weight(&error_bitmap, stripe->nr_sectors);
sctx->stat.corrected_errors += nr_repaired_sectors;
spin_unlock(&sctx->stat_lock);
}
@@ -1010,26 +1134,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work);
struct scrub_ctx *sctx = stripe->sctx;
struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct scrub_error_records errors = { 0 };
int num_copies = btrfs_num_copies(fs_info, stripe->bg->start,
stripe->bg->length);
unsigned long repaired;
+ unsigned long error;
int mirror;
int i;
ASSERT(stripe->mirror_num > 0);
wait_scrub_stripe_io(stripe);
- scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap);
+ scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe));
/* Save the initial failed bitmap for later repair and report usage. */
- stripe->init_error_bitmap = stripe->error_bitmap;
- stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap,
- stripe->nr_sectors);
- stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap,
- stripe->nr_sectors);
- stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap,
- stripe->nr_sectors);
-
- if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors))
+ errors.init_error_bitmap = scrub_bitmap_read_error(stripe);
+ errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe);
+ errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe);
+ errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe);
+ errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe);
+
+ if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors))
goto out;
/*
@@ -1041,13 +1165,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
for (mirror = calc_next_mirror(stripe->mirror_num, num_copies);
mirror != stripe->mirror_num;
mirror = calc_next_mirror(mirror, num_copies)) {
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
scrub_stripe_submit_repair_read(stripe, mirror,
BTRFS_STRIPE_LEN, false);
wait_scrub_stripe_io(stripe);
scrub_verify_one_stripe(stripe, old_error_bitmap);
- if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ if (scrub_bitmap_empty_error(stripe))
goto out;
}
@@ -1065,21 +1189,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
for (i = 0, mirror = stripe->mirror_num;
i < num_copies;
i++, mirror = calc_next_mirror(mirror, num_copies)) {
- const unsigned long old_error_bitmap = stripe->error_bitmap;
+ const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe);
scrub_stripe_submit_repair_read(stripe, mirror,
fs_info->sectorsize, true);
wait_scrub_stripe_io(stripe);
scrub_verify_one_stripe(stripe, old_error_bitmap);
- if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors))
+ if (scrub_bitmap_empty_error(stripe))
goto out;
}
out:
+ error = scrub_bitmap_read_error(stripe);
/*
* Submit the repaired sectors. For zoned case, we cannot do repair
* in-place, but queue the bg to be relocated.
*/
- bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap,
+ bitmap_andnot(&repaired, &errors.init_error_bitmap, &error,
stripe->nr_sectors);
if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) {
if (btrfs_is_zoned(fs_info)) {
@@ -1090,7 +1215,7 @@ out:
}
}
- scrub_stripe_report_errors(sctx, stripe);
+ scrub_stripe_report_errors(sctx, stripe, &errors);
set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state);
wake_up(&stripe->repair_wait);
}
@@ -1110,10 +1235,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio)
num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
- bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
+ scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors);
+ scrub_bitmap_set_error(stripe, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1142,6 +1267,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio)
bitmap_set(&stripe->write_error_bitmap, sector_nr,
bio_size >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&stripe->write_error_lock, flags);
+ for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++)
+ btrfs_dev_stat_inc_and_print(stripe->dev,
+ BTRFS_DEV_STAT_WRITE_ERRS);
}
bio_put(&bbio->bio);
@@ -1199,12 +1327,8 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
int sector_nr;
for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) {
- struct page *page = scrub_stripe_get_page(stripe, sector_nr);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr);
- int ret;
-
/* We should only writeback sectors covered by an extent. */
- ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap));
+ ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr));
/* Cannot merge with previous sector, submit the current one. */
if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) {
@@ -1218,8 +1342,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str
(sector_nr << fs_info->sectorsize_bits)) >>
SECTOR_SHIFT;
}
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- ASSERT(ret == fs_info->sectorsize);
+ scrub_bio_add_sector(bbio, stripe, sector_nr);
}
if (bbio)
scrub_submit_write_bio(sctx, stripe, bbio, dev_replace);
@@ -1380,11 +1503,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root,
if (path->nodes[0])
goto search_forward;
+ key.objectid = search_start;
if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
key.type = BTRFS_METADATA_ITEM_KEY;
else
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.objectid = search_start;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -1470,8 +1593,7 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
physical,
sctx->write_pointer);
if (ret)
- btrfs_err(fs_info,
- "zoned: failed to recover write pointer");
+ btrfs_err(fs_info, "scrub: zoned: failed to recover write pointer");
}
mutex_unlock(&sctx->wr_lock);
btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
@@ -1493,9 +1615,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
struct scrub_sector_verification *sector =
&stripe->sectors[nr_sector];
- set_bit(nr_sector, &stripe->extent_sector_bitmap);
+ scrub_bitmap_set_bit_has_extent(stripe, nr_sector);
if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- sector->is_metadata = true;
+ scrub_bitmap_set_bit_is_metadata(stripe, nr_sector);
sector->generation = extent_gen;
}
}
@@ -1503,15 +1625,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info,
static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe)
{
- stripe->extent_sector_bitmap = 0;
- stripe->init_error_bitmap = 0;
- stripe->init_nr_io_errors = 0;
- stripe->init_nr_csum_errors = 0;
- stripe->init_nr_meta_errors = 0;
- stripe->error_bitmap = 0;
- stripe->io_error_bitmap = 0;
- stripe->csum_error_bitmap = 0;
- stripe->meta_error_bitmap = 0;
+ ASSERT(stripe->nr_sectors);
+ bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors);
}
/*
@@ -1541,8 +1656,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg,
u64 extent_gen;
int ret;
- if (unlikely(!extent_root)) {
- btrfs_err(fs_info, "no valid extent root for scrub");
+ if (unlikely(!extent_root || !csum_root)) {
+ btrfs_err(fs_info, "scrub: no valid extent or csum root found");
return -EUCLEAN;
}
memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) *
@@ -1646,7 +1761,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe)
stripe->state = 0;
for (int i = 0; i < stripe->nr_sectors; i++) {
- stripe->sectors[i].is_metadata = false;
stripe->sectors[i].csum = NULL;
stripe->sectors[i].generation = 0;
}
@@ -1665,24 +1779,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits;
+ const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe);
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
atomic_inc(&stripe->pending_io);
- for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) {
- struct page *page = scrub_stripe_get_page(stripe, i);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
-
+ for_each_set_bit(i, &has_extent, stripe->nr_sectors) {
/* We're beyond the chunk boundary, no need to read anymore. */
if (i >= nr_sectors)
break;
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
- ((i > 0 &&
- !test_bit(i - 1, &stripe->extent_sector_bitmap)) ||
+ ((i > 0 && !test_bit(i - 1, &has_extent)) ||
bbio->bio.bi_iter.bi_size >= stripe_len)) {
ASSERT(bbio->bio.bi_iter.bi_size);
atomic_inc(&stripe->pending_io);
@@ -1695,7 +1806,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
struct btrfs_io_context *bioc = NULL;
const u64 logical = stripe->logical +
(i << fs_info->sectorsize_bits);
- int err;
+ int ret;
io_stripe.rst_search_commit_root = true;
stripe_len = (nr_sectors - i) << fs_info->sectorsize_bits;
@@ -1703,11 +1814,11 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* For RST cases, we need to manually split the bbio to
* follow the RST boundary.
*/
- err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
- if (err < 0) {
- if (err != -ENODATA) {
+ if (ret < 0) {
+ if (ret != -ENODATA) {
/*
* Earlier btrfs_get_raid_extent_offset()
* returned -ENODATA, which means there's
@@ -1716,8 +1827,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
* the extent tree, then it's a preallocated
* extent and not an error.
*/
- set_bit(i, &stripe->io_error_bitmap);
- set_bit(i, &stripe->error_bitmap);
+ scrub_bitmap_set_bit_io_error(stripe, i);
+ scrub_bitmap_set_bit_error(stripe, i);
}
continue;
}
@@ -1727,7 +1838,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
}
- __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ scrub_bio_add_sector(bbio, stripe, i);
}
if (bbio) {
@@ -1765,15 +1876,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
/* Read the whole range inside the chunk boundary. */
- for (unsigned int cur = 0; cur < nr_sectors; cur++) {
- struct page *page = scrub_stripe_get_page(stripe, cur);
- unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
- int ret;
-
- ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
- /* We should have allocated enough bio vectors. */
- ASSERT(ret == fs_info->sectorsize);
- }
+ for (unsigned int cur = 0; cur < nr_sectors; cur++)
+ scrub_bio_add_sector(bbio, stripe, cur);
atomic_inc(&stripe->pending_io);
/*
@@ -1794,14 +1898,15 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
static bool stripe_has_metadata_error(struct scrub_stripe *stripe)
{
+ const unsigned long error = scrub_bitmap_read_error(stripe);
int i;
- for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) {
- if (stripe->sectors[i].is_metadata) {
+ for_each_set_bit(i, &error, stripe->nr_sectors) {
+ if (scrub_bitmap_test_bit_is_metadata(stripe, i)) {
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
btrfs_err(fs_info,
- "stripe %llu has unrepaired metadata sector at %llu",
+ "scrub: stripe %llu has unrepaired metadata sector at logical %llu",
stripe->logical,
stripe->logical + (i << fs_info->sectorsize_bits));
return true;
@@ -1872,13 +1977,16 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
}
for (int i = 0; i < nr_stripes; i++) {
unsigned long good;
+ unsigned long has_extent;
+ unsigned long error;
stripe = &sctx->stripes[i];
ASSERT(stripe->dev == fs_info->dev_replace.srcdev);
- bitmap_andnot(&good, &stripe->extent_sector_bitmap,
- &stripe->error_bitmap, stripe->nr_sectors);
+ has_extent = scrub_bitmap_read_has_extent(stripe);
+ error = scrub_bitmap_read_error(stripe);
+ bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors);
scrub_write_sectors(sctx, stripe, good, true);
}
}
@@ -2012,7 +2120,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
/* Check if all data stripes are empty. */
for (int i = 0; i < data_stripes; i++) {
stripe = &sctx->raid56_data_stripes[i];
- if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) {
+ if (!scrub_bitmap_empty_has_extent(stripe)) {
all_empty = false;
break;
}
@@ -2044,25 +2152,28 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
*/
for (int i = 0; i < data_stripes; i++) {
unsigned long error;
+ unsigned long has_extent;
stripe = &sctx->raid56_data_stripes[i];
+ error = scrub_bitmap_read_error(stripe);
+ has_extent = scrub_bitmap_read_has_extent(stripe);
+
/*
* We should only check the errors where there is an extent.
* As we may hit an empty data stripe while it's missing.
*/
- bitmap_and(&error, &stripe->error_bitmap,
- &stripe->extent_sector_bitmap, stripe->nr_sectors);
+ bitmap_and(&error, &error, &has_extent, stripe->nr_sectors);
if (!bitmap_empty(&error, stripe->nr_sectors)) {
btrfs_err(fs_info,
-"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
+"scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl",
full_stripe_start, i, stripe->nr_sectors,
&error);
ret = -EIO;
goto out;
}
- bitmap_or(&extent_bitmap, &extent_bitmap,
- &stripe->extent_sector_bitmap, stripe->nr_sectors);
+ bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent,
+ stripe->nr_sectors);
}
/* Now we can check and regenerate the P/Q stripe. */
@@ -2497,8 +2608,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
path->skip_locking = 1;
key.objectid = scrub_dev->devid;
- key.offset = 0ull;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = 0ull;
while (1) {
u64 dev_extent_len;
@@ -2677,14 +2788,14 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
ro_set = 0;
} else if (ret == -ETXTBSY) {
btrfs_warn(fs_info,
- "skipping scrub of block group %llu due to active swapfile",
+ "scrub: skipping scrub of block group %llu due to active swapfile",
cache->start);
scrub_pause_off(fs_info);
ret = 0;
goto skip_unfreeze;
} else {
- btrfs_warn(fs_info,
- "failed setting block group ro: %d", ret);
+ btrfs_warn(fs_info, "scrub: failed setting block group ro: %d",
+ ret);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
scrub_pause_off(fs_info);
@@ -2770,29 +2881,23 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev,
struct page *page, u64 physical, u64 generation)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct bio_vec bvec;
- struct bio bio;
struct btrfs_super_block *sb = page_address(page);
int ret;
- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0);
- ret = submit_bio_wait(&bio);
- bio_uninit(&bio);
-
+ ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb,
+ BTRFS_SUPER_INFO_SIZE, REQ_OP_READ);
if (ret < 0)
return ret;
ret = btrfs_check_super_csum(fs_info, sb);
if (ret != 0) {
btrfs_err_rl(fs_info,
- "super block at physical %llu devid %llu has bad csum",
+ "scrub: super block at physical %llu devid %llu has bad csum",
physical, dev->devid);
return -EIO;
}
if (btrfs_super_generation(sb) != generation) {
btrfs_err_rl(fs_info,
-"super block at physical %llu devid %llu has bad generation %llu expect %llu",
+"scrub: super block at physical %llu devid %llu has bad generation %llu expect %llu",
physical, dev->devid,
btrfs_super_generation(sb), generation);
return -EUCLEAN;
@@ -2952,8 +3057,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (!is_dev_replace && !readonly &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- btrfs_err_in_rcu(fs_info,
- "scrub on devid %llu: filesystem on %s is not writable",
+ btrfs_err(fs_info,
+ "scrub: devid %llu: filesystem on %s is not writable",
devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index f437138fefbc..7664025a5af4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4,6 +4,7 @@
*/
#include <linux/bsearch.h>
+#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/sort.h>
@@ -16,7 +17,6 @@
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>
-
#include "send.h"
#include "ctree.h"
#include "backref.h"
@@ -178,6 +178,7 @@ struct send_ctx {
u64 cur_inode_rdev;
u64 cur_inode_last_extent;
u64 cur_inode_next_write_offset;
+ struct fs_path cur_inode_path;
bool cur_inode_new;
bool cur_inode_new_gen;
bool cur_inode_deleted;
@@ -383,11 +384,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx,
result_string = "updated";
break;
case BTRFS_COMPARE_TREE_SAME:
- ASSERT(0);
+ DEBUG_WARN("no change between trees");
result_string = "unchanged";
break;
default:
- ASSERT(0);
+ DEBUG_WARN("unexpected comparison result %d", result);
result_string = "unexpected";
}
@@ -425,15 +426,21 @@ static int need_send_hole(struct send_ctx *sctx)
static void fs_path_reset(struct fs_path *p)
{
- if (p->reversed) {
+ if (p->reversed)
p->start = p->buf + p->buf_len - 1;
- p->end = p->start;
- *p->start = 0;
- } else {
+ else
p->start = p->buf;
- p->end = p->start;
- *p->start = 0;
- }
+
+ p->end = p->start;
+ *p->start = 0;
+}
+
+static void init_path(struct fs_path *p)
+{
+ p->reversed = 0;
+ p->buf = p->inline_buf;
+ p->buf_len = FS_PATH_INLINE_SIZE;
+ fs_path_reset(p);
}
static struct fs_path *fs_path_alloc(void)
@@ -443,10 +450,7 @@ static struct fs_path *fs_path_alloc(void)
p = kmalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return NULL;
- p->reversed = 0;
- p->buf = p->inline_buf;
- p->buf_len = FS_PATH_INLINE_SIZE;
- fs_path_reset(p);
+ init_path(p);
return p;
}
@@ -471,7 +475,7 @@ static void fs_path_free(struct fs_path *p)
kfree(p);
}
-static int fs_path_len(struct fs_path *p)
+static inline int fs_path_len(const struct fs_path *p)
{
return p->end - p->start;
}
@@ -487,12 +491,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
- if (len > PATH_MAX) {
- WARN_ON(1);
- return -ENOMEM;
- }
+ if (WARN_ON(len > PATH_MAX))
+ return -ENAMETOOLONG;
- path_len = p->end - p->start;
+ path_len = fs_path_len(p);
old_buf_len = p->buf_len;
/*
@@ -533,12 +535,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
int ret;
int new_len;
- new_len = p->end - p->start + name_len;
+ new_len = fs_path_len(p) + name_len;
if (p->start != p->end)
new_len++;
ret = fs_path_ensure_buf(p, new_len);
if (ret < 0)
- goto out;
+ return ret;
if (p->reversed) {
if (p->start != p->end)
@@ -553,8 +555,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
*p->end = 0;
}
-out:
- return ret;
+ return 0;
}
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
@@ -564,25 +565,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len)
ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
memcpy(prepared, name, name_len);
-out:
- return ret;
+ return 0;
}
-static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
+static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2)
{
- int ret;
- char *prepared;
-
- ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
- if (ret < 0)
- goto out;
- memcpy(prepared, p2->start, p2->end - p2->start);
-
-out:
- return ret;
+ return fs_path_add(p, p2->start, fs_path_len(p2));
}
static int fs_path_add_from_extent_buffer(struct fs_path *p,
@@ -594,12 +585,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(eb, prepared, off, len);
-out:
- return ret;
+ return 0;
}
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
@@ -619,13 +609,21 @@ static void fs_path_unreverse(struct fs_path *p)
return;
tmp = p->start;
- len = p->end - p->start;
+ len = fs_path_len(p);
p->start = p->buf;
p->end = p->start + len;
memmove(p->start, tmp, len + 1);
p->reversed = 0;
}
+static inline bool is_current_inode_path(const struct send_ctx *sctx,
+ const struct fs_path *path)
+{
+ const struct fs_path *cur = &sctx->cur_inode_path;
+
+ return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0);
+}
+
static struct btrfs_path *alloc_path_for_send(void)
{
struct btrfs_path *path;
@@ -740,7 +738,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
#define TLV_PUT_PATH(sctx, attrtype, p) \
do { \
ret = tlv_put_string(sctx, attrtype, p->start, \
- p->end - p->start); \
+ fs_path_len((p))); \
if (ret < 0) \
goto tlv_put_failure; \
} while(0)
@@ -761,7 +759,7 @@ static int send_header(struct send_ctx *sctx)
{
struct btrfs_stream_header hdr;
- strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
+ strscpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
hdr.version = cpu_to_le32(sctx->proto);
return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
&sctx->send_off);
@@ -819,14 +817,11 @@ static int send_cmd(struct send_ctx *sctx)
static int send_rename(struct send_ctx *sctx,
struct fs_path *from, struct fs_path *to)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
@@ -834,7 +829,6 @@ static int send_rename(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -844,14 +838,11 @@ out:
static int send_link(struct send_ctx *sctx,
struct fs_path *path, struct fs_path *lnk)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
@@ -859,7 +850,6 @@ static int send_link(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -868,21 +858,17 @@ out:
*/
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_unlink %s", path->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -891,21 +877,17 @@ out:
*/
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
- btrfs_debug(fs_info, "send_rmdir %s", path->start);
-
ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -1580,7 +1562,6 @@ static int find_extent_clone(struct send_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret;
int extent_type;
- u64 logical;
u64 disk_byte;
u64 num_bytes;
struct btrfs_file_extent_item *fi;
@@ -1611,7 +1592,6 @@ static int find_extent_clone(struct send_ctx *sctx,
compressed = btrfs_file_extent_compression(eb, fi);
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
- logical = disk_byte + btrfs_file_extent_offset(eb, fi);
/*
* Setup the clone roots.
@@ -1693,14 +1673,8 @@ static int find_extent_clone(struct send_ctx *sctx,
}
up_read(&fs_info->commit_root_sem);
- btrfs_debug(fs_info,
- "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
- data_offset, ino, num_bytes, logical);
-
- if (!backref_ctx.found) {
- btrfs_debug(fs_info, "no clones found");
+ if (!backref_ctx.found)
return -ENOENT;
- }
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
@@ -1831,7 +1805,7 @@ static int gen_unique_name(struct send_ctx *sctx,
ino, gen, idx);
ASSERT(len < sizeof(tmp));
tmp_name.name = tmp;
- tmp_name.len = strlen(tmp);
+ tmp_name.len = len;
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1870,7 +1844,7 @@ static int gen_unique_name(struct send_ctx *sctx,
break;
}
- ret = fs_path_add(dest, tmp, strlen(tmp));
+ ret = fs_path_add(dest, tmp, len);
out:
btrfs_free_path(path);
@@ -1897,7 +1871,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_inode_info(sctx->send_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
left_ret = (info.nlink == 0) ? -ENOENT : ret;
left_gen = info.gen;
if (send_gen)
@@ -1908,7 +1882,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
} else {
ret = get_inode_info(sctx->parent_root, ino, &info);
if (ret < 0 && ret != -ENOENT)
- goto out;
+ return ret;
right_ret = (info.nlink == 0) ? -ENOENT : ret;
right_gen = info.gen;
if (parent_gen)
@@ -1953,7 +1927,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
ret = -ENOENT;
}
-out:
return ret;
}
@@ -1967,17 +1940,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
if (ret < 0)
- goto out;
+ return ret;
if (ret == inode_state_no_change ||
ret == inode_state_did_create ||
ret == inode_state_will_delete)
- ret = 1;
- else
- ret = 0;
+ return 1;
-out:
- return ret;
+ return 0;
}
/*
@@ -2326,9 +2296,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*parent_gen = nce->parent_gen;
ret = fs_path_add(dest, nce->name, nce->name_len);
if (ret < 0)
- goto out;
- ret = nce->ret;
- goto out;
+ return ret;
+ return nce->ret;
}
}
@@ -2339,12 +2308,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
*/
ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
if (ret < 0)
- goto out;
+ return ret;
if (!ret) {
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
goto out_cache;
}
@@ -2360,21 +2329,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
ret = get_first_ref(sctx->parent_root, ino,
parent_ino, parent_gen, dest);
if (ret < 0)
- goto out;
+ return ret;
/*
* Check if the ref was overwritten by an inode's ref that was processed
* earlier. If yes, treat as orphan and return 1.
*/
ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
- dest->start, dest->end - dest->start);
+ dest->start, fs_path_len(dest));
if (ret < 0)
- goto out;
+ return ret;
if (ret) {
fs_path_reset(dest);
ret = gen_unique_name(sctx, ino, gen, dest);
if (ret < 0)
- goto out;
+ return ret;
ret = 1;
}
@@ -2383,10 +2352,8 @@ out_cache:
* Store the result of the lookup in the name cache.
*/
nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL);
- if (!nce) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!nce)
+ return -ENOMEM;
nce->entry.key = ino;
nce->entry.gen = gen;
@@ -2404,10 +2371,9 @@ out_cache:
nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
if (nce_ret < 0) {
kfree(nce);
- ret = nce_ret;
+ return nce_ret;
}
-out:
return ret;
}
@@ -2444,6 +2410,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
u64 parent_inode = 0;
u64 parent_gen = 0;
int stop = 0;
+ const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen);
+
+ if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) {
+ if (dest != &sctx->cur_inode_path)
+ return fs_path_copy(dest, &sctx->cur_inode_path);
+
+ return 0;
+ }
name = fs_path_alloc();
if (!name) {
@@ -2495,8 +2469,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
out:
fs_path_free(name);
- if (!ret)
+ if (!ret) {
fs_path_unreverse(dest);
+ if (is_cur_inode && dest != &sctx->cur_inode_path)
+ ret = fs_path_copy(&sctx->cur_inode_path, dest);
+ }
+
return ret;
}
@@ -2591,25 +2569,60 @@ out:
return ret;
}
+static struct fs_path *get_cur_inode_path(struct send_ctx *sctx)
+{
+ if (fs_path_len(&sctx->cur_inode_path) == 0) {
+ int ret;
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ &sctx->cur_inode_path);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+
+ return &sctx->cur_inode_path;
+}
+
+static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen)
+{
+ struct fs_path *path;
+ int ret;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ return get_cur_inode_path(sctx);
+
+ path = fs_path_alloc();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = get_cur_path(sctx, ino, gen, path);
+ if (ret < 0) {
+ fs_path_free(path);
+ return ERR_PTR(ret);
+ }
+
+ return path;
+}
+
+static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path)
+{
+ if (path != &sctx->cur_inode_path)
+ fs_path_free(path);
+}
+
static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
@@ -2617,29 +2630,23 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
@@ -2647,32 +2654,26 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
if (sctx->proto < 2)
return 0;
- btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);
@@ -2680,30 +2681,23 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
- ino, uid, gid);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
@@ -2712,13 +2706,12 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
return ret;
}
static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p = NULL;
struct btrfs_inode_item *ii;
@@ -2727,11 +2720,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
struct btrfs_key key;
int slot;
- btrfs_debug(fs_info, "send_utimes %llu", ino);
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_path_for_command(sctx, ino, gen);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
path = alloc_path_for_send();
if (!path) {
@@ -2756,9 +2747,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, ino, gen, p);
- if (ret < 0)
- goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
@@ -2770,7 +2758,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
tlv_put_failure:
out:
- fs_path_free(p);
+ free_path_for_command(sctx, p);
btrfs_free_path(path);
return ret;
}
@@ -2838,7 +2826,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx)
*/
static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
int cmd;
@@ -2847,8 +2834,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino)
u64 mode;
u64 rdev;
- btrfs_debug(fs_info, "send_create_inode %llu", ino);
-
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -3075,7 +3060,7 @@ static void __free_recorded_refs(struct list_head *head)
struct recorded_ref *cur;
while (!list_empty(head)) {
- cur = list_entry(head->next, struct recorded_ref, list);
+ cur = list_first_entry(head, struct recorded_ref, list);
recorded_ref_free(cur);
}
}
@@ -3106,6 +3091,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
goto out;
ret = send_rename(sctx, path, orphan);
+ if (ret < 0)
+ goto out;
+
+ if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen)
+ ret = fs_path_copy(&sctx->cur_inode_path, orphan);
out:
fs_path_free(orphan);
@@ -4158,6 +4148,23 @@ out:
return ret;
}
+static int rename_current_inode(struct send_ctx *sctx,
+ struct fs_path *current_path,
+ struct fs_path *new_path)
+{
+ int ret;
+
+ ret = send_rename(sctx, current_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ ret = fs_path_copy(&sctx->cur_inode_path, new_path);
+ if (ret < 0)
+ return ret;
+
+ return fs_path_copy(current_path, new_path);
+}
+
/*
* This does all the move/link/unlink/rmdir magic.
*/
@@ -4172,15 +4179,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
u64 ow_inode = 0;
u64 ow_gen;
u64 ow_mode;
- int did_overwrite = 0;
- int is_orphan = 0;
u64 last_dir_ino_rm = 0;
+ bool did_overwrite = false;
+ bool is_orphan = false;
bool can_rename = true;
bool orphanized_dir = false;
bool orphanized_ancestor = false;
- btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
-
/*
* This should never happen as the root dir always has the same ref
* which is always '..'
@@ -4216,14 +4221,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
if (ret)
- did_overwrite = 1;
+ did_overwrite = true;
}
if (sctx->cur_inode_new || did_overwrite) {
ret = gen_unique_name(sctx, sctx->cur_ino,
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
} else {
ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
valid_path);
@@ -4348,6 +4353,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret > 0) {
orphanized_ancestor = true;
fs_path_reset(valid_path);
+ fs_path_reset(&sctx->cur_inode_path);
ret = get_cur_path(sctx, sctx->cur_ino,
sctx->cur_inode_gen,
valid_path);
@@ -4443,13 +4449,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* it depending on the inode mode.
*/
if (is_orphan && can_rename) {
- ret = send_rename(sctx, valid_path, cur->full_path);
- if (ret < 0)
- goto out;
- is_orphan = 0;
- ret = fs_path_copy(valid_path, cur->full_path);
+ ret = rename_current_inode(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
+ is_orphan = false;
} else if (can_rename) {
if (S_ISDIR(sctx->cur_inode_mode)) {
/*
@@ -4457,10 +4460,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = send_rename(sctx, valid_path,
- cur->full_path);
- if (!ret)
- ret = fs_path_copy(valid_path,
+ ret = rename_current_inode(sctx, valid_path,
cur->full_path);
if (ret < 0)
goto out;
@@ -4507,7 +4507,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
sctx->cur_inode_gen, valid_path);
if (ret < 0)
goto out;
- is_orphan = 1;
+ is_orphan = true;
}
list_for_each_entry(cur, &sctx->deleted_refs, list) {
@@ -4520,8 +4520,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
/*
* We have a moved dir. Add the old parent to check_dirs
*/
- cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
- list);
+ cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list);
ret = dup_ref(cur, &check_dirs);
if (ret < 0)
goto out;
@@ -4553,6 +4552,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
+ if (is_current_inode_path(sctx, cur->full_path))
+ fs_path_reset(&sctx->cur_inode_path);
}
ret = dup_ref(cur, &check_dirs);
if (ret < 0)
@@ -4628,7 +4629,6 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
{
const struct recorded_ref *data = k;
const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
- int result;
if (data->dir > ref->dir)
return 1;
@@ -4642,12 +4642,7 @@ static int rbtree_ref_comp(const void *k, const struct rb_node *node)
return 1;
if (data->name_len < ref->name_len)
return -1;
- result = strcmp(data->name, ref->name);
- if (result > 0)
- return 1;
- if (result < 0)
- return -1;
- return 0;
+ return strcmp(data->name, ref->name);
}
static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
@@ -4701,7 +4696,7 @@ out:
static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4710,7 +4705,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4724,13 +4719,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
&sctx->new_refs, name, dir, dir_gen,
sctx);
}
-out:
+
return ret;
}
static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
- int ret = 0;
+ int ret;
struct send_ctx *sctx = ctx;
struct rb_node *node = NULL;
struct recorded_ref data;
@@ -4739,7 +4734,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
if (ret < 0)
- goto out;
+ return ret;
data.dir = dir;
data.dir_gen = dir_gen;
@@ -4753,7 +4748,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx
&sctx->deleted_refs, name, dir,
dir_gen, sctx);
}
-out:
+
return ret;
}
@@ -4764,11 +4759,9 @@ static int record_new_ref(struct send_ctx *sctx)
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_deleted_ref(struct send_ctx *sctx)
@@ -4779,29 +4772,25 @@ static int record_deleted_ref(struct send_ctx *sctx)
sctx->cmp_key, 0, record_deleted_ref_if_needed,
sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
static int record_changed_ref(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
if (ret < 0)
- goto out;
+ return ret;
ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
if (ret < 0)
- goto out;
- ret = 0;
+ return ret;
-out:
- return ret;
+ return 0;
}
/*
@@ -4869,15 +4858,19 @@ out:
}
static int send_set_xattr(struct send_ctx *sctx,
- struct fs_path *path,
const char *name, int name_len,
const char *data, int data_len)
{
- int ret = 0;
+ struct fs_path *path;
+ int ret;
+
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4886,7 +4879,6 @@ static int send_set_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4894,11 +4886,11 @@ static int send_remove_xattr(struct send_ctx *sctx,
struct fs_path *path,
const char *name, int name_len)
{
- int ret = 0;
+ int ret;
ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
@@ -4906,7 +4898,6 @@ static int send_remove_xattr(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
@@ -4914,19 +4905,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len, const char *data,
int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
/* Capabilities are emitted by finish_inode_if_needed */
if (!strncmp(name, XATTR_NAME_CAPS, name_len))
return 0;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
/*
* This hack is needed because empty acls are stored as zero byte
* data in xattrs. Problem with that is, that receiving these zero byte
@@ -4943,48 +4928,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
}
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
- ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
-
-out:
- fs_path_free(p);
- return ret;
+ return send_set_xattr(sctx, name, name_len, data, data_len);
}
static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
const char *name, int name_len,
const char *data, int data_len, void *ctx)
{
- int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
- ret = send_remove_xattr(sctx, p, name, name_len);
-
-out:
- fs_path_free(p);
- return ret;
+ return send_remove_xattr(sctx, p, name, name_len);
}
static int process_new_xattr(struct send_ctx *sctx)
{
- int ret = 0;
-
- ret = iterate_dir_item(sctx->send_root, sctx->left_path,
- __process_new_xattr, sctx);
-
- return ret;
+ return iterate_dir_item(sctx->send_root, sctx->left_path,
+ __process_new_xattr, sctx);
}
static int process_deleted_xattr(struct send_ctx *sctx)
@@ -5100,17 +5064,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
static int process_changed_xattr(struct send_ctx *sctx)
{
- int ret = 0;
+ int ret;
ret = iterate_dir_item(sctx->send_root, sctx->left_path,
__process_changed_new_xattr, sctx);
if (ret < 0)
- goto out;
- ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
- __process_changed_deleted_xattr, sctx);
+ return ret;
-out:
- return ret;
+ return iterate_dir_item(sctx->parent_root, sctx->right_path,
+ __process_changed_deleted_xattr, sctx);
}
static int process_all_new_xattrs(struct send_ctx *sctx)
@@ -5157,7 +5119,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
@@ -5172,21 +5134,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
return ret;
}
static int process_verity(struct send_ctx *sctx)
{
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *p;
inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
if (IS_ERR(inode))
return PTR_ERR(inode);
- ret = btrfs_get_verity_descriptor(inode, NULL, 0);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0);
if (ret < 0)
goto iput;
@@ -5203,27 +5164,19 @@ static int process_verity(struct send_ctx *sctx)
}
}
- ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
+ ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret);
if (ret < 0)
goto iput;
- p = fs_path_alloc();
- if (!p) {
- ret = -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p)) {
+ ret = PTR_ERR(p);
goto iput;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto free_path;
ret = send_verity(sctx, p, sctx->verity_descriptor);
- if (ret < 0)
- goto free_path;
-
-free_path:
- fs_path_free(p);
iput:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5263,10 +5216,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct folio *folio;
- pgoff_t index = offset >> PAGE_SHIFT;
- pgoff_t last_index;
- unsigned pg_offset = offset_in_page(offset);
+ u64 cur = offset;
+ const u64 end = offset + len;
+ const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT);
struct address_space *mapping = sctx->cur_inode->i_mapping;
int ret;
@@ -5274,13 +5226,12 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
- last_index = (offset + len - 1) >> PAGE_SHIFT;
+ while (cur < end) {
+ pgoff_t index = (cur >> PAGE_SHIFT);
+ unsigned int cur_len;
+ unsigned int pg_offset;
+ struct folio *folio;
- while (index <= last_index) {
- unsigned cur_len = min_t(unsigned, len,
- PAGE_SIZE - pg_offset);
-
-again:
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
page_cache_sync_readahead(mapping,
@@ -5293,8 +5244,8 @@ again:
break;
}
}
-
- WARN_ON(folio_order(folio));
+ pg_offset = offset_in_folio(folio, cur);
+ cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset);
if (folio_test_readahead(folio))
page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
@@ -5316,7 +5267,7 @@ again:
if (folio->mapping != mapping) {
folio_unlock(folio);
folio_put(folio);
- goto again;
+ continue;
}
}
@@ -5324,9 +5275,7 @@ again:
pg_offset, cur_len);
folio_unlock(folio);
folio_put(folio);
- index++;
- pg_offset = 0;
- len -= cur_len;
+ cur += cur_len;
sctx->send_size += cur_len;
}
@@ -5339,35 +5288,26 @@ again:
*/
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
- struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
ret = put_file_data(sctx, offset, len);
if (ret < 0)
- goto out;
+ return ret;
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5380,12 +5320,12 @@ static int send_clone(struct send_ctx *sctx,
{
int ret = 0;
struct fs_path *p;
+ struct fs_path *cur_inode_path;
u64 gen;
- btrfs_debug(sctx->send_root->fs_info,
- "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, btrfs_root_id(clone_root->root),
- clone_root->ino, clone_root->offset);
+ cur_inode_path = get_cur_inode_path(sctx);
+ if (IS_ERR(cur_inode_path))
+ return PTR_ERR(cur_inode_path);
p = fs_path_alloc();
if (!p)
@@ -5395,13 +5335,9 @@ static int send_clone(struct send_ctx *sctx,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto out;
-
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
- TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path);
if (clone_root->root == sctx->send_root) {
ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
@@ -5452,27 +5388,45 @@ static int send_update_extent(struct send_ctx *sctx,
int ret = 0;
struct fs_path *p;
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
if (ret < 0)
- goto out;
+ return ret;
+
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
+
+ ret = send_cmd(sctx);
+
+tlv_put_failure:
+ return ret;
+}
+
+static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len)
+{
+ struct fs_path *path;
+ int ret;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ path = get_cur_inode_path(sctx);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE);
if (ret < 0)
- goto out;
+ return ret;
- TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
+ TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode);
TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(p);
return ret;
}
@@ -5484,6 +5438,14 @@ static int send_hole(struct send_ctx *sctx, u64 end)
int ret = 0;
/*
+ * Starting with send stream v2 we have fallocate and can use it to
+ * punch holes instead of sending writes full of zeroes.
+ */
+ if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE))
+ return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ offset, end - offset);
+
+ /*
* A hole that starts at EOF or beyond it. Since we do not yet support
* fallocate (for extent preallocation and hole punching), sending a
* write of zeroes starting at EOF or beyond would later require issuing
@@ -5501,12 +5463,10 @@ static int send_hole(struct send_ctx *sctx, u64 end)
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, end - offset);
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
- if (ret < 0)
- goto tlv_put_failure;
+ p = get_cur_inode_path(sctx);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
while (offset < end) {
u64 len = min(end - offset, read_size);
@@ -5527,7 +5487,6 @@ static int send_hole(struct send_ctx *sctx, u64 end)
}
sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
- fs_path_free(p);
return ret;
}
@@ -5535,9 +5494,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
struct btrfs_path *path, u64 offset,
u64 len)
{
- struct btrfs_root *root = sctx->send_root;
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5546,23 +5503,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
size_t inline_size;
int ret;
- inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
-
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
- goto out;
- }
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath))
+ return PTR_ERR(fspath);
ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
+ return ret;
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
@@ -5578,12 +5525,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = btrfs_encoded_io_compression_from_extent(fs_info,
btrfs_file_extent_compression(leaf, ei));
if (ret < 0)
- goto out;
+ return ret;
TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
ret = put_data_header(sctx, inline_size);
if (ret < 0)
- goto out;
+ return ret;
read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
btrfs_file_extent_inline_start(ei), inline_size);
sctx->send_size += inline_size;
@@ -5591,9 +5538,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx,
ret = send_cmd(sctx);
tlv_put_failure:
-out:
- fs_path_free(fspath);
- iput(inode);
return ret;
}
@@ -5602,7 +5546,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fs_path *fspath;
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_key key;
@@ -5617,9 +5561,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (IS_ERR(inode))
return PTR_ERR(inode);
- fspath = fs_path_alloc();
- if (!fspath) {
- ret = -ENOMEM;
+ fspath = get_cur_inode_path(sctx);
+ if (IS_ERR(fspath)) {
+ ret = PTR_ERR(fspath);
goto out;
}
@@ -5627,10 +5571,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
if (ret < 0)
goto out;
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
@@ -5672,7 +5612,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
+ ret = btrfs_encoded_read_regular_fill_pages(inode,
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
(data_offset >> PAGE_SHIFT),
@@ -5698,8 +5638,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
tlv_put_failure:
out:
- fs_path_free(fspath);
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -5741,15 +5680,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
}
if (sctx->cur_inode == NULL) {
+ struct btrfs_inode *btrfs_inode;
struct btrfs_root *root = sctx->send_root;
- sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
- if (IS_ERR(sctx->cur_inode)) {
- int err = PTR_ERR(sctx->cur_inode);
+ btrfs_inode = btrfs_iget(sctx->cur_ino, root);
+ if (IS_ERR(btrfs_inode))
+ return PTR_ERR(btrfs_inode);
- sctx->cur_inode = NULL;
- return err;
- }
+ sctx->cur_inode = &btrfs_inode->vfs_inode;
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
@@ -5828,7 +5766,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
*/
static int send_capabilities(struct send_ctx *sctx)
{
- struct fs_path *fspath = NULL;
struct btrfs_path *path;
struct btrfs_dir_item *di;
struct extent_buffer *leaf;
@@ -5854,25 +5791,19 @@ static int send_capabilities(struct send_ctx *sctx)
leaf = path->nodes[0];
buf_len = btrfs_dir_data_len(leaf, di);
- fspath = fs_path_alloc();
buf = kmalloc(buf_len, GFP_KERNEL);
- if (!fspath || !buf) {
+ if (!buf) {
ret = -ENOMEM;
goto out;
}
- ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
- if (ret < 0)
- goto out;
-
data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
read_extent_buffer(leaf, buf, data_ptr, buf_len);
- ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ ret = send_set_xattr(sctx, XATTR_NAME_CAPS,
strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
kfree(buf);
- fs_path_free(fspath);
btrfs_free_path(path);
return ret;
}
@@ -6898,6 +6829,7 @@ static int changed_inode(struct send_ctx *sctx,
sctx->cur_inode_last_extent = (u64)-1;
sctx->cur_inode_next_write_offset = 0;
sctx->ignore_cur_inode = false;
+ fs_path_reset(&sctx->cur_inode_path);
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -8107,10 +8039,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root)
btrfs_root_id(root), root->dedupe_in_progress);
}
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg)
{
int ret = 0;
- struct btrfs_root *send_root = inode->root;
struct btrfs_fs_info *fs_info = send_root->fs_info;
struct btrfs_root *clone_root;
struct send_ctx *sctx = NULL;
@@ -8173,6 +8104,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
goto out;
}
+ init_path(&sctx->cur_inode_path);
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
@@ -8449,6 +8381,9 @@ out:
btrfs_lru_cache_clear(&sctx->dir_created_cache);
btrfs_lru_cache_clear(&sctx->dir_utimes_cache);
+ if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf)
+ kfree(sctx->cur_inode_path.buf);
+
kfree(sctx);
}
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 9309886c5ea1..652bb28f63d4 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -11,7 +11,7 @@
#include <linux/sizes.h>
#include <linux/align.h>
-struct btrfs_inode;
+struct btrfs_root;
struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
@@ -182,6 +182,6 @@ enum {
__BTRFS_SEND_A_MAX = 35,
};
-long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg);
+long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg);
#endif
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index a341d087567a..0481c693ac2e 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/spinlock.h"
+#include <linux/spinlock.h>
#include <linux/minmax.h>
#include "misc.h"
#include "ctree.h"
@@ -50,11 +50,11 @@
* num_bytes we want to reserve.
*
* ->reserve
- * space_info->bytes_may_reserve += num_bytes
+ * space_info->bytes_may_use += num_bytes
*
* ->extent allocation
* Call btrfs_add_reserved_bytes() which does
- * space_info->bytes_may_reserve -= num_bytes
+ * space_info->bytes_may_use -= num_bytes
* space_info->bytes_reserved += extent_bytes
*
* ->insert reference
@@ -234,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
WRITE_ONCE(space_info->chunk_size, chunk_size);
}
-static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+static void init_space_info(struct btrfs_fs_info *info,
+ struct btrfs_space_info *space_info, u64 flags)
{
-
- struct btrfs_space_info *space_info;
- int i;
- int ret;
-
- space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
- if (!space_info)
- return -ENOMEM;
-
space_info->fs_info = info;
- for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
INIT_LIST_HEAD(&space_info->block_groups[i]);
init_rwsem(&space_info->groups_sem);
spin_lock_init(&space_info->lock);
@@ -257,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
+ space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY;
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
+}
+
+static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags,
+ enum btrfs_space_info_sub_group id, int index)
+{
+ struct btrfs_fs_info *fs_info = parent->fs_info;
+ struct btrfs_space_info *sub_group;
+ int ret;
+
+ ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
+ ASSERT(id != BTRFS_SUB_GROUP_PRIMARY);
+
+ sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS);
+ if (!sub_group)
+ return -ENOMEM;
+
+ init_space_info(fs_info, sub_group, flags);
+ parent->sub_group[index] = sub_group;
+ sub_group->parent = parent;
+ sub_group->subgroup_id = id;
+
+ ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group);
+ if (ret) {
+ kfree(sub_group);
+ parent->sub_group[index] = NULL;
+ }
+ return ret;
+}
+
+static int create_space_info(struct btrfs_fs_info *info, u64 flags)
+{
+
+ struct btrfs_space_info *space_info;
+ int ret = 0;
+
+ space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
+ if (!space_info)
+ return -ENOMEM;
+
+ init_space_info(info, space_info, flags);
+
+ if (btrfs_is_zoned(info)) {
+ if (flags & BTRFS_BLOCK_GROUP_DATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ 0);
+ else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+ ret = create_space_info_sub_group(space_info, flags,
+ BTRFS_SUB_GROUP_TREELOG,
+ 0);
+
+ if (ret)
+ return ret;
+ }
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
@@ -312,31 +359,29 @@ out:
void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
struct btrfs_block_group *block_group)
{
- struct btrfs_space_info *found;
+ struct btrfs_space_info *space_info = block_group->space_info;
int factor, index;
factor = btrfs_bg_type_to_factor(block_group->flags);
- found = btrfs_find_space_info(info, block_group->flags);
- ASSERT(found);
- spin_lock(&found->lock);
- found->total_bytes += block_group->length;
- found->disk_total += block_group->length * factor;
- found->bytes_used += block_group->used;
- found->disk_used += block_group->used * factor;
- found->bytes_readonly += block_group->bytes_super;
- btrfs_space_info_update_bytes_zone_unusable(found, block_group->zone_unusable);
+ spin_lock(&space_info->lock);
+ space_info->total_bytes += block_group->length;
+ space_info->disk_total += block_group->length * factor;
+ space_info->bytes_used += block_group->used;
+ space_info->disk_used += block_group->used * factor;
+ space_info->bytes_readonly += block_group->bytes_super;
+ btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable);
if (block_group->length > 0)
- found->full = 0;
- btrfs_try_granting_tickets(info, found);
- spin_unlock(&found->lock);
+ space_info->full = 0;
+ btrfs_try_granting_tickets(info, space_info);
+ spin_unlock(&space_info->lock);
- block_group->space_info = found;
+ block_group->space_info = space_info;
index = btrfs_bg_flags_to_raid_index(block_group->flags);
- down_write(&found->groups_sem);
- list_add_tail(&block_group->list, &found->block_groups[index]);
- up_write(&found->groups_sem);
+ down_write(&space_info->groups_sem);
+ list_add_tail(&block_group->list, &space_info->block_groups[index]);
+ up_write(&space_info->groups_sem);
}
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
@@ -556,8 +601,9 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
lockdep_assert_held(&info->lock);
/* The free space could be negative in case of overcommit */
- btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
- flag_str,
+ btrfs_info(fs_info,
+ "space_info %s (sub-group id %d) has %lld free, is %sfull",
+ flag_str, info->subgroup_id,
(s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
@@ -569,7 +615,7 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info,
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups)
+ bool dump_block_groups)
{
struct btrfs_block_group *cache;
u64 total_avail = 0;
@@ -812,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
ret = PTR_ERR(trans);
break;
}
- ret = btrfs_chunk_alloc(trans,
+ ret = btrfs_chunk_alloc(trans, space_info,
btrfs_get_alloc_profile(fs_info, space_info->flags),
(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
CHUNK_ALLOC_FORCE);
@@ -1083,23 +1129,15 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
return (tickets_id != space_info->tickets_id);
}
-/*
- * This is for normal flushers, we can wait all goddamned day if we want to. We
- * will loop and continuously try to flush as long as we are making progress.
- * We count progress as clearing off tickets each time we have to loop.
- */
-static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 to_reclaim;
enum btrfs_flush_state flush_state;
int commit_cycles = 0;
u64 last_tickets_id;
enum btrfs_flush_state final_state;
- fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
- space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
if (btrfs_is_zoned(fs_info))
final_state = RESET_ZONES;
else
@@ -1174,6 +1212,25 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
}
/*
+ * This is for normal flushers, it can wait as much time as needed. We will
+ * loop and continuously try to flush as long as we are making progress. We
+ * count progress as clearing off tickets each time we have to loop.
+ */
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ do_async_reclaim_metadata_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
+ if (space_info->sub_group[i])
+ do_async_reclaim_metadata_space(space_info->sub_group[i]);
+ }
+}
+
+/*
* This handles pre-flushing of metadata space before we get to the point that
* we need to start blocking threads on tickets. The logic here is different
* from the other flush paths because it doesn't rely on tickets to tell us how
@@ -1318,16 +1375,12 @@ static const enum btrfs_flush_state data_flush_states[] = {
ALLOC_CHUNK_FORCE,
};
-static void btrfs_async_reclaim_data_space(struct work_struct *work)
+static void do_async_reclaim_data_space(struct btrfs_space_info *space_info)
{
- struct btrfs_fs_info *fs_info;
- struct btrfs_space_info *space_info;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
u64 last_tickets_id;
enum btrfs_flush_state flush_state = 0;
- fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
- space_info = fs_info->data_sinfo;
-
spin_lock(&space_info->lock);
if (list_empty(&space_info->tickets)) {
space_info->flush = 0;
@@ -1395,6 +1448,19 @@ aborted_fs:
spin_unlock(&space_info->lock);
}
+static void btrfs_async_reclaim_data_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
+ space_info = fs_info->data_sinfo;
+ do_async_reclaim_data_space(space_info);
+ for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++)
+ if (space_info->sub_group[i])
+ do_async_reclaim_data_space(space_info->sub_group[i]);
+}
+
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
{
INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
@@ -1821,7 +1887,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
space_info->flags, orig_bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, orig_bytes, false);
}
return ret;
}
@@ -1836,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* This will reserve bytes from the data space info. If there is not enough
* space then we will attempt to flush space as specified by flush.
*/
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_fs_info *fs_info = space_info->fs_info;
int ret;
ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
@@ -1847,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
flush == BTRFS_RESERVE_NO_FLUSH);
ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);
- ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
+ ret = __reserve_bytes(fs_info, space_info, bytes, flush);
if (ret == -ENOSPC) {
trace_btrfs_space_reservation(fs_info, "space_info:enospc",
- data_sinfo->flags, bytes, 1);
+ space_info->flags, bytes, 1);
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
- btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
+ btrfs_dump_space_info(fs_info, space_info, bytes, false);
}
return ret;
}
@@ -1907,13 +1973,13 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
static u64 calc_pct_ratio(u64 x, u64 y)
{
- int err;
+ int ret;
if (!y)
return 0;
again:
- err = check_mul_overflow(100, x, &x);
- if (err)
+ ret = check_mul_overflow(100, x, &x);
+ if (ret)
goto lose_precision;
return div64_u64(x, y);
lose_precision:
@@ -2073,7 +2139,7 @@ void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool
}
}
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
+static bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info)
{
bool ret;
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index a96efdb5e681..679f22efb407 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -98,8 +98,18 @@ enum btrfs_flush_state {
RESET_ZONES = 12,
};
+enum btrfs_space_info_sub_group {
+ BTRFS_SUB_GROUP_PRIMARY,
+ BTRFS_SUB_GROUP_DATA_RELOC,
+ BTRFS_SUB_GROUP_TREELOG,
+};
+
+#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1
struct btrfs_space_info {
struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *parent;
+ struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX];
+ int subgroup_id;
spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
@@ -268,7 +278,7 @@ u64 __pure btrfs_space_info_used(const struct btrfs_space_info *s_info,
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *info, u64 bytes,
- int dump_block_groups);
+ bool dump_block_groups);
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
@@ -288,7 +298,7 @@ static inline void btrfs_space_info_free_bytes_may_use(
btrfs_try_granting_tickets(space_info->fs_info, space_info);
spin_unlock(&space_info->lock);
}
-int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
+int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
@@ -296,7 +306,6 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes);
void btrfs_set_periodic_reclaim_ready(struct btrfs_space_info *space_info, bool ready);
-bool btrfs_should_periodic_reclaim(struct btrfs_space_info *space_info);
int btrfs_calc_reclaim_threshold(const struct btrfs_space_info *space_info);
void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info);
void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len);
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 722acf768396..cb4f97833dc3 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -2,12 +2,11 @@
#include <linux/slab.h>
#include "messages.h"
-#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"
/*
- * Subpage (sectorsize < PAGE_SIZE) support overview:
+ * Subpage (block size < folio size) support overview:
*
* Limitations:
*
@@ -50,7 +49,7 @@
* Implementation:
*
* - Common
- * Both metadata and data will use a new structure, btrfs_subpage, to
+ * Both metadata and data will use a new structure, btrfs_folio_state, to
* record the status of each sector inside a page. This provides the extra
* granularity needed.
*
@@ -64,34 +63,14 @@
* This means a slightly higher tree locking latency.
*/
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type)
{
- if (fs_info->sectorsize >= PAGE_SIZE)
- return false;
+ struct btrfs_folio_state *bfs;
- /*
- * Only data pages (either through DIO or compression) can have no
- * mapping. And if page->mapping->host is data inode, it's subpage.
- * As we have ruled our sectorsize >= PAGE_SIZE case already.
- */
- if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host)))
- return true;
-
- /*
- * Now the only remaining case is metadata, which we only go subpage
- * routine if nodesize < PAGE_SIZE.
- */
- if (fs_info->nodesize < PAGE_SIZE)
- return true;
- return false;
-}
-#endif
-
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type)
-{
- struct btrfs_subpage *subpage;
+ /* For metadata we don't support large folio yet. */
+ if (type == BTRFS_SUBPAGE_METADATA)
+ ASSERT(!folio_test_large(folio));
/*
* We have cases like a dummy extent buffer page, which is not mapped
@@ -101,40 +80,50 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(folio_test_locked(folio));
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
+ if (folio_test_private(folio))
+ return 0;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return 0;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return 0;
- subpage = btrfs_alloc_subpage(fs_info, type);
- if (IS_ERR(subpage))
- return PTR_ERR(subpage);
+ bfs = btrfs_alloc_folio_state(fs_info, folio_size(folio), type);
+ if (IS_ERR(bfs))
+ return PTR_ERR(bfs);
- folio_attach_private(folio, subpage);
+ folio_attach_private(folio, bfs);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
/* Either not subpage, or the folio already has private attached. */
- if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
+ if (!folio_test_private(folio))
+ return;
+ if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info))
+ return;
+ if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_detach_private(folio);
- ASSERT(subpage);
- btrfs_free_subpage(subpage);
+ bfs = folio_detach_private(folio);
+ ASSERT(bfs);
+ btrfs_free_folio_state(bfs);
}
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type)
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type)
{
- struct btrfs_subpage *ret;
+ struct btrfs_folio_state *ret;
unsigned int real_size;
- ASSERT(fs_info->sectorsize < PAGE_SIZE);
+ ASSERT(fs_info->sectorsize < fsize);
real_size = struct_size(ret, bitmaps,
- BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page));
+ BITS_TO_LONGS(btrfs_bitmap_nr_max *
+ (fsize >> fs_info->sectorsize_bits)));
ret = kzalloc(real_size, GFP_NOFS);
if (!ret)
return ERR_PTR(-ENOMEM);
@@ -147,11 +136,6 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ret;
}
-void btrfs_free_subpage(struct btrfs_subpage *subpage)
-{
- kfree(subpage);
-}
-
/*
* Increase the eb_refs of current subpage.
*
@@ -163,39 +147,36 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
*/
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- atomic_inc(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ atomic_inc(&bfs->eb_refs);
}
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
- if (!btrfs_is_subpage(fs_info, folio->mapping))
+ if (!btrfs_meta_is_subpage(fs_info))
return;
ASSERT(folio_test_private(folio) && folio->mapping);
lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = folio_get_private(folio);
- ASSERT(atomic_read(&subpage->eb_refs));
- atomic_dec(&subpage->eb_refs);
+ bfs = folio_get_private(folio);
+ ASSERT(atomic_read(&bfs->eb_refs));
+ atomic_dec(&bfs->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- /* For subpage support, the folio must be single page. */
- ASSERT(folio_order(folio) == 0);
-
/* Basic checks */
ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
@@ -205,17 +186,20 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
* unmapped page like dummy extent buffer pages.
*/
if (folio->mapping)
- ASSERT(folio_pos(folio) <= start &&
- start + len <= folio_pos(folio) + PAGE_SIZE);
+ ASSERT(folio_pos(folio) <= start && start + len <= folio_end(folio),
+ "start=%llu len=%u folio_pos=%llu folio_size=%zu",
+ start, len, folio_pos(folio), folio_size(folio));
}
#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
- unsigned int __start_bit; \
+ unsigned int __start_bit; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
\
btrfs_subpage_assert(fs_info, folio, start, len); \
- __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
- __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \
+ __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \
+ __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \
__start_bit; \
})
@@ -233,14 +217,13 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
- orig_start + orig_len) - *start;
+ *len = min_t(u64, folio_end(folio), orig_start + orig_len) - *start;
}
static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
const int nbits = (len >> fs_info->sectorsize_bits);
unsigned long flags;
@@ -250,7 +233,7 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
btrfs_subpage_assert(fs_info, folio, start, len);
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/*
* We have call sites passing @lock_page into
* extent_clear_unlock_delalloc() for compression path.
@@ -258,18 +241,18 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
* This @locked_page is locked by plain lock_page(), thus its
* subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
- spin_unlock_irqrestore(&subpage->lock, flags);
+ if (atomic_read(&bfs->nr_locked) == 0) {
+ spin_unlock_irqrestore(&bfs->lock, flags);
return true;
}
- for_each_set_bit_from(bit, subpage->bitmaps, start_bit + nbits) {
- clear_bit(bit, subpage->bitmaps);
+ for_each_set_bit_from(bit, bfs->bitmaps, start_bit + nbits) {
+ clear_bit(bit, bfs->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -292,11 +275,11 @@ static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
@@ -308,7 +291,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
* Since we own the page lock, no one else could touch subpage::locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
@@ -322,86 +305,99 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, unsigned long bitmap)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
+ const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked;
unsigned long flags;
bool last = false;
int cleared = 0;
int bit;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
folio_unlock(folio);
return;
}
- if (atomic_read(&subpage->nr_locked) == 0) {
+ if (atomic_read(&bfs->nr_locked) == 0) {
/* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
- spin_lock_irqsave(&subpage->lock, flags);
- for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) {
- if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
+ spin_lock_irqsave(&bfs->lock, flags);
+ for_each_set_bit(bit, &bitmap, blocks_per_folio) {
+ if (test_and_clear_bit(bit + start_bit, bfs->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->nr_locked);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(atomic_read(&bfs->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &bfs->nr_locked);
+ spin_unlock_irqrestore(&bfs->lock, flags);
if (last)
folio_unlock(folio);
}
-#define subpage_test_bitmap_all_set(fs_info, subpage, name) \
- bitmap_test_range_all_set(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+#define subpage_test_bitmap_all_set(fs_info, folio, name) \
+({ \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ bitmap_test_range_all_set(bfs->bitmaps, \
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+})
-#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \
- bitmap_test_range_all_zero(subpage->bitmaps, \
- fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \
- fs_info->sectors_per_page)
+#define subpage_test_bitmap_all_zero(fs_info, folio, name) \
+({ \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ \
+ bitmap_test_range_all_zero(bfs->bitmaps, \
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
+})
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, folio, uptodate))
folio_mark_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_uptodate(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_unlock_irqrestore(&bfs->lock, flags);
folio_mark_dirty(folio);
}
@@ -418,17 +414,17 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, folio, dirty))
last = true;
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
return last;
}
@@ -445,91 +441,108 @@ void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+
+ /*
+ * Don't clear the TOWRITE tag when starting writeback on a still-dirty
+ * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
+ * assume writeback is complete, and exit too early — violating sync
+ * ordering guarantees.
+ */
if (!folio_test_writeback(folio))
- folio_start_writeback(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ __folio_start_writeback(folio, true);
+ if (!folio_test_dirty(folio)) {
+ struct address_space *mapping = folio_mapping(folio);
+ XA_STATE(xas, &mapping->i_pages, folio->index);
+ unsigned long flags;
+
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
+ }
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) {
ASSERT(folio_test_writeback(folio));
folio_end_writeback(folio);
}
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_set_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_zero(fs_info, folio, ordered))
folio_clear_ordered(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ if (subpage_test_bitmap_all_set(fs_info, folio, checked))
folio_set_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = folio_get_private(folio);
+ struct btrfs_folio_state *bfs = folio_get_private(folio);
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
- spin_lock_irqsave(&subpage->lock, flags);
- bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
+ spin_lock_irqsave(&bfs->lock, flags);
+ bitmap_clear(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
folio_clear_checked(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -540,16 +553,16 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = folio_get_private(folio); \
+ struct btrfs_folio_state *bfs = folio_get_private(folio); \
unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
\
- spin_lock_irqsave(&subpage->lock, flags); \
- ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \
+ spin_lock_irqsave(&bfs->lock, flags); \
+ ret = bitmap_test_range_all_set(bfs->bitmaps, start_bit, \
len >> fs_info->sectorsize_bits); \
- spin_unlock_irqrestore(&subpage->lock, flags); \
+ spin_unlock_irqrestore(&bfs->lock, flags); \
return ret; \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
@@ -569,7 +582,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -579,7 +592,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -589,7 +602,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
@@ -597,7 +610,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_set_func(folio); \
return; \
} \
@@ -608,7 +621,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ !btrfs_is_subpage(fs_info, folio)) { \
folio_clear_func(folio); \
return; \
} \
@@ -619,10 +632,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || \
- !btrfs_is_subpage(fs_info, folio->mapping)) \
+ !btrfs_is_subpage(fs_info, folio)) \
return folio_test_func(folio); \
btrfs_subpage_clamp_range(folio, &start, &len); \
return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+} \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_set_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) { \
+ folio_clear_func(folio); \
+ return; \
+ } \
+ btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \
+} \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \
+{ \
+ if (!btrfs_meta_is_subpage(eb->fs_info)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
folio_test_uptodate);
@@ -635,26 +670,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
folio_test_checked);
-#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
+#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \
{ \
- const int sectors_per_page = fs_info->sectors_per_page; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
+ const struct btrfs_folio_state *bfs = folio_get_private(folio); \
\
- ASSERT(sectors_per_page < BITS_PER_LONG); \
- *dst = bitmap_read(subpage->bitmaps, \
- sectors_per_page * btrfs_bitmap_nr_##name, \
- sectors_per_page); \
+ ASSERT(blocks_per_folio <= BITS_PER_LONG); \
+ *dst = bitmap_read(bfs->bitmaps, \
+ blocks_per_folio * btrfs_bitmap_nr_##name, \
+ blocks_per_folio); \
}
#define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \
{ \
- const struct btrfs_subpage *subpage = folio_get_private(folio); \
unsigned long bitmap; \
+ const unsigned int blocks_per_folio = \
+ btrfs_blocks_per_folio(fs_info, folio); \
\
- GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \
+ GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \
btrfs_warn(fs_info, \
"dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \
start, len, folio_pos(folio), \
- fs_info->sectors_per_page, &bitmap); \
+ blocks_per_folio, &bitmap); \
}
/*
@@ -664,7 +702,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned int start_bit;
unsigned int nbits;
unsigned long flags;
@@ -672,22 +710,22 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio)) {
ASSERT(!folio_test_dirty(folio));
return;
}
start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len);
nbits = len >> fs_info->sectorsize_bits;
- subpage = folio_get_private(folio);
- ASSERT(subpage);
- spin_lock_irqsave(&subpage->lock, flags);
- if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ bfs = folio_get_private(folio);
+ ASSERT(bfs);
+ spin_lock_irqsave(&bfs->lock, flags);
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
SUBPAGE_DUMP_BITMAP(fs_info, folio, dirty, start, len);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
}
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- spin_unlock_irqrestore(&subpage->lock, flags);
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
/*
@@ -700,36 +738,58 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
unsigned int start_bit;
unsigned int nbits;
int ret;
ASSERT(folio_test_locked(folio));
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping))
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio))
return;
- subpage = folio_get_private(folio);
+ bfs = folio_get_private(folio);
start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
nbits = len >> fs_info->sectorsize_bits;
- spin_lock_irqsave(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
/* Target range should not yet be locked. */
- if (unlikely(!bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits))) {
+ if (unlikely(!bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits))) {
SUBPAGE_DUMP_BITMAP(fs_info, folio, locked, start, len);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
+ ASSERT(bitmap_test_range_all_zero(bfs->bitmaps, start_bit, nbits));
}
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->nr_locked);
- ASSERT(ret <= fs_info->sectors_per_page);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ bitmap_set(bfs->bitmaps, start_bit, nbits);
+ ret = atomic_add_return(nbits, &bfs->nr_locked);
+ ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio));
+ spin_unlock_irqrestore(&bfs->lock, flags);
+}
+
+/*
+ * Clear the dirty flag for the folio.
+ *
+ * If the affected folio is no longer dirty, return true. Otherwise return false.
+ */
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb)
+{
+ bool last;
+
+ if (!btrfs_meta_is_subpage(eb->fs_info)) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+
+ last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len);
+ if (last) {
+ folio_clear_dirty_for_io(folio);
+ return true;
+ }
+ return false;
}
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage;
- const u32 sectors_per_page = fs_info->sectors_per_page;
+ struct btrfs_folio_state *bfs;
+ const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
unsigned long uptodate_bitmap;
unsigned long dirty_bitmap;
unsigned long writeback_bitmap;
@@ -739,42 +799,42 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(sectors_per_page > 1);
- subpage = folio_get_private(folio);
-
- spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap);
- GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
-
- dump_page(folio_page(folio, 0), "btrfs subpage dump");
+ ASSERT(blocks_per_folio > 1);
+ bfs = folio_get_private(folio);
+
+ spin_lock_irqsave(&bfs->lock, flags);
+ GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap);
+ GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap);
+ spin_unlock_irqrestore(&bfs->lock, flags);
+
+ dump_page(folio_page(folio, 0), "btrfs folio state dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
start, len, folio_pos(folio),
- sectors_per_page, &uptodate_bitmap,
- sectors_per_page, &dirty_bitmap,
- sectors_per_page, &locked_bitmap,
- sectors_per_page, &writeback_bitmap,
- sectors_per_page, &ordered_bitmap,
- sectors_per_page, &checked_bitmap);
+ blocks_per_folio, &uptodate_bitmap,
+ blocks_per_folio, &dirty_bitmap,
+ blocks_per_folio, &locked_bitmap,
+ blocks_per_folio, &writeback_bitmap,
+ blocks_per_folio, &ordered_bitmap,
+ blocks_per_folio, &checked_bitmap);
}
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap)
{
- struct btrfs_subpage *subpage;
+ struct btrfs_folio_state *bfs;
unsigned long flags;
ASSERT(folio_test_private(folio) && folio_get_private(folio));
- ASSERT(fs_info->sectors_per_page > 1);
- subpage = folio_get_private(folio);
+ ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1);
+ bfs = folio_get_private(folio);
- spin_lock_irqsave(&subpage->lock, flags);
- GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap);
- spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_lock_irqsave(&bfs->lock, flags);
+ GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap);
+ spin_unlock_irqrestore(&bfs->lock, flags);
}
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 44fff1f4eac4..ee0710eb13fd 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -6,10 +6,11 @@
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/sizes.h>
+#include "btrfs_inode.h"
+#include "fs.h"
struct address_space;
struct folio;
-struct btrfs_fs_info;
/*
* Extra info for subpapge bitmap.
@@ -31,9 +32,31 @@ struct btrfs_fs_info;
enum {
btrfs_bitmap_nr_uptodate = 0,
btrfs_bitmap_nr_dirty,
+
+ /*
+ * This can be changed to atomic eventually. But this change will rely
+ * on the async delalloc range rework for locked bitmap. As async
+ * delalloc can unlock its range and mark blocks writeback at random
+ * timing.
+ */
btrfs_bitmap_nr_writeback,
+
+ /*
+ * The ordered and checked flags are for COW fixup, already marked
+ * deprecated, and will be removed eventually.
+ */
btrfs_bitmap_nr_ordered,
btrfs_bitmap_nr_checked,
+
+ /*
+ * The locked bit is for async delalloc range (compression), currently
+ * async extent is queued with the range locked, until the compression
+ * is done.
+ * So an async extent can unlock the range at any random timing.
+ *
+ * This will need a rework on the async extent lifespan (mark writeback
+ * and do compression) before deprecating this flag.
+ */
btrfs_bitmap_nr_locked,
btrfs_bitmap_nr_max
};
@@ -42,7 +65,7 @@ enum {
* Structure to trace status of each sector inside a page, attached to
* page::private for both data and metadata inodes.
*/
-struct btrfs_subpage {
+struct btrfs_folio_state {
/* Common members for both data and metadata pages */
spinlock_t lock;
union {
@@ -50,7 +73,7 @@ struct btrfs_subpage {
* Structures only used by metadata
*
* @eb_refs should only be operated under private_lock, as it
- * manages whether the subpage can be detached.
+ * manages whether the btrfs_folio_state can be detached.
*/
atomic_t eb_refs;
@@ -64,29 +87,44 @@ struct btrfs_subpage {
unsigned long bitmaps[];
};
-enum btrfs_subpage_type {
+enum btrfs_folio_type {
BTRFS_SUBPAGE_METADATA,
BTRFS_SUBPAGE_DATA,
};
-#if PAGE_SIZE > SZ_4K
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
-#else
+/*
+ * Subpage support for metadata is more complex, as we can have dummy extent
+ * buffers, where folios have no mapping to determine the owning inode.
+ *
+ * Thankfully we only need to check if node size is smaller than page size.
+ * Even with larger folio support, we will only allocate a folio as large as
+ * node size.
+ * Thus if nodesize < PAGE_SIZE, we know metadata needs need to subpage routine.
+ */
+static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info)
+{
+ return fs_info->nodesize < PAGE_SIZE;
+}
static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info,
- struct address_space *mapping)
+ struct folio *folio)
{
- return false;
+ if (folio->mapping && folio->mapping->host)
+ ASSERT(is_data_inode(BTRFS_I(folio->mapping->host)));
+ return fs_info->sectorsize < folio_size(folio);
}
-#endif
-int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct folio *folio, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
+int btrfs_attach_folio_state(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, enum btrfs_folio_type type);
+void btrfs_detach_folio_state(const struct btrfs_fs_info *fs_info, struct folio *folio,
+ enum btrfs_folio_type type);
/* Allocate additional data where page represents more than one sector */
-struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
- enum btrfs_subpage_type type);
-void btrfs_free_subpage(struct btrfs_subpage *subpage);
+struct btrfs_folio_state *btrfs_alloc_folio_state(const struct btrfs_fs_info *fs_info,
+ size_t fsize, enum btrfs_folio_type type);
+static inline void btrfs_free_folio_state(struct btrfs_folio_state *bfs)
+{
+ kfree(bfs);
+}
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
@@ -110,6 +148,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
* btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
+ *
+ * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios.
+ *
+ * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there
+ * is no clamp version for metadata helpers, as we either go subpage
+ * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE,
+ * and our folio is never larger than nodesize).
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
@@ -129,7 +174,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct folio *folio, u64 start, u32 len); \
bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct folio *folio, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \
+void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \
+bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -155,6 +203,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
struct folio *folio, u64 start, u32 len);
+bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb);
void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info,
struct folio *folio,
unsigned long *ret_bitmap);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index dc4fee519ca6..a262b494a89f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -84,10 +84,13 @@ struct btrfs_fs_context {
u32 thread_pool_size;
unsigned long long mount_opt;
unsigned long compress_type:4;
- unsigned int compress_level;
+ int compress_level;
refcount_t refs;
};
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old);
+
enum {
Opt_acl,
Opt_clear_cache,
@@ -125,7 +128,6 @@ enum {
/* Rescue options */
Opt_rescue,
Opt_usebackuproot,
- Opt_nologreplay,
/* Debugging options */
Opt_enospc_debug,
@@ -246,8 +248,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
/* Rescue options. */
fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
- /* Deprecated, with alias rescue=nologreplay */
- __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
__fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
/* For compatibility only, alias for "rescue=nologreplay". */
@@ -264,10 +264,65 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = {
{}
};
-/* No support for restricting writes to btrfs devices yet... */
-static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
+static bool btrfs_match_compress_type(const char *string, const char *type, bool may_have_level)
{
- return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
+ const int len = strlen(type);
+
+ return (strncmp(string, type, len) == 0) &&
+ ((may_have_level && string[len] == ':') || string[len] == '\0');
+}
+
+static int btrfs_parse_compress(struct btrfs_fs_context *ctx,
+ const struct fs_parameter *param, int opt)
+{
+ const char *string = param->string;
+
+ /*
+ * Provide the same semantics as older kernels that don't use fs
+ * context, specifying the "compress" option clears "force-compress"
+ * without the need to pass "compress-force=[no|none]" before
+ * specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zlib", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "lzo", false)) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "zstd", true)) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level = btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (btrfs_match_compress_type(string, "no", false) ||
+ btrfs_match_compress_type(string, "none", false)) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s", string);
+ return -EINVAL;
+ }
+ return 0;
}
static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
@@ -306,10 +361,9 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_device: {
struct btrfs_device *device;
- blk_mode_t mode = btrfs_open_mode(fc);
mutex_lock(&uuid_mutex);
- device = btrfs_scan_one_device(param->string, mode, false);
+ device = btrfs_scan_one_device(param->string, false);
mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
return PTR_ERR(device);
@@ -339,53 +393,8 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
fallthrough;
case Opt_compress:
case Opt_compress_type:
- /*
- * Provide the same semantics as older kernels that don't use fs
- * context, specifying the "compress" option clears
- * "force-compress" without the need to pass
- * "compress-force=[no|none]" before specifying "compress".
- */
- if (opt != Opt_compress_force && opt != Opt_compress_force_type)
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
-
- if (opt == Opt_compress || opt == Opt_compress_force) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zlib", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZLIB;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "lzo", 3) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_LZO;
- ctx->compress_level = 0;
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "zstd", 4) == 0) {
- ctx->compress_type = BTRFS_COMPRESS_ZSTD;
- ctx->compress_level =
- btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
- param->string + 4);
- btrfs_set_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, NODATACOW);
- btrfs_clear_opt(ctx->mount_opt, NODATASUM);
- } else if (strncmp(param->string, "no", 2) == 0) {
- ctx->compress_level = 0;
- ctx->compress_type = 0;
- btrfs_clear_opt(ctx->mount_opt, COMPRESS);
- btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
- } else {
- btrfs_err(NULL, "unrecognized compression value %s",
- param->string);
+ if (btrfs_parse_compress(ctx, param, opt))
return -EINVAL;
- }
break;
case Opt_ssd:
if (result.negated) {
@@ -449,11 +458,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
else
btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
break;
- case Opt_nologreplay:
- btrfs_warn(NULL,
- "'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
- break;
case Opt_norecovery:
btrfs_info(NULL,
"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'");
@@ -569,6 +573,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_commit_interval:
ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) {
+ btrfs_warn(NULL, "excessive commit interval %u, use with care",
+ ctx->commit_interval);
+ }
if (ctx->commit_interval == 0)
ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
break;
@@ -693,12 +701,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
- btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
- if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
- btrfs_info(info, "using free-space-tree");
}
return ret;
@@ -947,14 +952,14 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
static int btrfs_fill_super(struct super_block *sb,
struct btrfs_fs_devices *fs_devices)
{
- struct inode *inode;
+ struct btrfs_inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- int err;
+ int ret;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_magic = BTRFS_SUPER_MAGIC;
sb->s_op = &btrfs_super_ops;
- sb->s_d_op = &btrfs_dentry_operations;
+ set_default_d_op(sb, &btrfs_dentry_operations);
sb->s_export_op = &btrfs_export_ops;
#ifdef CONFIG_FS_VERITY
sb->s_vop = &btrfs_verityops;
@@ -963,28 +968,30 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_time_gran = 1;
sb->s_iflags |= SB_I_CGROUPWB | SB_I_ALLOW_HSM;
- err = super_setup_bdi(sb);
- if (err) {
+ ret = super_setup_bdi(sb);
+ if (ret) {
btrfs_err(fs_info, "super_setup_bdi failed");
- return err;
+ return ret;
}
- err = open_ctree(sb, fs_devices);
- if (err) {
- btrfs_err(fs_info, "open_ctree failed: %d", err);
- return err;
+ ret = open_ctree(sb, fs_devices);
+ if (ret) {
+ btrfs_err(fs_info, "open_ctree failed: %d", ret);
+ return ret;
}
+ btrfs_emit_options(fs_info, NULL);
+
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
- err = PTR_ERR(inode);
- btrfs_handle_fs_error(fs_info, err, NULL);
+ ret = PTR_ERR(inode);
+ btrfs_handle_fs_error(fs_info, ret, NULL);
goto fail_close;
}
- sb->s_root = d_make_root(inode);
+ sb->s_root = d_make_root(&inode->vfs_inode);
if (!sb->s_root) {
- err = -ENOMEM;
+ ret = -ENOMEM;
goto fail_close;
}
@@ -993,7 +1000,7 @@ static int btrfs_fill_super(struct super_block *sb,
fail_close:
close_ctree(fs_info);
- return err;
+ return ret;
}
int btrfs_sync_fs(struct super_block *sb, int wait)
@@ -1139,8 +1146,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
subvol_name = btrfs_get_subvol_name_from_objectid(info,
btrfs_root_id(BTRFS_I(d_inode(dentry))->root));
if (!IS_ERR(subvol_name)) {
- seq_puts(seq, ",subvol=");
- seq_escape(seq, subvol_name, " \t\n\\");
+ seq_show_option(seq, "subvol", subvol_name);
kfree(subvol_name);
}
return 0;
@@ -1149,11 +1155,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
/*
* subvolumes are identified by ino 256
*/
-static inline int is_subvolume_inode(struct inode *inode)
+static inline bool is_subvolume_inode(struct inode *inode)
{
if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
- return 1;
- return 0;
+ return true;
+ return false;
}
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
@@ -1433,7 +1439,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
- btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@@ -1455,10 +1461,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
+ btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
- btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
@@ -1831,10 +1838,9 @@ static int btrfs_get_tree_super(struct fs_context *fc)
struct btrfs_fs_info *fs_info = fc->s_fs_info;
struct btrfs_fs_context *ctx = fc->fs_private;
struct btrfs_fs_devices *fs_devices = NULL;
- struct block_device *bdev;
struct btrfs_device *device;
struct super_block *sb;
- blk_mode_t mode = btrfs_open_mode(fc);
+ blk_mode_t mode = sb_open_mode(fc->sb_flags);
int ret;
btrfs_ctx_to_info(fs_info, ctx);
@@ -1844,47 +1850,60 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* With 'true' passed to btrfs_scan_one_device() (mount time) we expect
* either a valid device or an error.
*/
- device = btrfs_scan_one_device(fc->source, mode, true);
+ device = btrfs_scan_one_device(fc->source, true);
ASSERT(device != NULL);
if (IS_ERR(device)) {
mutex_unlock(&uuid_mutex);
return PTR_ERR(device);
}
-
fs_devices = device->fs_devices;
+ /*
+ * We cannot hold uuid_mutex calling sget_fc(), it will lead to a
+ * locking order reversal with s_umount.
+ *
+ * So here we increase the holding number of fs_devices, this will ensure
+ * the fs_devices itself won't be freed.
+ */
+ btrfs_fs_devices_inc_holding(fs_devices);
fs_info->fs_devices = fs_devices;
-
- ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
mutex_unlock(&uuid_mutex);
- if (ret)
- return ret;
- if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto error;
- }
-
- bdev = fs_devices->latest_dev->bdev;
- /*
- * From now on the error handling is not straightforward.
- *
- * If successful, this will transfer the fs_info into the super block,
- * and fc->s_fs_info will be NULL. However if there's an existing
- * super, we'll still have fc->s_fs_info populated. If we error
- * completely out it'll be cleaned up when we drop the fs_context,
- * otherwise it's tied to the lifetime of the super_block.
- */
sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
if (IS_ERR(sb)) {
- ret = PTR_ERR(sb);
- goto error;
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ /*
+ * Since the fs_devices is not opened, it can be freed at any
+ * time after unlocking uuid_mutex. We need to avoid double
+ * free through put_fs_context()->btrfs_free_fs_info().
+ * So here we reset fs_info->fs_devices to NULL, and let the
+ * regular fs_devices reclaim path to handle it.
+ *
+ * This applies to all later branches where no fs_devices is
+ * opened.
+ */
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(sb);
}
set_device_specific_options(fs_info);
if (sb->s_root) {
- btrfs_close_devices(fs_devices);
+ /*
+ * Not the first mount of the fs thus got an existing super block.
+ * Will reuse the returned super block, fs_info and fs_devices.
+ *
+ * fc->s_fs_info is not touched and will be later freed by
+ * put_fs_context() through btrfs_free_fs_context().
+ */
+ ASSERT(fc->s_fs_info == fs_info);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
/*
* At this stage we may have RO flag mismatch between
* fc->sb_flags and sb->s_flags. Caller should detect such
@@ -1892,9 +1911,32 @@ static int btrfs_get_tree_super(struct fs_context *fc)
* needed.
*/
} else {
+ struct block_device *bdev;
+
+ /*
+ * The first mount of the fs thus a new superblock, fc->s_fs_info
+ * must be NULL, and the ownership of our fs_info and fs_devices is
+ * transferred to the super block.
+ */
+ ASSERT(fc->s_fs_info == NULL);
+
+ mutex_lock(&uuid_mutex);
+ btrfs_fs_devices_dec_holding(fs_devices);
+ ret = btrfs_open_devices(fs_devices, mode, sb);
+ if (ret < 0)
+ fs_info->fs_devices = NULL;
+ mutex_unlock(&uuid_mutex);
+ if (ret < 0) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ deactivate_locked_super(sb);
+ return -EACCES;
+ }
+ bdev = fs_devices->latest_dev->bdev;
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
- btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
ret = btrfs_fill_super(sb, fs_devices);
if (ret) {
deactivate_locked_super(sb);
@@ -1906,10 +1948,6 @@ static int btrfs_get_tree_super(struct fs_context *fc)
fc->root = dget(sb->s_root);
return 0;
-
-error:
- btrfs_close_devices(fs_devices);
- return ret;
}
/*
@@ -1985,17 +2023,13 @@ error:
* btrfs or not, setting the whole super block RO. To make per-subvolume mounting
* work with different options work we need to keep backward compatibility.
*/
-static int btrfs_reconfigure_for_mount(struct fs_context *fc, struct vfsmount *mnt)
+static int btrfs_reconfigure_for_mount(struct fs_context *fc)
{
int ret = 0;
- if (fc->sb_flags & SB_RDONLY)
- return ret;
-
- down_write(&mnt->mnt_sb->s_umount);
- if (!(fc->sb_flags & SB_RDONLY) && (mnt->mnt_sb->s_flags & SB_RDONLY))
+ if (!(fc->sb_flags & SB_RDONLY) && (fc->root->d_sb->s_flags & SB_RDONLY))
ret = btrfs_reconfigure(fc);
- up_write(&mnt->mnt_sb->s_umount);
+
return ret;
}
@@ -2040,25 +2074,18 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
*/
dup_fc->s_fs_info = fs_info;
- /*
- * We'll do the security settings in our btrfs_get_tree_super() mount
- * loop, they were duplicated into dup_fc, we can drop the originals
- * here.
- */
- security_free_mnt_opts(&fc->security);
- fc->security = NULL;
+ ret = btrfs_get_tree_super(dup_fc);
+ if (ret)
+ goto error;
- mnt = fc_mount(dup_fc);
- if (IS_ERR(mnt)) {
- put_fs_context(dup_fc);
- return PTR_ERR(mnt);
- }
- ret = btrfs_reconfigure_for_mount(dup_fc, mnt);
+ ret = btrfs_reconfigure_for_mount(dup_fc);
+ up_write(&dup_fc->root->d_sb->s_umount);
+ if (ret)
+ goto error;
+ mnt = vfs_create_mount(dup_fc);
put_fs_context(dup_fc);
- if (ret) {
- mntput(mnt);
- return ret;
- }
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
/*
* This free's ->subvol_name, because if it isn't set we have to
@@ -2072,25 +2099,15 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
fc->root = dentry;
return 0;
+error:
+ put_fs_context(dup_fc);
+ return ret;
}
static int btrfs_get_tree(struct fs_context *fc)
{
- /*
- * Since we use mount_subtree to mount the default/specified subvol, we
- * have to do mounts in two steps.
- *
- * First pass through we call btrfs_get_tree_subvol(), this is just a
- * wrapper around fc_mount() to call back into here again, and this time
- * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
- * everything to open the devices and file system. Then we return back
- * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
- * from there we can do our mount_subvol() call, which will lookup
- * whichever subvol we're mounting and setup this fc with the
- * appropriate dentry for the subvol.
- */
- if (fc->s_fs_info)
- return btrfs_get_tree_super(fc);
+ ASSERT(fc->s_fs_info == NULL);
+
return btrfs_get_tree_subvol(fc);
}
@@ -2222,7 +2239,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
* Scanning outside of mount can return NULL which would turn
* into 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
ret = PTR_ERR_OR_ZERO(device);
mutex_unlock(&uuid_mutex);
break;
@@ -2240,7 +2257,7 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
* Scanning outside of mount can return NULL which would turn
* into 0 error code.
*/
- device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
+ device = btrfs_scan_one_device(vol->name, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
if (IS_ERR(device))
@@ -2293,7 +2310,7 @@ static int check_dev_super(struct btrfs_device *dev)
return 0;
/* Only need to check the primary super block. */
- sb = btrfs_read_dev_one_super(dev->bdev, 0, true);
+ sb = btrfs_read_disk_super(dev->bdev, 0, true);
if (IS_ERR(sb))
return PTR_ERR(sb);
@@ -2526,8 +2543,8 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_free_space_init,
.exit_func = btrfs_free_space_exit,
}, {
- .init_func = extent_state_init_cachep,
- .exit_func = extent_state_free_cachep,
+ .init_func = btrfs_extent_state_init_cachep,
+ .exit_func = btrfs_extent_state_free_cachep,
}, {
.init_func = extent_buffer_init_cachep,
.exit_func = extent_buffer_free_cachep,
@@ -2535,8 +2552,8 @@ static const struct init_sequence mod_init_seq[] = {
.init_func = btrfs_bioset_init,
.exit_func = btrfs_bioset_exit,
}, {
- .init_func = extent_map_init,
- .exit_func = extent_map_exit,
+ .init_func = btrfs_extent_map_init,
+ .exit_func = btrfs_extent_map_exit,
#ifdef CONFIG_BTRFS_EXPERIMENTAL
}, {
.init_func = btrfs_read_policy_init,
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 53b846d99ece..9d398f7a36ad 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -160,8 +160,7 @@ static int can_modify_feature(struct btrfs_feature_attr *fa)
clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
break;
default:
- pr_warn("btrfs: sysfs: unknown feature set %d\n",
- fa->feature_set);
+ btrfs_warn(NULL, "sysfs: unknown feature set %d", fa->feature_set);
return 0;
}
@@ -411,7 +410,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
- /* An artificial limit to only support 4K and PAGE_SIZE */
+ if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE)
+ ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE);
if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
@@ -1137,13 +1137,21 @@ static ssize_t btrfs_commit_stats_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ u64 now = ktime_get_ns();
+ u64 start_time = fs_info->commit_stats.critical_section_start_time;
+ u64 pending = 0;
+
+ if (start_time)
+ pending = now - start_time;
return sysfs_emit(buf,
"commits %llu\n"
+ "cur_commit_ms %llu\n"
"last_commit_ms %llu\n"
"max_commit_ms %llu\n"
"total_commit_ms %llu\n",
fs_info->commit_stats.commit_count,
+ div_u64(pending, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC),
div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC));
@@ -1201,7 +1209,7 @@ static ssize_t quota_override_store(struct kobject *kobj,
{
struct btrfs_fs_info *fs_info = to_fs_info(kobj);
unsigned long knob;
- int err;
+ int ret;
if (!fs_info)
return -EPERM;
@@ -1209,9 +1217,9 @@ static ssize_t quota_override_store(struct kobject *kobj,
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- err = kstrtoul(buf, 10, &knob);
- if (err)
- return err;
+ ret = kstrtoul(buf, 10, &knob);
+ if (ret)
+ return ret;
if (knob > 1)
return -EINVAL;
@@ -1330,29 +1338,30 @@ MODULE_PARM_DESC(read_policy,
int btrfs_read_policy_to_enum(const char *str, s64 *value_ret)
{
- char param[32] = { 0 };
+ char param[32];
char __maybe_unused *value_str;
if (!str || strlen(str) == 0)
return 0;
- strncpy(param, str, sizeof(param) - 1);
+ strscpy(param, str);
#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Separate value from input in policy:value format. */
value_str = strchr(param, ':');
if (value_str) {
- int ret;
+ char *retptr;
*value_str = 0;
value_str++;
if (!value_ret)
return -EINVAL;
- ret = kstrtos64(value_str, 10, value_ret);
- if (ret)
+
+ *value_ret = memparse(value_str, &retptr);
+ /* There could be any trailing typos after the value. */
+ retptr = skip_spaces(retptr);
+ if (*retptr != 0 || *value_ret <= 0)
return -EINVAL;
- if (*value_ret < 0)
- return -ERANGE;
}
#endif
@@ -1928,16 +1937,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info)
kobject_put(&space_info->kobj);
}
-static const char *alloc_name(u64 flags)
+static const char *alloc_name(struct btrfs_space_info *space_info)
{
+ u64 flags = space_info->flags;
+
switch (flags) {
case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
return "mixed";
case BTRFS_BLOCK_GROUP_METADATA:
- return "metadata";
+ switch (space_info->subgroup_id) {
+ case BTRFS_SUB_GROUP_PRIMARY:
+ return "metadata";
+ case BTRFS_SUB_GROUP_TREELOG:
+ return "metadata-treelog";
+ default:
+ WARN_ON_ONCE(1);
+ return "metadata (unknown sub-group)";
+ }
case BTRFS_BLOCK_GROUP_DATA:
- return "data";
+ switch (space_info->subgroup_id) {
+ case BTRFS_SUB_GROUP_PRIMARY:
+ return "data";
+ case BTRFS_SUB_GROUP_DATA_RELOC:
+ return "data-reloc";
+ default:
+ WARN_ON_ONCE(1);
+ return "data (unknown sub-group)";
+ }
case BTRFS_BLOCK_GROUP_SYSTEM:
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
return "system";
default:
WARN_ON(1);
@@ -1956,7 +1984,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info,
ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
fs_info->space_info_kobj, "%s",
- alloc_name(space_info->flags));
+ alloc_name(space_info));
if (ret) {
kobject_put(&space_info->kobj);
return ret;
@@ -2218,7 +2246,7 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action)
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
- pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
+ btrfs_warn(NULL, "sending event %d to kobject: '%s' (%p): failed",
action, kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
}
@@ -2261,15 +2289,15 @@ static struct kset *btrfs_kset;
*/
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
{
- int error;
+ int ret;
init_completion(&fs_devs->kobj_unregister);
fs_devs->fsid_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
- "%pU", fs_devs->fsid);
- if (error) {
+ ret = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_devs->fsid);
+ if (ret) {
kobject_put(&fs_devs->fsid_kobj);
- return error;
+ return ret;
}
fs_devs->devices_kobj = kobject_create_and_add("devices",
@@ -2295,71 +2323,70 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs)
int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
- int error;
+ int ret;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
- error = btrfs_sysfs_add_fs_devices(fs_devs);
- if (error)
- return error;
+ ret = btrfs_sysfs_add_fs_devices(fs_devs);
+ if (ret)
+ return ret;
- error = sysfs_create_files(fsid_kobj, btrfs_attrs);
- if (error) {
+ ret = sysfs_create_files(fsid_kobj, btrfs_attrs);
+ if (ret) {
btrfs_sysfs_remove_fs_devices(fs_devs);
- return error;
+ return ret;
}
- error = sysfs_create_group(fsid_kobj,
- &btrfs_feature_attr_group);
- if (error)
+ ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+ if (ret)
goto failure;
#ifdef CONFIG_BTRFS_DEBUG
fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj);
if (!fs_info->debug_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs);
+ if (ret)
goto failure;
#endif
/* Discard directory */
fs_info->discard_kobj = kobject_create_and_add("discard", fsid_kobj);
if (!fs_info->discard_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->discard_kobj, discard_attrs);
+ if (ret)
goto failure;
- error = addrm_unknown_feature_attrs(fs_info, true);
- if (error)
+ ret = addrm_unknown_feature_attrs(fs_info, true);
+ if (ret)
goto failure;
- error = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
- if (error)
+ ret = sysfs_create_link(fsid_kobj, &fs_info->sb->s_bdi->dev->kobj, "bdi");
+ if (ret)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
fsid_kobj);
if (!fs_info->space_info_kobj) {
- error = -ENOMEM;
+ ret = -ENOMEM;
goto failure;
}
- error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
- if (error)
+ ret = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (ret)
goto failure;
return 0;
failure:
btrfs_sysfs_remove_mounted(fs_info);
- return error;
+ return ret;
}
static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj,
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 3fc5c6f90dc4..0f94ae923210 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -7,6 +7,7 @@
#include <linux/compiler_types.h>
#include <linux/kobject.h>
+struct block_device;
struct btrfs_fs_info;
struct btrfs_device;
struct btrfs_fs_devices;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 5eff8d7d2360..b576897d71cc 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(fs_info, &dev->alloc_state, 0);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -111,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
static void btrfs_free_dummy_device(struct btrfs_device *dev)
{
- extent_io_tree_release(&dev->alloc_state);
+ btrfs_extent_io_tree_release(&dev->alloc_state);
kfree(dev);
}
@@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
- struct radix_tree_iter iter;
- void **slot;
struct btrfs_device *dev, *tmp;
+ struct extent_buffer *eb;
+ unsigned long index;
if (!fs_info)
return;
@@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
- spin_lock(&fs_info->buffer_lock);
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
- struct extent_buffer *eb;
-
- eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
- if (!eb)
- continue;
- /* Shouldn't happen but that kind of thinking creates CVE's */
- if (radix_tree_exception(eb)) {
- if (radix_tree_deref_retry(eb))
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- slot = radix_tree_iter_resume(slot, &iter);
- spin_unlock(&fs_info->buffer_lock);
- free_extent_buffer_stale(eb);
- spin_lock(&fs_info->buffer_lock);
+ xa_lock_irq(&fs_info->buffer_tree);
+ xa_for_each(&fs_info->buffer_tree, index, eb) {
+ xa_unlock_irq(&fs_info->buffer_tree);
+ free_extent_buffer(eb);
+ xa_lock_irq(&fs_info->buffer_tree);
}
- spin_unlock(&fs_info->buffer_lock);
+ xa_unlock_irq(&fs_info->buffer_tree);
btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c
index 6558508c2ddf..265370e79a54 100644
--- a/fs/btrfs/tests/delayed-refs-tests.c
+++ b/fs/btrfs/tests/delayed-refs-tests.c
@@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize)
if (!ret)
ret = select_delayed_refs_test(&trans);
+ kfree(transaction);
out_free_fs_info:
btrfs_free_dummy_fs_info(fs_info);
return ret;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 0a2dbfaaf49e..b19328d077d3 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -14,17 +14,17 @@
#include "../disk-io.h"
#include "../btrfs_inode.h"
-#define PROCESS_UNLOCK (1 << 0)
-#define PROCESS_RELEASE (1 << 1)
-#define PROCESS_TEST_LOCKED (1 << 2)
+#define PROCESS_UNLOCK (1U << 0)
+#define PROCESS_RELEASE (1U << 1)
+#define PROCESS_TEST_LOCKED (1U << 2)
static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
unsigned long flags)
{
int ret;
struct folio_batch fbatch;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
+ pgoff_t index = start >> PAGE_SHIFT;
+ pgoff_t end_index = end >> PAGE_SHIFT;
int i;
int count = 0;
int loops = 0;
@@ -74,9 +74,9 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest)
dest[0] = 0;
PRINT_ONE_FLAG(state, dest, cur, DIRTY);
- PRINT_ONE_FLAG(state, dest, cur, UPTODATE);
PRINT_ONE_FLAG(state, dest, cur, LOCKED);
- PRINT_ONE_FLAG(state, dest, cur, NEW);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG1);
+ PRINT_ONE_FLAG(state, dest, cur, DIRTY_LOG2);
PRINT_ONE_FLAG(state, dest, cur, DELALLOC);
PRINT_ONE_FLAG(state, dest, cur, DEFRAG);
PRINT_ONE_FLAG(state, dest, cur, BOUNDARY);
@@ -114,7 +114,6 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
struct extent_io_tree *tmp;
struct page *page;
struct page *locked_page = NULL;
- unsigned long index = 0;
/* In this test we need at least 2 file extents at its maximum size */
u64 max_bytes = BTRFS_MAX_EXTENT_SIZE;
u64 total_dirty = 2 * max_bytes;
@@ -150,14 +149,14 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
- extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
+ btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
/*
* First go through and create and mark all of our pages dirty, we pin
* everything to make sure our pages don't get evicted and screw up our
* test.
*/
- for (index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
+ for (pgoff_t index = 0; index < (total_dirty >> PAGE_SHIFT); index++) {
page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
if (!page) {
test_err("failed to allocate test page");
@@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
* |--- delalloc ---|
* |--- search ---|
*/
- set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL);
start = 0;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
sectorsize - 1, start, end);
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
unlock_page(locked_page);
put_page(locked_page);
@@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("couldn't find the locked page");
goto out_bits;
}
- set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("there were unlocked pages in the range");
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
/* locked_page was unlocked above */
put_page(locked_page);
@@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
*
* We are re-using our test_start from above since it works out well.
*/
- set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
+ btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL);
start = test_start;
end = start + PAGE_SIZE - 1;
found = find_lock_delalloc_range(inode, page_folio(locked_page), &start,
@@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
test_err("pages in range were not all locked");
goto out_bits;
}
- unlock_extent(tmp, start, end, NULL);
+ btrfs_unlock_extent(tmp, start, end, NULL);
/*
* Now to test where we run into a page that is no longer dirty in the
@@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize)
out_bits:
if (ret)
dump_extent_io_tree(tmp);
- clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1);
+ btrfs_clear_extent_bit(tmp, 0, total_dirty - 1, (unsigned)-1, NULL);
out:
if (locked_page)
put_page(locked_page);
@@ -344,11 +343,11 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
unsigned long i;
for (i = 0; i < eb->len * BITS_PER_BYTE; i++) {
- int bit, bit1;
+ bool bit_set, bit1_set;
- bit = !!test_bit(i, bitmap);
- bit1 = !!extent_buffer_test_bit(eb, 0, i);
- if (bit1 != bit) {
+ bit_set = test_bit(i, bitmap);
+ bit1_set = extent_buffer_test_bit(eb, 0, i);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -361,9 +360,9 @@ static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb)
return -EINVAL;
}
- bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
- i % BITS_PER_BYTE);
- if (bit1 != bit) {
+ bit1_set = extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+ i % BITS_PER_BYTE);
+ if (bit1_set != bit_set) {
u8 has;
u8 expect;
@@ -525,7 +524,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, 0);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -542,7 +541,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
* Test again for case where the tree block is sectorsize aligned but
* not nodesize aligned.
*/
- eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, sectorsize);
if (!eb) {
test_std_err(TEST_ALLOC_ROOT);
ret = -ENOMEM;
@@ -565,10 +564,10 @@ static int test_find_first_clear_extent_bit(void)
test_msg("running find_first_clear_extent_bit test");
- extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
+ btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
/* Test correct handling of empty tree */
- find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
if (start != 0 || end != -1) {
test_err(
"error getting a range from completely empty tree: start %llu end %llu",
@@ -579,11 +578,11 @@ static int test_find_first_clear_extent_bit(void)
* Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between
* 4M-32M
*/
- set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
+ btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
- find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != 0 || end != SZ_1M - 1) {
test_err("error finding beginning range: start %llu end %llu",
@@ -592,14 +591,14 @@ static int test_find_first_clear_extent_bit(void)
}
/* Now add 32M-64M so that we have a hole between 4M-32M */
- set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
- CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
+ btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
/*
* Request first hole starting at 12M, we should get 4M-32M
*/
- find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1) {
test_err("error finding trimmed range: start %llu end %llu",
@@ -611,8 +610,8 @@ static int test_find_first_clear_extent_bit(void)
* Search in the middle of allocated range, should get the next one
* available, which happens to be unallocated -> 4M-32M
*/
- find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
- CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
if (start != SZ_4M || end != SZ_32M - 1) {
test_err("error finding next unalloc range: start %llu end %llu",
@@ -624,9 +623,9 @@ static int test_find_first_clear_extent_bit(void)
* Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag
* being unset in this range, we should get the entry in range 64M-72M
*/
- set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
- find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
- CHUNK_TRIMMED);
+ btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+ CHUNK_TRIMMED);
if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
test_err("error finding exact range: start %llu end %llu",
@@ -634,8 +633,8 @@ static int test_find_first_clear_extent_bit(void)
goto out;
}
- find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
- CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+ CHUNK_TRIMMED);
/*
* Search in the middle of set range whose immediate neighbour doesn't
@@ -651,7 +650,7 @@ static int test_find_first_clear_extent_bit(void)
* Search beyond any known range, shall return after last known range
* and end should be -1
*/
- find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+ btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
if (start != SZ_64M + SZ_8M || end != -1) {
test_err(
"error handling beyond end of range search: start %llu end %llu",
@@ -663,7 +662,7 @@ static int test_find_first_clear_extent_bit(void)
out:
if (ret)
dump_extent_io_tree(&tree);
- clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ btrfs_clear_extent_bit(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL);
return ret;
}
@@ -730,7 +729,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize)
goto out;
}
- eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize);
+ eb = alloc_dummy_extent_buffer(fs_info, SZ_1M);
if (!eb) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 56e61ac1cc64..3a86534c116f 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
while (!RB_EMPTY_ROOT(&em_tree->root)) {
node = rb_first(&em_tree->root);
em = rb_entry(node, struct extent_map, rb_node);
- remove_extent_mapping(inode, em);
+ btrfs_remove_extent_mapping(inode, em);
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
@@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode)
refcount_set(&em->refs, 1);
}
#endif
- free_extent_map(em);
+ btrfs_free_extent_map(em);
}
write_unlock(&em_tree->lock);
@@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [0, 16K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Add [16K, 20K) following [0, 16K) */
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [16K, 20K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K ||
em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu",
@@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [0, 1K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Add [4K, 8K) following [0, 1K) */
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("cannot add extent range [4K, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = -ENOENT;
goto out;
}
- if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K ||
em->disk_bytenr != EXTENT_MAP_INLINE) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu",
ret, em->start, em->len, em->disk_bytenr);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [4K, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (start < em->start || start + len > extent_map_end(em) ||
- em->start != extent_map_block_start(em)) {
+ if (start < em->start || start + len > btrfs_extent_map_end(em) ||
+ em->start != btrfs_extent_map_block_start(em)) {
test_err(
"case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [0, 8K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
test_err("cannot add extent range [8K, 32K)");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
ret = -ENOENT;
goto out;
}
- if (start < em->start || start + len > extent_map_end(em)) {
+ if (start < em->start || start + len > btrfs_extent_map_end(em)) {
test_err(
"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)",
start, start + len, ret, em->start, em->len,
em->disk_bytenr, em->disk_num_bytes);
ret = -EINVAL;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
out:
ret2 = free_extent_map_tree(inode);
if (ret == 0)
@@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode,
struct extent_map *em;
int ret;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode,
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("cannot add extent map [%llu, %llu)", start, start + len);
return ret;
@@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
if (ret)
goto out;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
}
ret = 0;
out:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
ret2 = free_extent_map_tree(inode);
if (ret == 0)
ret = ret2;
@@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_msg("Running btrfs_drop_extent_cache with pinned");
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("couldn't add extent map");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
test_err("couldn't add extent map");
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/*
* Drop [0, 36K) This should skip the [0, 4K) extent and then split the
@@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
/* Make sure our extent maps look sane. */
ret = -EINVAL;
- em = lookup_extent_mapping(em_tree, 0, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K);
if (!em) {
test_err("didn't find an em at 0 as expected");
goto out;
@@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K);
read_unlock(&em_tree->lock);
if (em) {
test_err("found an em when we weren't expecting one");
@@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
}
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
+ em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K);
read_unlock(&em_tree->lock);
if (!em) {
test_err("didn't find an em at 32K as expected");
@@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
goto out;
}
- if (extent_map_block_start(em) != SZ_32K + SZ_4K) {
+ if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) {
test_err("em->block_start is %llu, expected 36K",
- extent_map_block_start(em));
+ btrfs_extent_map_block_start(em));
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
+ em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1);
read_unlock(&em_tree->lock);
if (em) {
test_err("found an unexpected em above 48K");
@@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
ret = 0;
out:
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Unpin our extent to prevent warning when removing it below. */
- ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0);
+ ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0);
if (ret == 0)
ret = ret2;
ret2 = free_extent_map_tree(inode);
@@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
int ret;
int ret2;
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
return -ENOMEM;
@@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [120K, 128K)");
goto out;
}
- em = alloc_extent_map();
+ em = btrfs_alloc_extent_map();
if (!em) {
test_std_err(TEST_ALLOC_EXTENT_MAP);
ret = -ENOMEM;
@@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode)
write_lock(&em_tree->lock);
ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K);
write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_extent_map(em);
if (ret < 0) {
test_err("couldn't add extent map for range [108K, 144K)");
goto out;
@@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
test_err("error adding chunk map to mapping tree");
+ btrfs_free_chunk_map(map);
goto out_free;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index b61972046feb..c8822edd32e2 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -32,7 +32,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
unsigned int i;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
ret = PTR_ERR(info);
@@ -57,7 +57,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans,
goto invalid;
offset = key.objectid;
while (offset < key.objectid + key.offset) {
- bit = free_space_test_bit(cache, path, offset);
+ bit = btrfs_free_space_test_bit(cache, path, offset);
if (prev_bit == 0 && bit == 1) {
extent_start = offset;
} else if (prev_bit == 1 && bit == 0) {
@@ -115,7 +115,7 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
- info = search_free_space_info(trans, cache, path, 0);
+ info = btrfs_search_free_space_info(trans, cache, path, 0);
if (IS_ERR(info)) {
test_err("could not find free space info");
btrfs_release_path(path);
@@ -131,13 +131,13 @@ static int check_free_space_extents(struct btrfs_trans_handle *trans,
/* Flip it to the other format and check that for good measure. */
if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
- ret = convert_free_space_to_extents(trans, cache, path);
+ ret = btrfs_convert_free_space_to_extents(trans, cache, path);
if (ret) {
test_err("could not convert to extents");
return ret;
}
} else {
- ret = convert_free_space_to_bitmaps(trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(trans, cache, path);
if (ret) {
test_err("could not convert to bitmaps");
return ret;
@@ -170,9 +170,8 @@ static int test_remove_all(struct btrfs_trans_handle *trans,
const struct free_space_extent extents[] = {};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start,
- cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -193,8 +192,8 @@ static int test_remove_beginning(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -216,7 +215,7 @@ static int test_remove_end(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
cache->start + cache->length - alignment,
alignment);
if (ret) {
@@ -240,9 +239,9 @@ static int test_remove_middle(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start + alignment,
+ alignment);
if (ret) {
test_err("could not remove free space");
return ret;
@@ -263,23 +262,22 @@ static int test_merge_left(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -300,24 +298,23 @@ static int test_merge_right(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -338,29 +335,29 @@ static int test_merge_both(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -383,29 +380,29 @@ static int test_merge_none(struct btrfs_trans_handle *trans,
};
int ret;
- ret = __remove_from_free_space_tree(trans, cache, path,
- cache->start, cache->length);
+ ret = __btrfs_remove_from_free_space_tree(trans, cache, path,
+ cache->start, cache->length);
if (ret) {
test_err("could not remove free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path, cache->start,
- alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path, cache->start,
+ alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 4 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 4 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
}
- ret = __add_to_free_space_tree(trans, cache, path,
- cache->start + 2 * alignment, alignment);
+ ret = __btrfs_add_to_free_space_tree(trans, cache, path,
+ cache->start + 2 * alignment, alignment);
if (ret) {
test_err("could not add free space");
return ret;
@@ -483,14 +480,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
goto out;
}
- ret = add_block_group_free_space(&trans, cache);
+ ret = btrfs_add_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not add block group free space");
goto out;
}
if (bitmaps) {
- ret = convert_free_space_to_bitmaps(&trans, cache, path);
+ ret = btrfs_convert_free_space_to_bitmaps(&trans, cache, path);
if (ret) {
test_err("could not convert block group to bitmaps");
goto out;
@@ -501,7 +498,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
if (ret)
goto out;
- ret = remove_block_group_free_space(&trans, cache);
+ ret = btrfs_remove_block_group_free_space(&trans, cache);
if (ret) {
test_err("could not remove block group free space");
goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ea3bc2225fe..a4c2b7748b95 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("expected a hole, got %llu", em->disk_bytenr);
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
/*
@@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
* this?
*/
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Regular extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* The next 3 are split extents */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
disk_bytenr += (em->start - orig_start);
- if (extent_map_block_start(em) != disk_bytenr) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr) {
test_err("wrong block start, want %llu, have %llu",
- disk_bytenr, extent_map_block_start(em));
+ disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* The next 3 are a half written prealloc extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start - orig_start, em->offset);
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr + em->offset) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + em->offset, extent_map_block_start(em));
+ disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->offset, orig_start);
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr + em->offset) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) {
test_err("unexpected block start, wanted %llu, have %llu",
- disk_bytenr + em->offset, extent_map_block_start(em));
+ disk_bytenr + em->offset, btrfs_extent_map_block_start(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Now for the compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* Split compressed extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
@@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("wrong offset, want 0, have %llu", em->offset);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
- disk_bytenr = extent_map_block_start(em);
+ disk_bytenr = btrfs_extent_map_block_start(em);
orig_start = em->start;
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (extent_map_block_start(em) != disk_bytenr) {
+ if (btrfs_extent_map_block_start(em) != disk_bytenr) {
test_err("block start does not match, want %llu got %llu",
- disk_bytenr, extent_map_block_start(em));
+ disk_bytenr, btrfs_extent_map_block_start(em));
goto out;
}
if (em->start != offset || em->len != 2 * sectorsize) {
@@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->offset, orig_start);
goto out;
}
- if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
+ if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
+ BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
/* A hole between regular extents but no hole extent */
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize);
@@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M);
if (IS_ERR(em)) {
@@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
offset = em->start + em->len;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);
if (IS_ERR(em)) {
@@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (!IS_ERR(em))
- free_extent_map(em);
+ btrfs_free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
em->flags);
goto out;
}
- free_extent_map(em);
+ btrfs_free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
- if (extent_map_block_start(em) != sectorsize) {
- test_err("expected a real extent, got %llu", extent_map_block_start(em));
+ if (btrfs_extent_map_block_start(em) != sectorsize) {
+ test_err("expected a real extent, got %llu",
+ btrfs_extent_map_block_start(em));
goto out;
}
if (em->start != sectorsize || em->len != sectorsize) {
@@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (!IS_ERR(em))
- free_extent_map(em);
+ btrfs_free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
@@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE >> 1,
- (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
- BTRFS_MAX_EXTENT_SIZE + sectorsize,
- BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE + sectorsize,
+ BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
}
/* Empty */
- ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ ret = btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
if (ret) {
test_err("clear_extent_bit returned %d", ret);
goto out;
@@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
ret = 0;
out:
if (ret)
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
- EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
- EXTENT_UPTODATE, NULL);
+ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW, NULL);
iput(inode);
btrfs_free_dummy_root(root);
btrfs_free_dummy_fs_info(fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 15312013f2a3..c5c0d9cf1a80 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -160,7 +160,13 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
cache = list_first_entry(&transaction->deleted_bgs,
struct btrfs_block_group,
bg_list);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the deleted_bgs list during a transaction abort.
+ */
+ spin_lock(&transaction->fs_info->unused_bgs_lock);
list_del_init(&cache->bg_list);
+ spin_unlock(&transaction->fs_info->unused_bgs_lock);
btrfs_unfreeze_block_group(cache);
btrfs_put_block_group(cache);
}
@@ -191,7 +197,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- extent_io_tree_release(&root->dirty_log_pages);
+ btrfs_extent_io_tree_release(&root->dirty_log_pages);
btrfs_qgroup_clean_swapped_blocks(root);
}
@@ -274,8 +280,10 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
if (TRANS_ABORTED(cur_trans)) {
+ const int abort_error = cur_trans->aborted;
+
spin_unlock(&fs_info->trans_lock);
- return cur_trans->aborted;
+ return abort_error;
}
if (btrfs_blocked_trans_types[cur_trans->state] & type) {
spin_unlock(&fs_info->trans_lock);
@@ -375,10 +383,10 @@ loop:
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
- extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
- IO_TREE_TRANS_DIRTY_PAGES);
- extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
- IO_TREE_FS_PINNED_EXTENTS);
+ btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
+ IO_TREE_TRANS_DIRTY_PAGES);
+ btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
+ IO_TREE_FS_PINNED_EXTENTS);
btrfs_set_fs_generation(fs_info, fs_info->generation + 1);
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@@ -530,15 +538,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
}
}
-static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
+static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
- return 0;
+ return false;
if (type == TRANS_START)
- return 1;
+ return true;
- return 0;
+ return false;
}
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
@@ -753,9 +761,10 @@ got_it:
* value here.
*/
if (do_chunk_alloc && num_bytes) {
- u64 flags = h->block_rsv->space_info->flags;
+ struct btrfs_space_info *space_info = h->block_rsv->space_info;
+ u64 flags = space_info->flags;
- btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
+ btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags),
CHUNK_ALLOC_NO_FORCE);
}
@@ -1120,13 +1129,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
u64 start = 0;
u64 end;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- mark, &cached_state)) {
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ mark, &cached_state)) {
bool wait_writeback = false;
- ret = convert_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT,
- mark, &cached_state);
+ ret = btrfs_convert_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT,
+ mark, &cached_state);
/*
* convert_extent_bit can return -ENOMEM, which is most of the
* time a temporary error. So when it happens, ignore the error
@@ -1147,8 +1156,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
if (!ret)
ret = filemap_fdatawrite_range(mapping, start, end);
if (!ret && wait_writeback)
- ret = filemap_fdatawait_range(mapping, start, end);
- free_extent_state(cached_state);
+ btrfs_btree_wait_writeback_range(fs_info, start, end);
+ btrfs_free_extent_state(cached_state);
if (ret)
break;
cached_state = NULL;
@@ -1167,14 +1176,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
- struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct extent_state *cached_state = NULL;
u64 start = 0;
u64 end;
int ret = 0;
- while (find_first_extent_bit(dirty_pages, start, &start, &end,
- EXTENT_NEED_WAIT, &cached_state)) {
+ while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_NEED_WAIT, &cached_state)) {
/*
* Ignore -ENOMEM errors returned by clear_extent_bit().
* When committing the transaction, we'll remove any entries
@@ -1183,13 +1191,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
* concurrently - we do it only at transaction commit time when
* it's safe to do it (through extent_io_tree_release()).
*/
- ret = clear_extent_bit(dirty_pages, start, end,
- EXTENT_NEED_WAIT, &cached_state);
+ ret = btrfs_clear_extent_bit(dirty_pages, start, end,
+ EXTENT_NEED_WAIT, &cached_state);
if (ret == -ENOMEM)
ret = 0;
if (!ret)
- ret = filemap_fdatawait_range(mapping, start, end);
- free_extent_state(cached_state);
+ btrfs_btree_wait_writeback_range(fs_info, start, end);
+ btrfs_free_extent_state(cached_state);
if (ret)
break;
cached_state = NULL;
@@ -1203,15 +1211,15 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
struct extent_io_tree *dirty_pages)
{
bool errors = false;
- int err;
+ int ret;
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
@@ -1219,22 +1227,22 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
struct btrfs_fs_info *fs_info = log_root->fs_info;
struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
bool errors = false;
- int err;
+ int ret;
ASSERT(btrfs_root_id(log_root) == BTRFS_TREE_LOG_OBJECTID);
- err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
- if ((mark & EXTENT_DIRTY) &&
+ ret = __btrfs_wait_marked_extents(fs_info, dirty_pages);
+ if ((mark & EXTENT_DIRTY_LOG1) &&
test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
errors = true;
- if ((mark & EXTENT_NEW) &&
+ if ((mark & EXTENT_DIRTY_LOG2) &&
test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
errors = true;
- if (errors && !err)
- err = -EIO;
- return err;
+ if (errors && !ret)
+ ret = -EIO;
+ return ret;
}
/*
@@ -1257,7 +1265,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
blk_finish_plug(&plug);
ret2 = btrfs_wait_extents(fs_info, dirty_pages);
- extent_io_tree_release(&trans->transaction->dirty_pages);
+ btrfs_extent_io_tree_release(&trans->transaction->dirty_pages);
if (ret)
return ret;
@@ -1319,7 +1327,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *io_bgs = &trans->transaction->io_bgs;
- struct list_head *next;
struct extent_buffer *eb;
int ret;
@@ -1355,13 +1362,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
struct btrfs_root *root;
- next = fs_info->dirty_cowonly_roots.next;
- list_del_init(next);
- root = list_entry(next, struct btrfs_root, dirty_list);
+
+ root = list_first_entry(&fs_info->dirty_cowonly_roots,
+ struct btrfs_root, dirty_list);
clear_bit(BTRFS_ROOT_DIRTY, &root->state);
+ list_move_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
@@ -1633,7 +1640,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
- struct inode *parent_inode = &pending->dir->vfs_inode;
+ struct btrfs_inode *parent_inode = pending->dir;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *tmp;
@@ -1659,7 +1666,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
* filesystem.
*/
nofs_flags = memalloc_nofs_save();
- pending->error = fscrypt_setup_filename(parent_inode,
+ pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode,
&pending->dentry->d_name, 0,
&fname);
memalloc_nofs_restore(nofs_flags);
@@ -1688,8 +1695,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.objectid = objectid;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
rsv = trans->block_rsv;
trans->block_rsv = &pending->block_rsv;
@@ -1697,16 +1704,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
- parent_root = BTRFS_I(parent_inode)->root;
+ parent_root = parent_inode->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
goto fail;
- cur_time = current_time(parent_inode);
+ cur_time = current_time(&parent_inode->vfs_inode);
/*
* insert the directory item
*/
- ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
+ ret = btrfs_set_inode_index(parent_inode, &index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -1714,7 +1721,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
- btrfs_ino(BTRFS_I(parent_inode)),
+ btrfs_ino(parent_inode),
&fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
@@ -1728,8 +1735,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_create_qgroup(trans, objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, ret);
- goto fail;
+ if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
}
/*
@@ -1815,7 +1824,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_add_root_ref(trans, objectid,
btrfs_root_id(parent_root),
- btrfs_ino(BTRFS_I(parent_inode)), index,
+ btrfs_ino(parent_inode), index,
&fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -1853,18 +1862,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
- BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
+ parent_inode, &key, BTRFS_FT_DIR,
index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
- btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
+ btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
fname.disk_name.len * 2);
- inode_set_mtime_to_ts(parent_inode,
- inode_set_ctime_current(parent_inode));
- ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode));
+ inode_set_mtime_to_ts(&parent_inode->vfs_inode,
+ inode_set_ctime_current(&parent_inode->vfs_inode));
+ ret = btrfs_update_inode_fallback(trans, parent_inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@@ -2094,7 +2103,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+ /*
+ * Not strictly necessary to lock, as no other task will be using a
+ * block_group on the new_bgs list during a transaction abort.
+ */
+ spin_lock(&fs_info->unused_bgs_lock);
list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ spin_unlock(&fs_info->unused_bgs_lock);
}
}
@@ -2149,13 +2165,19 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans)
list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}
-static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
+static void update_commit_stats(struct btrfs_fs_info *fs_info)
{
+ ktime_t now = ktime_get_ns();
+ ktime_t interval = now - fs_info->commit_stats.critical_section_start_time;
+
+ ASSERT(fs_info->commit_stats.critical_section_start_time);
+
fs_info->commit_stats.commit_count++;
fs_info->commit_stats.last_commit_dur = interval;
fs_info->commit_stats.max_commit_dur =
max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
fs_info->commit_stats.total_commit_dur += interval;
+ fs_info->commit_stats.critical_section_start_time = 0;
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
@@ -2164,8 +2186,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_transaction *prev_trans = NULL;
int ret;
- ktime_t start_time;
- ktime_t interval;
ASSERT(refcount_read(&trans->use_count) == 1);
btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
@@ -2256,14 +2276,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&fs_info->transaction_blocked_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
- if (cur_trans->list.prev != &fs_info->trans_list) {
+ if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) {
enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;
if (trans->in_fsync)
want_state = TRANS_STATE_SUPER_COMMITTED;
- prev_trans = list_entry(cur_trans->list.prev,
- struct btrfs_transaction, list);
+ prev_trans = list_prev_entry(cur_trans, list);
if (prev_trans->state < want_state) {
refcount_inc(&prev_trans->use_count);
spin_unlock(&fs_info->trans_lock);
@@ -2299,8 +2318,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* Get the time spent on the work done by the commit thread and not
* the time spent waiting on a previous commit
*/
- start_time = ktime_get_ns();
-
+ fs_info->commit_stats.critical_section_start_time = ktime_get_ns();
extwriter_counter_dec(cur_trans, trans->type);
ret = btrfs_start_delalloc_flush(fs_info);
@@ -2532,6 +2550,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto scrub_continue;
+ update_commit_stats(fs_info);
/*
* We needn't acquire the lock here because there is no other task
* which can change it.
@@ -2540,7 +2559,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
- btrfs_finish_extent_commit(trans);
+ ret = btrfs_finish_extent_commit(trans);
+ if (ret)
+ goto scrub_continue;
if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(fs_info);
@@ -2566,8 +2587,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trace_btrfs_transaction_commit(fs_info);
- interval = ktime_get_ns() - start_time;
-
btrfs_scrub_continue(fs_info);
if (current->journal_info == trans)
@@ -2575,8 +2594,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
kmem_cache_free(btrfs_trans_handle_cachep, trans);
- update_commit_stats(fs_info, interval);
-
return ret;
unlock_reloc:
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 43979891f7c8..0f556f4de3f9 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -191,7 +191,7 @@ static bool check_prev_ino(struct extent_buffer *leaf,
* Only subvolume trees along with their reloc trees need this check.
* Things like log tree doesn't follow this ino requirement.
*/
- if (!is_fstree(btrfs_header_owner(leaf)))
+ if (!btrfs_is_fstree(btrfs_header_owner(leaf)))
return true;
if (key->objectid == prev_key->objectid)
@@ -475,7 +475,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
* to be COWed to be relocated.
*/
if (unlikely(is_root_item && key->objectid == BTRFS_TREE_RELOC_OBJECTID &&
- !is_fstree(key->offset))) {
+ !btrfs_is_fstree(key->offset))) {
generic_err(leaf, slot,
"invalid reloc tree for root %lld, root id is not a subvolume tree",
key->offset);
@@ -493,7 +493,7 @@ static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key,
}
/* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */
- if (unlikely(!is_fstree(key->objectid) && !is_root_item)) {
+ if (unlikely(!btrfs_is_fstree(key->objectid) && !is_root_item)) {
dir_item_err(leaf, slot,
"invalid location key objectid, have %llu expect [%llu, %llu]",
key->objectid, BTRFS_FIRST_FREE_OBJECTID,
@@ -1311,7 +1311,7 @@ static bool is_valid_dref_root(u64 rootid)
* - tree root
* For v1 space cache
*/
- return is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
+ return btrfs_is_fstree(rootid) || rootid == BTRFS_DATA_RELOC_TREE_OBJECTID ||
rootid == BTRFS_ROOT_TREE_OBJECTID;
}
@@ -1571,7 +1571,7 @@ static int check_extent_item(struct extent_buffer *leaf,
inline_type);
return -EUCLEAN;
}
- if (inline_type < last_type) {
+ if (unlikely(inline_type < last_type)) {
extent_err(leaf, slot,
"inline ref out-of-order: has type %u, prev type %u",
inline_type, last_type);
@@ -1580,7 +1580,7 @@ static int check_extent_item(struct extent_buffer *leaf,
/* Type changed, allow the sequence starts from U64_MAX again. */
if (inline_type > last_type)
last_seq = U64_MAX;
- if (seq > last_seq) {
+ if (unlikely(seq > last_seq)) {
extent_err(leaf, slot,
"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx",
inline_type, inline_offset, seq,
@@ -1929,7 +1929,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf,
break;
}
- if (ret)
+ if (unlikely(ret))
return BTRFS_TREE_BLOCK_INVALID_ITEM;
return BTRFS_TREE_BLOCK_CLEAN;
}
@@ -2167,7 +2167,7 @@ ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
{
- const bool is_subvol = is_fstree(root_owner);
+ const bool is_subvol = btrfs_is_fstree(root_owner);
const u64 eb_owner = btrfs_header_owner(eb);
/*
@@ -2209,7 +2209,7 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
* For subvolume trees, owners can mismatch, but they should all belong
* to subvolume trees.
*/
- if (unlikely(is_subvol != is_fstree(eb_owner))) {
+ if (unlikely(is_subvol != btrfs_is_fstree(eb_owner))) {
btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
@@ -2229,13 +2229,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
int ret;
found_level = btrfs_header_level(eb);
- if (found_level != check->level) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree level check failed\n");
+ if (unlikely(found_level != check->level)) {
+ DEBUG_WARN();
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
eb->start, check->level, found_level);
- return -EIO;
+ return -EUCLEAN;
}
if (!check->has_first_key)
@@ -2251,11 +2250,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
return 0;
/* We have @first_key, so this @eb must have at least one item */
- if (btrfs_header_nritems(eb) == 0) {
+ if (unlikely(btrfs_header_nritems(eb) == 0)) {
btrfs_err(fs_info,
"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ DEBUG_WARN();
return -EUCLEAN;
}
@@ -2263,11 +2262,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb,
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
- if (ret) {
- WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
- KERN_ERR "BTRFS: tree first key check failed\n");
+ ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
+ if (unlikely(ret)) {
+ DEBUG_WARN();
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, check->transid, check->first_key.objectid,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 955d1677e865..69e11557fd13 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -112,7 +112,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid, int del_all);
+ u64 dirid, bool del_all);
static void wait_log_commit(struct btrfs_root *root, int transid);
/*
@@ -138,10 +138,13 @@ static void wait_log_commit(struct btrfs_root *root, int transid);
* and once to do all the other items.
*/
-static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
+static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root)
{
unsigned int nofs_flag;
- struct inode *inode;
+ struct btrfs_inode *inode;
+
+ /* Only meant to be called for subvolume roots and not for log roots. */
+ ASSERT(btrfs_is_fstree(btrfs_root_id(root)));
/*
* We're holding a transaction handle whether we are logging or
@@ -318,8 +321,7 @@ struct walk_control {
/*
* Ignore any items from the inode currently being processed. Needs
- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
- * the LOG_WALK_REPLAY_INODES stage.
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY.
*/
bool ignore_cur_inode;
@@ -376,12 +378,12 @@ static int process_one_buffer(struct btrfs_root *log,
}
/*
- * Item overwrite used by replay and tree logging. eb, slot and key all refer
- * to the src data we are copying out.
+ * Item overwrite used by log replay. The given eb, slot and key all refer to
+ * the source data we are copying out.
*
- * root is the tree we are copying into, and path is a scratch
- * path for use in this function (it should be released on entry and
- * will be released on exit).
+ * The given root is for the tree we are copying into, and path is a scratch
+ * path for use in this function (it should be released on entry and will be
+ * released on exit).
*
* If the key is already in the destination tree the existing item is
* overwritten. If the existing item isn't big enough, it is extended.
@@ -401,6 +403,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
int save_old_i_size = 0;
unsigned long src_ptr;
unsigned long dst_ptr;
+ struct extent_buffer *dst_eb;
+ int dst_slot;
bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
/*
@@ -420,11 +424,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
if (ret < 0)
return ret;
+ dst_eb = path->nodes[0];
+ dst_slot = path->slots[0];
+
if (ret == 0) {
char *src_copy;
- char *dst_copy;
- u32 dst_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 dst_size = btrfs_item_size(dst_eb, dst_slot);
+
if (dst_size != item_size)
goto insert;
@@ -432,23 +438,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
return 0;
}
- dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
- if (!dst_copy || !src_copy) {
+ if (!src_copy) {
btrfs_release_path(path);
- kfree(dst_copy);
- kfree(src_copy);
return -ENOMEM;
}
read_extent_buffer(eb, src_copy, src_ptr, item_size);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
+ ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
- read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
- item_size);
- ret = memcmp(dst_copy, src_copy, item_size);
-
- kfree(dst_copy);
kfree(src_copy);
/*
* they have the same contents, just return, this saves
@@ -470,9 +469,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans,
u64 nbytes;
u32 mode;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ item = btrfs_item_ptr(dst_eb, dst_slot,
struct btrfs_inode_item);
- nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+ nbytes = btrfs_inode_nbytes(dst_eb, item);
item = btrfs_item_ptr(eb, slot,
struct btrfs_inode_item);
btrfs_set_inode_nbytes(eb, item, nbytes);
@@ -514,11 +513,13 @@ insert:
key, item_size);
path->skip_release_on_error = 0;
+ dst_eb = path->nodes[0];
+ dst_slot = path->slots[0];
+
/* make sure any existing item is the correct size */
if (ret == -EEXIST || ret == -EOVERFLOW) {
- u32 found_size;
- found_size = btrfs_item_size(path->nodes[0],
- path->slots[0]);
+ const u32 found_size = btrfs_item_size(dst_eb, dst_slot);
+
if (found_size > item_size)
btrfs_truncate_item(trans, path, item_size, 1);
else if (found_size < item_size)
@@ -526,8 +527,7 @@ insert:
} else if (ret) {
return ret;
}
- dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
- path->slots[0]);
+ dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot);
/* don't overwrite an existing inode if the generation number
* was logged as zero. This is done when the tree logging code
@@ -546,7 +546,6 @@ insert:
dst_item = (struct btrfs_inode_item *)dst_ptr;
if (btrfs_inode_generation(eb, src_item) == 0) {
- struct extent_buffer *dst_eb = path->nodes[0];
const u64 ino_size = btrfs_inode_size(eb, src_item);
/*
@@ -564,30 +563,28 @@ insert:
}
if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
- S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
+ S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) {
save_old_i_size = 1;
- saved_i_size = btrfs_inode_size(path->nodes[0],
- dst_item);
+ saved_i_size = btrfs_inode_size(dst_eb, dst_item);
}
}
- copy_extent_buffer(path->nodes[0], eb, dst_ptr,
- src_ptr, item_size);
+ copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size);
if (save_old_i_size) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
+ btrfs_set_inode_size(dst_eb, dst_item, saved_i_size);
}
/* make sure the generation is filled in */
if (key->type == BTRFS_INODE_ITEM_KEY) {
struct btrfs_inode_item *dst_item;
+
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
- btrfs_set_inode_generation(path->nodes[0], dst_item,
- trans->transid);
- }
+ if (btrfs_inode_generation(dst_eb, dst_item) == 0)
+ btrfs_set_inode_generation(dst_eb, dst_item, trans->transid);
}
no_copy:
btrfs_release_path(path);
@@ -609,21 +606,6 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
return 0;
}
-/*
- * simple helper to read an inode off the disk from a given root
- * This can only be called for subvolume roots and not for the log
- */
-static noinline struct inode *read_one_inode(struct btrfs_root *root,
- u64 objectid)
-{
- struct inode *inode;
-
- inode = btrfs_iget_logging(objectid, root);
- if (IS_ERR(inode))
- inode = NULL;
- return inode;
-}
-
/* replays a single extent in 'eb' at 'slot' with 'key' into the
* subvolume 'root'. path is released on entry and should be released
* on exit.
@@ -649,7 +631,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
u64 start = key->offset;
u64 nbytes = 0;
struct btrfs_file_extent_item *item;
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long size;
int ret = 0;
@@ -673,46 +655,38 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
extent_end = ALIGN(start + size,
fs_info->sectorsize);
} else {
- ret = 0;
- goto out;
+ btrfs_err(fs_info,
+ "unexpected extent type=%d root=%llu inode=%llu offset=%llu",
+ found_type, btrfs_root_id(root), key->objectid, key->offset);
+ return -EUCLEAN;
}
- inode = read_one_inode(root, key->objectid);
- if (!inode) {
- ret = -EIO;
- goto out;
- }
+ inode = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
/*
* first check to see if we already have this extent in the
* file. This must be done before the btrfs_drop_extents run
* so we don't try to drop this extent.
*/
- ret = btrfs_lookup_file_extent(trans, root, path,
- btrfs_ino(BTRFS_I(inode)), start, 0);
+ ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0);
if (ret == 0 &&
(found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
- struct btrfs_file_extent_item cmp1;
- struct btrfs_file_extent_item cmp2;
- struct btrfs_file_extent_item *existing;
- struct extent_buffer *leaf;
-
- leaf = path->nodes[0];
- existing = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_file_extent_item);
+ struct btrfs_file_extent_item existing;
+ unsigned long ptr;
- read_extent_buffer(eb, &cmp1, (unsigned long)item,
- sizeof(cmp1));
- read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
- sizeof(cmp2));
+ ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+ read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing));
/*
* we already have a pointer to this exact extent,
* we don't have to do anything
*/
- if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
+ if (memcmp_extent_buffer(eb, &existing, (unsigned long)item,
+ sizeof(existing)) == 0) {
btrfs_release_path(path);
goto out;
}
@@ -723,7 +697,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
drop_args.start = start;
drop_args.end = extent_end;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
if (ret)
goto out;
@@ -747,8 +721,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
(unsigned long)item, sizeof(*item));
ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
- ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
offset = key->offset - btrfs_file_extent_offset(eb, item);
/*
@@ -873,9 +847,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sums;
struct btrfs_root *csum_root;
- sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
+ sums = list_first_entry(&ordered_sums,
+ struct btrfs_ordered_sum,
+ list);
csum_root = btrfs_csum_root(fs_info,
sums->logical);
if (!ret)
@@ -901,16 +875,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
goto out;
}
- ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
- extent_end - start);
+ ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start);
if (ret)
goto out;
update_inode:
- btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found);
+ ret = btrfs_update_inode(trans, inode);
out:
- iput(inode);
+ iput(&inode->vfs_inode);
return ret;
}
@@ -947,7 +920,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di)
{
struct btrfs_root *root = dir->root;
- struct inode *inode;
+ struct btrfs_inode *inode;
struct fscrypt_str name;
struct extent_buffer *leaf;
struct btrfs_key location;
@@ -962,9 +935,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -972,10 +946,11 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
out:
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1065,6 +1040,126 @@ out:
return ret;
}
+static int unlink_refs_not_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *log_root,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *dir,
+ struct btrfs_inode *inode,
+ u64 parent_objectid)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ unsigned long ptr;
+ unsigned long ptr_end;
+
+ /*
+ * Check all the names in this back reference to see if they are in the
+ * log. If so, we allow them to stay otherwise they must be unlinked as
+ * a conflict.
+ */
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
+ while (ptr < ptr_end) {
+ struct fscrypt_str victim_name;
+ struct btrfs_inode_ref *victim_ref;
+ int ret;
+
+ victim_ref = (struct btrfs_inode_ref *)ptr;
+ ret = read_alloc_one_name(leaf, (victim_ref + 1),
+ btrfs_inode_ref_name_len(leaf, victim_ref),
+ &victim_name);
+ if (ret)
+ return ret;
+
+ ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ if (ret) {
+ kfree(victim_name.name);
+ if (ret < 0)
+ return ret;
+ ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
+ continue;
+ }
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(path);
+
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &victim_name);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static int unlink_extrefs_not_in_log(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_root *root,
+ struct btrfs_root *log_root,
+ struct btrfs_key *search_key,
+ struct btrfs_inode *inode,
+ u64 inode_objectid,
+ u64 parent_objectid)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ const unsigned long base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ const u32 item_size = btrfs_item_size(leaf, path->slots[0]);
+ u32 cur_offset = 0;
+
+ while (cur_offset < item_size) {
+ struct btrfs_inode_extref *extref;
+ struct btrfs_inode *victim_parent;
+ struct fscrypt_str victim_name;
+ int ret;
+
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
+ victim_name.len = btrfs_inode_extref_name_len(leaf, extref);
+
+ if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+ goto next;
+
+ ret = read_alloc_one_name(leaf, &extref->name, victim_name.len,
+ &victim_name);
+ if (ret)
+ return ret;
+
+ search_key->objectid = inode_objectid;
+ search_key->type = BTRFS_INODE_EXTREF_KEY;
+ search_key->offset = btrfs_extref_hash(parent_objectid,
+ victim_name.name,
+ victim_name.len);
+ ret = backref_in_log(log_root, search_key, parent_objectid, &victim_name);
+ if (ret) {
+ kfree(victim_name.name);
+ if (ret < 0)
+ return ret;
+next:
+ cur_offset += victim_name.len + sizeof(*extref);
+ continue;
+ }
+
+ victim_parent = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(victim_parent)) {
+ kfree(victim_name.name);
+ return PTR_ERR(victim_parent);
+ }
+
+ inc_nlink(&inode->vfs_inode);
+ btrfs_release_path(path);
+
+ ret = unlink_inode_for_log_replay(trans, victim_parent, inode,
+ &victim_name);
+ iput(&victim_parent->vfs_inode);
+ kfree(victim_name.name);
+ if (ret)
+ return ret;
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -1075,7 +1170,6 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
u64 ref_index, struct fscrypt_str *name)
{
int ret;
- struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key search_key;
struct btrfs_inode_extref *extref;
@@ -1086,121 +1180,37 @@ again:
search_key.type = BTRFS_INODE_REF_KEY;
search_key.offset = parent_objectid;
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
- if (ret == 0) {
- struct btrfs_inode_ref *victim_ref;
- unsigned long ptr;
- unsigned long ptr_end;
-
- leaf = path->nodes[0];
-
- /* are we trying to overwrite a back ref for the root directory
- * if so, just jump out, we're done
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ /*
+ * Are we trying to overwrite a back ref for the root directory?
+ * If so, we're done.
*/
if (search_key.objectid == search_key.offset)
return 1;
- /* check all the names in this back reference to see
- * if they are in the log. if so, we allow them to stay
- * otherwise they must be unlinked as a conflict
- */
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
- ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
- while (ptr < ptr_end) {
- struct fscrypt_str victim_name;
-
- victim_ref = (struct btrfs_inode_ref *)ptr;
- ret = read_alloc_one_name(leaf, (victim_ref + 1),
- btrfs_inode_ref_name_len(leaf, victim_ref),
- &victim_name);
- if (ret)
- return ret;
-
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
- if (ret < 0) {
- kfree(victim_name.name);
- return ret;
- } else if (!ret) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans, dir, inode,
- &victim_name);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
- }
- kfree(victim_name.name);
-
- ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
- }
+ ret = unlink_refs_not_in_log(trans, path, log_root, &search_key,
+ dir, inode, parent_objectid);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
}
btrfs_release_path(path);
/* Same search but for extended refs */
- extref = btrfs_lookup_inode_extref(NULL, root, path, name,
- inode_objectid, parent_objectid, 0,
- 0);
+ extref = btrfs_lookup_inode_extref(root, path, name, inode_objectid, parent_objectid);
if (IS_ERR(extref)) {
return PTR_ERR(extref);
} else if (extref) {
- u32 item_size;
- u32 cur_offset = 0;
- unsigned long base;
- struct inode *victim_parent;
-
- leaf = path->nodes[0];
-
- item_size = btrfs_item_size(leaf, path->slots[0]);
- base = btrfs_item_ptr_offset(leaf, path->slots[0]);
-
- while (cur_offset < item_size) {
- struct fscrypt_str victim_name;
-
- extref = (struct btrfs_inode_extref *)(base + cur_offset);
-
- if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
- goto next;
-
- ret = read_alloc_one_name(leaf, &extref->name,
- btrfs_inode_extref_name_len(leaf, extref),
- &victim_name);
- if (ret)
- return ret;
-
- search_key.objectid = inode_objectid;
- search_key.type = BTRFS_INODE_EXTREF_KEY;
- search_key.offset = btrfs_extref_hash(parent_objectid,
- victim_name.name,
- victim_name.len);
- ret = backref_in_log(log_root, &search_key,
- parent_objectid, &victim_name);
- if (ret < 0) {
- kfree(victim_name.name);
- return ret;
- } else if (!ret) {
- ret = -ENOENT;
- victim_parent = read_one_inode(root,
- parent_objectid);
- if (victim_parent) {
- inc_nlink(&inode->vfs_inode);
- btrfs_release_path(path);
-
- ret = unlink_inode_for_log_replay(trans,
- BTRFS_I(victim_parent),
- inode, &victim_name);
- }
- iput(victim_parent);
- kfree(victim_name.name);
- if (ret)
- return ret;
- goto again;
- }
- kfree(victim_name.name);
-next:
- cur_offset += victim_name.len + sizeof(*extref);
- }
+ ret = unlink_extrefs_not_in_log(trans, path, root, log_root,
+ &search_key, inode,
+ inode_objectid, parent_objectid);
+ if (ret == -EAGAIN)
+ goto again;
+ else if (ret)
+ return ret;
}
btrfs_release_path(path);
@@ -1325,19 +1335,18 @@ again:
ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
if (!ret) {
- struct inode *dir;
+ struct btrfs_inode *dir;
btrfs_release_path(path);
- dir = read_one_inode(root, parent_id);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_id, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
kfree(name.name);
goto out;
}
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
- inode, &name);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (ret)
goto out;
goto again;
@@ -1369,13 +1378,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct inode *dir = NULL;
- struct inode *inode = NULL;
+ struct btrfs_inode *dir = NULL;
+ struct btrfs_inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
struct fscrypt_str name = { 0 };
int ret;
- int log_ref_ver = 0;
+ const bool is_extref_item = (key->type == BTRFS_INODE_EXTREF_KEY);
u64 parent_objectid;
u64 inode_objectid;
u64 ref_index = 0;
@@ -1384,11 +1393,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ref_ptr = btrfs_item_ptr_offset(eb, slot);
ref_end = ref_ptr + btrfs_item_size(eb, slot);
- if (key->type == BTRFS_INODE_EXTREF_KEY) {
+ if (is_extref_item) {
struct btrfs_inode_extref *r;
ref_struct_size = sizeof(struct btrfs_inode_extref);
- log_ref_ver = 1;
r = (struct btrfs_inode_extref *)ref_ptr;
parent_objectid = btrfs_inode_extref_parent(eb, r);
} else {
@@ -1403,40 +1411,64 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* copy the back ref in. The link count fixup code will take
* care of the rest
*/
- dir = read_one_inode(root, parent_objectid);
- if (!dir) {
- ret = -ENOENT;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ dir = NULL;
goto out;
}
- inode = read_one_inode(root, inode_objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(inode_objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
while (ref_ptr < ref_end) {
- if (log_ref_ver) {
+ if (is_extref_item) {
ret = extref_get_fields(eb, ref_ptr, &name,
&ref_index, &parent_objectid);
+ if (ret)
+ goto out;
/*
* parent object can change from one array
* item to another.
*/
- if (!dir)
- dir = read_one_inode(root, parent_objectid);
if (!dir) {
- ret = -ENOENT;
- goto out;
+ dir = btrfs_iget_logging(parent_objectid, root);
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ dir = NULL;
+ /*
+ * A new parent dir may have not been
+ * logged and not exist in the subvolume
+ * tree, see the comment above before
+ * the loop when getting the first
+ * parent dir.
+ */
+ if (ret == -ENOENT) {
+ /*
+ * The next extref may refer to
+ * another parent dir that
+ * exists, so continue.
+ */
+ ret = 0;
+ goto next;
+ }
+ goto out;
+ }
}
} else {
ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
+ if (ret)
+ goto out;
}
- if (ret)
- goto out;
- ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index, &name);
+ ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+ ref_index, &name);
if (ret < 0) {
goto out;
} else if (ret == 0) {
@@ -1447,8 +1479,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* overwrite any existing back reference, and we don't
* want to create dangling pointers in the directory.
*/
- ret = __add_inode_ref(trans, root, path, log,
- BTRFS_I(dir), BTRFS_I(inode),
+ ret = __add_inode_ref(trans, root, path, log, dir, inode,
inode_objectid, parent_objectid,
ref_index, &name);
if (ret) {
@@ -1458,22 +1489,22 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
}
/* insert our name */
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name, 0, ref_index);
+ ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index);
if (ret)
goto out;
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
/* Else, ret == 1, we already have a perfect match, we're done. */
+next:
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
kfree(name.name);
name.name = NULL;
- if (log_ref_ver) {
- iput(dir);
+ if (is_extref_item && dir) {
+ iput(&dir->vfs_inode);
dir = NULL;
}
}
@@ -1486,8 +1517,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* dir index entries exist for a name but there is no inode reference
* item with the same name.
*/
- ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
- key);
+ ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key);
if (ret)
goto out;
@@ -1496,8 +1526,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
kfree(name.name);
- iput(dir);
- iput(inode);
+ if (dir)
+ iput(&dir->vfs_inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -1611,25 +1643,25 @@ process_slot:
* will free the inode.
*/
static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
- struct inode *inode)
+ struct btrfs_inode *inode)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct btrfs_path *path;
int ret;
u64 nlink = 0;
- u64 ino = btrfs_ino(BTRFS_I(inode));
+ const u64 ino = btrfs_ino(inode);
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- ret = count_inode_refs(BTRFS_I(inode), path);
+ ret = count_inode_refs(inode, path);
if (ret < 0)
goto out;
nlink = ret;
- ret = count_inode_extrefs(BTRFS_I(inode), path);
+ ret = count_inode_extrefs(inode, path);
if (ret < 0)
goto out;
@@ -1637,19 +1669,18 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
ret = 0;
- if (nlink != inode->i_nlink) {
- set_nlink(inode, nlink);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ if (nlink != inode->vfs_inode.i_nlink) {
+ set_nlink(&inode->vfs_inode, nlink);
+ ret = btrfs_update_inode(trans, inode);
if (ret)
goto out;
}
- if (S_ISDIR(inode->i_mode))
- BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (S_ISDIR(inode->vfs_inode.i_mode))
+ inode->index_cnt = (u64)-1;
- if (inode->i_nlink == 0) {
- if (S_ISDIR(inode->i_mode)) {
- ret = replay_dir_deletes(trans, root, NULL, path,
- ino, 1);
+ if (inode->vfs_inode.i_nlink == 0) {
+ if (S_ISDIR(inode->vfs_inode.i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path, ino, true);
if (ret)
goto out;
}
@@ -1669,12 +1700,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_key key;
- struct inode *inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = (u64)-1;
while (1) {
+ struct btrfs_inode *inode;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
break;
@@ -1696,14 +1728,14 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
btrfs_release_path(path);
- inode = read_one_inode(root, key.offset);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.offset, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
ret = fixup_inode_link_count(trans, inode);
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
@@ -1731,12 +1763,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
{
struct btrfs_key key;
int ret = 0;
- struct inode *inode;
+ struct btrfs_inode *inode;
+ struct inode *vfs_inode;
- inode = read_one_inode(root, objectid);
- if (!inode)
- return -EIO;
+ inode = btrfs_iget_logging(objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ vfs_inode = &inode->vfs_inode;
key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = objectid;
@@ -1745,15 +1779,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (ret == 0) {
- if (!inode->i_nlink)
- set_nlink(inode, 1);
+ if (!vfs_inode->i_nlink)
+ set_nlink(vfs_inode, 1);
else
- inc_nlink(inode);
- ret = btrfs_update_inode(trans, BTRFS_I(inode));
+ inc_nlink(vfs_inode);
+ ret = btrfs_update_inode(trans, inode);
} else if (ret == -EEXIST) {
ret = 0;
}
- iput(inode);
+ iput(vfs_inode);
return ret;
}
@@ -1769,27 +1803,26 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name,
struct btrfs_key *location)
{
- struct inode *inode;
- struct inode *dir;
+ struct btrfs_inode *inode;
+ struct btrfs_inode *dir;
int ret;
- inode = read_one_inode(root, location->objectid);
- if (!inode)
- return -ENOENT;
+ inode = btrfs_iget_logging(location->objectid, root);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
- dir = read_one_inode(root, dirid);
- if (!dir) {
- iput(inode);
- return -EIO;
+ dir = btrfs_iget_logging(dirid, root);
+ if (IS_ERR(dir)) {
+ iput(&inode->vfs_inode);
+ return PTR_ERR(dir);
}
- ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
- 1, index);
+ ret = btrfs_add_link(trans, dir, inode, name, 1, index);
/* FIXME, put inode into FIXUP list */
- iput(inode);
- iput(dir);
+ iput(&inode->vfs_inode);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -1851,16 +1884,16 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
bool index_dst_matches = false;
struct btrfs_key log_key;
struct btrfs_key search_key;
- struct inode *dir;
+ struct btrfs_inode *dir;
u8 log_flags;
bool exists;
int ret;
bool update_size = true;
bool name_added = false;
- dir = read_one_inode(root, key->objectid);
- if (!dir)
- return -EIO;
+ dir = btrfs_iget_logging(key->objectid, root);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
if (ret)
@@ -1881,9 +1914,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(dir_dst_di);
goto out;
} else if (dir_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- dir_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
dir_dst_matches = (ret == 1);
@@ -1898,9 +1930,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = PTR_ERR(index_dst_di);
goto out;
} else if (index_dst_di) {
- ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
- index_dst_di, &log_key,
- log_flags, exists);
+ ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di,
+ &log_key, log_flags, exists);
if (ret < 0)
goto out;
index_dst_matches = (ret == 1);
@@ -1955,11 +1986,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
out:
if (!ret && update_size) {
- btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
- ret = btrfs_update_inode(trans, BTRFS_I(dir));
+ btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2);
+ ret = btrfs_update_inode(trans, dir);
}
kfree(name.name);
- iput(dir);
+ iput(&dir->vfs_inode);
if (!ret && name_added)
ret = 1;
return ret;
@@ -2116,16 +2147,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *log,
struct btrfs_path *path,
struct btrfs_path *log_path,
- struct inode *dir,
+ struct btrfs_inode *dir,
struct btrfs_key *dir_key)
{
- struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_root *root = dir->root;
int ret;
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
struct fscrypt_str name = { 0 };
- struct inode *inode = NULL;
+ struct btrfs_inode *inode = NULL;
struct btrfs_key location;
/*
@@ -2162,9 +2193,10 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
- inode = read_one_inode(root, location.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(location.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
goto out;
}
@@ -2172,9 +2204,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- inc_nlink(inode);
- ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
- &name);
+ inc_nlink(&inode->vfs_inode);
+ ret = unlink_inode_for_log_replay(trans, dir, inode, &name);
/*
* Unlike dir item keys, dir index keys can only have one name (entry) in
* them, as there are no key collisions since each key has a unique offset
@@ -2184,7 +2215,8 @@ out:
btrfs_release_path(path);
btrfs_release_path(log_path);
kfree(name.name);
- iput(inode);
+ if (inode)
+ iput(&inode->vfs_inode);
return ret;
}
@@ -2300,7 +2332,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid, int del_all)
+ u64 dirid, bool del_all)
{
u64 range_start;
u64 range_end;
@@ -2308,7 +2340,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_key dir_key;
struct btrfs_key found_key;
struct btrfs_path *log_path;
- struct inode *dir;
+ struct btrfs_inode *dir;
dir_key.objectid = dirid;
dir_key.type = BTRFS_DIR_INDEX_KEY;
@@ -2316,14 +2348,17 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
if (!log_path)
return -ENOMEM;
- dir = read_one_inode(root, dirid);
- /* it isn't an error if the inode isn't there, that can happen
- * because we replay the deletes before we copy in the inode item
- * from the log
+ dir = btrfs_iget_logging(dirid, root);
+ /*
+ * It isn't an error if the inode isn't there, that can happen because
+ * we replay the deletes before we copy in the inode item from the log.
*/
- if (!dir) {
+ if (IS_ERR(dir)) {
btrfs_free_path(log_path);
- return 0;
+ ret = PTR_ERR(dir);
+ if (ret == -ENOENT)
+ ret = 0;
+ return ret;
}
range_start = 0;
@@ -2385,7 +2420,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
out:
btrfs_release_path(path);
btrfs_free_path(log_path);
- iput(dir);
+ iput(&dir->vfs_inode);
return ret;
}
@@ -2429,23 +2464,30 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
nritems = btrfs_header_nritems(eb);
for (i = 0; i < nritems; i++) {
- btrfs_item_key_to_cpu(eb, &key, i);
+ struct btrfs_inode_item *inode_item;
- /* inode keys are done during the first stage */
- if (key.type == BTRFS_INODE_ITEM_KEY &&
- wc->stage == LOG_WALK_REPLAY_INODES) {
- struct btrfs_inode_item *inode_item;
- u32 mode;
+ btrfs_item_key_to_cpu(eb, &key, i);
- inode_item = btrfs_item_ptr(eb, i,
- struct btrfs_inode_item);
+ if (key.type == BTRFS_INODE_ITEM_KEY) {
+ inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
/*
- * If we have a tmpfile (O_TMPFILE) that got fsync'ed
- * and never got linked before the fsync, skip it, as
- * replaying it is pointless since it would be deleted
- * later. We skip logging tmpfiles, but it's always
- * possible we are replaying a log created with a kernel
- * that used to log tmpfiles.
+ * An inode with no links is either:
+ *
+ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
+ * got linked before the fsync, skip it, as replaying
+ * it is pointless since it would be deleted later.
+ * We skip logging tmpfiles, but it's always possible
+ * we are replaying a log created with a kernel that
+ * used to log tmpfiles;
+ *
+ * 2) A non-tmpfile which got its last link deleted
+ * while holding an open fd on it and later got
+ * fsynced through that fd. We always log the
+ * parent inodes when inode->last_unlink_trans is
+ * set to the current transaction, so ignore all the
+ * inode items for this inode. We will delete the
+ * inode when processing the parent directory with
+ * replay_dir_deletes().
*/
if (btrfs_inode_nlink(eb, inode_item) == 0) {
wc->ignore_cur_inode = true;
@@ -2453,14 +2495,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
} else {
wc->ignore_cur_inode = false;
}
- ret = replay_xattr_deletes(wc->trans, root, log,
- path, key.objectid);
+ }
+
+ /* Inode keys are done during the first stage. */
+ if (key.type == BTRFS_INODE_ITEM_KEY &&
+ wc->stage == LOG_WALK_REPLAY_INODES) {
+ u32 mode;
+
+ ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid);
if (ret)
break;
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
- ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid, 0);
+ ret = replay_dir_deletes(wc->trans, root, log, path,
+ key.objectid, false);
if (ret)
break;
}
@@ -2479,30 +2527,28 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
*/
if (S_ISREG(mode)) {
struct btrfs_drop_extents_args drop_args = { 0 };
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 from;
- inode = read_one_inode(root, key.objectid);
- if (!inode) {
- ret = -EIO;
+ inode = btrfs_iget_logging(key.objectid, root);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
break;
}
- from = ALIGN(i_size_read(inode),
+ from = ALIGN(i_size_read(&inode->vfs_inode),
root->fs_info->sectorsize);
drop_args.start = from;
drop_args.end = (u64)-1;
drop_args.drop_cache = true;
- ret = btrfs_drop_extents(wc->trans, root,
- BTRFS_I(inode),
+ ret = btrfs_drop_extents(wc->trans, root, inode,
&drop_args);
if (!ret) {
- inode_sub_bytes(inode,
+ inode_sub_bytes(&inode->vfs_inode,
drop_args.bytes_found);
/* Update the inode's nbytes. */
- ret = btrfs_update_inode(wc->trans,
- BTRFS_I(inode));
+ ret = btrfs_update_inode(wc->trans, inode);
}
- iput(inode);
+ iput(&inode->vfs_inode);
if (ret)
break;
}
@@ -2537,9 +2583,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
key.type == BTRFS_INODE_EXTREF_KEY) {
ret = add_inode_ref(wc->trans, root, log, path,
eb, i, &key);
- if (ret && ret != -ENOENT)
+ if (ret)
break;
- ret = 0;
} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
ret = replay_one_extent(wc->trans, root, path,
eb, i, &key);
@@ -2560,14 +2605,14 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
/*
* Correctly adjust the reserved bytes occupied by a log tree extent buffer
*/
-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
{
struct btrfs_block_group *cache;
cache = btrfs_lookup_block_group(fs_info, start);
if (!cache) {
btrfs_err(fs_info, "unable to find block group for %llu", start);
- return;
+ return -ENOENT;
}
spin_lock(&cache->space_info->lock);
@@ -2578,27 +2623,22 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
+
+ return 0;
}
static int clean_log_buffer(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
- int ret;
-
btrfs_tree_lock(eb);
btrfs_clear_buffer_dirty(trans, eb);
wait_on_extent_buffer_writeback(eb);
btrfs_tree_unlock(eb);
- if (trans) {
- ret = btrfs_pin_reserved_extent(trans, eb);
- if (ret)
- return ret;
- } else {
- unaccount_log_buffer(eb->fs_info, eb->start);
- }
+ if (trans)
+ return btrfs_pin_reserved_extent(trans, eb);
- return 0;
+ return unaccount_log_buffer(eb->fs_info, eb->start);
}
static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -2738,7 +2778,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
level = btrfs_header_level(log->node);
orig_level = level;
path->nodes[level] = log->node;
- atomic_inc(&log->node->refs);
+ refcount_inc(&log->node->refs);
path->slots[level] = 0;
while (1) {
@@ -2979,9 +3019,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (log_transid % 2 == 0)
- mark = EXTENT_DIRTY;
+ mark = EXTENT_DIRTY_LOG1;
else
- mark = EXTENT_NEW;
+ mark = EXTENT_DIRTY_LOG2;
/* we start IO on all the marked extents here, but we don't actually
* wait for them until later.
@@ -3112,7 +3152,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(fs_info,
&log_root_tree->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
blk_finish_plug(&plug);
/*
* As described above, -EAGAIN indicates a hole in the extents. We
@@ -3132,7 +3172,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_tree_log_extents(log, mark);
if (!ret)
ret = btrfs_wait_tree_log_extents(log_root_tree,
- EXTENT_NEW | EXTENT_DIRTY);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (ret) {
btrfs_set_log_full_commit(trans);
mutex_unlock(&log_root_tree->log_mutex);
@@ -3258,9 +3298,9 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
*/
btrfs_write_marked_extents(log->fs_info,
&log->dirty_log_pages,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
btrfs_wait_tree_log_extents(log,
- EXTENT_DIRTY | EXTENT_NEW);
+ EXTENT_DIRTY_LOG1 | EXTENT_DIRTY_LOG2);
if (trans)
btrfs_abort_transaction(trans, ret);
@@ -3269,8 +3309,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
}
}
- extent_io_tree_release(&log->dirty_log_pages);
- extent_io_tree_release(&log->log_csum_range);
+ btrfs_extent_io_tree_release(&log->dirty_log_pages);
+ btrfs_extent_io_tree_release(&log->log_csum_range);
btrfs_put_root(log);
}
@@ -3450,7 +3490,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans,
* inode item because on log replay we update the field to reflect
* all existing entries in the directory (see overwrite_item()).
*/
- return btrfs_delete_one_dir_name(trans, log, path, di);
+ return btrfs_del_item(trans, log, path);
}
/*
@@ -3490,26 +3530,27 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
return;
}
- ret = join_running_log_trans(root);
- if (ret)
- return;
-
- mutex_lock(&dir->log_mutex);
-
path = btrfs_alloc_path();
if (!path) {
- ret = -ENOMEM;
- goto out_unlock;
+ btrfs_set_log_full_commit(trans);
+ return;
}
+ ret = join_running_log_trans(root);
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
+ goto out;
+
+ mutex_lock(&dir->log_mutex);
+
ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
name, index);
- btrfs_free_path(path);
-out_unlock:
mutex_unlock(&dir->log_mutex);
if (ret < 0)
btrfs_set_log_full_commit(trans);
btrfs_end_log_trans(root);
+out:
+ btrfs_free_path(path);
}
/* see comments for btrfs_del_dir_entries_in_log */
@@ -3519,7 +3560,6 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 dirid)
{
struct btrfs_root *log;
- u64 index;
int ret;
ret = inode_logged(trans, inode, NULL);
@@ -3531,13 +3571,13 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
}
ret = join_running_log_trans(root);
- if (ret)
+ ASSERT(ret == 0, "join_running_log_trans() ret=%d", ret);
+ if (WARN_ON(ret))
return;
log = root->log_root;
mutex_lock(&inode->log_mutex);
- ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
- dirid, &index);
+ ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, NULL);
mutex_unlock(&inode->log_mutex);
if (ret < 0 && ret != -ENOENT)
btrfs_set_log_full_commit(trans);
@@ -3560,8 +3600,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
struct btrfs_dir_log_item *item;
key.objectid = dirid;
- key.offset = first_offset;
key.type = BTRFS_DIR_LOG_INDEX_KEY;
+ key.offset = first_offset;
ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
/*
* -EEXIST is fine and can happen sporadically when we are logging a
@@ -3702,7 +3742,7 @@ static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
* Add extra ref to scratch eb so that it is not freed when callers
* release the path, so we can reuse it later if needed.
*/
- atomic_inc(&ctx->scratch_eb->refs);
+ refcount_inc(&ctx->scratch_eb->refs);
return 0;
}
@@ -4190,44 +4230,37 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode, int log_inode_only,
u64 logged_isize)
{
- struct btrfs_map_token token;
u64 flags;
- btrfs_init_map_token(&token, leaf);
-
if (log_inode_only) {
/* set the generation to zero so the recover code
* can tell the difference between an logging
* just to say 'this inode exists' and a logging
* to say 'update this inode with these values'
*/
- btrfs_set_token_inode_generation(&token, item, 0);
- btrfs_set_token_inode_size(&token, item, logged_isize);
+ btrfs_set_inode_generation(leaf, item, 0);
+ btrfs_set_inode_size(leaf, item, logged_isize);
} else {
- btrfs_set_token_inode_generation(&token, item,
- BTRFS_I(inode)->generation);
- btrfs_set_token_inode_size(&token, item, inode->i_size);
+ btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
+ btrfs_set_inode_size(leaf, item, inode->i_size);
}
- btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
- btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
- btrfs_set_token_inode_mode(&token, item, inode->i_mode);
- btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
+ btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
+ btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
+ btrfs_set_inode_mode(leaf, item, inode->i_mode);
+ btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
+
+ btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->atime,
- inode_get_atime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->atime,
- inode_get_atime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->mtime,
- inode_get_mtime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->mtime,
- inode_get_mtime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
+ btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
- btrfs_set_token_timespec_sec(&token, &item->ctime,
- inode_get_ctime_sec(inode));
- btrfs_set_token_timespec_nsec(&token, &item->ctime,
- inode_get_ctime_nsec(inode));
+ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
+ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
/*
* We do not need to set the nbytes field, in fact during a fast fsync
@@ -4238,13 +4271,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* inode item in subvolume tree as needed (see overwrite_item()).
*/
- btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
- btrfs_set_token_inode_transid(&token, item, trans->transid);
- btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
+ btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
+ btrfs_set_inode_transid(leaf, item, trans->transid);
+ btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
BTRFS_I(inode)->ro_flags);
- btrfs_set_token_inode_flags(&token, item, flags);
- btrfs_set_token_inode_block_group(&token, item, 0);
+ btrfs_set_inode_flags(leaf, item, flags);
+ btrfs_set_inode_block_group(leaf, item, 0);
}
static int log_inode_item(struct btrfs_trans_handle *trans,
@@ -4318,8 +4351,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
* file which happens to refer to the same extent as well. Such races
* can leave checksum items in the log with overlapping ranges.
*/
- ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
- &cached_state);
+ ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
if (ret)
return ret;
/*
@@ -4335,8 +4368,8 @@ static int log_csums(struct btrfs_trans_handle *trans,
if (!ret)
ret = btrfs_csum_file_blocks(trans, log_root, sums);
- unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
- &cached_state);
+ btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
+ &cached_state);
return ret;
}
@@ -4666,7 +4699,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (extent_map_is_compressed(em)) {
+ if (btrfs_extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = em->disk_num_bytes;
} else {
@@ -4675,7 +4708,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
}
/* block start is already adjusted for the file extent offset. */
- block_start = extent_map_block_start(em);
+ block_start = btrfs_extent_map_block_start(em);
csum_root = btrfs_csum_root(trans->fs_info, block_start);
ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset,
block_start + csum_offset + csum_len - 1,
@@ -4685,9 +4718,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
ret = 0;
while (!list_empty(&ordered_sums)) {
- struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
- struct btrfs_ordered_sum,
- list);
+ struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums,
+ struct btrfs_ordered_sum,
+ list);
if (!ret)
ret = log_csums(trans, inode, log_root, sums);
list_del(&sums->list);
@@ -4710,7 +4743,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_key key;
enum btrfs_compression_type compress_type;
u64 extent_offset = em->offset;
- u64 block_start = extent_map_block_start(em);
+ u64 block_start = btrfs_extent_map_block_start(em);
u64 block_len;
int ret;
@@ -4721,7 +4754,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = em->disk_num_bytes;
- compress_type = extent_map_compression(em);
+ compress_type = btrfs_extent_map_compression(em);
if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
@@ -4965,7 +4998,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
list_sort(NULL, &extents, extent_cmp);
process:
while (!list_empty(&extents)) {
- em = list_entry(extents.next, struct extent_map, list);
+ em = list_first_entry(&extents, struct extent_map, list);
list_del_init(&em->list);
@@ -4974,8 +5007,8 @@ process:
* private list.
*/
if (ret) {
- clear_em_logging(inode, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
continue;
}
@@ -4983,8 +5016,8 @@ process:
ret = log_one_extent(trans, inode, em, path, ctx);
write_lock(&tree->lock);
- clear_em_logging(inode, em);
- free_extent_map(em);
+ btrfs_clear_em_logging(inode, em);
+ btrfs_free_extent_map(em);
}
WARN_ON(!list_empty(&extents));
write_unlock(&tree->lock);
@@ -5481,7 +5514,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
ihold(&curr_inode->vfs_inode);
while (true) {
- struct inode *vfs_inode;
struct btrfs_key key;
struct btrfs_key found_key;
u64 next_index;
@@ -5497,7 +5529,7 @@ again:
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_dir_item *di;
struct btrfs_key di_key;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
int log_mode = LOG_INODE_EXISTS;
int type;
@@ -5524,17 +5556,16 @@ again:
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
break;
}
ctx->log_new_dentries = false;
if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
- log_mode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto out;
if (ctx->log_new_dentries) {
@@ -5576,14 +5607,13 @@ again:
kfree(dir_elem);
btrfs_add_delayed_iput(curr_inode);
- curr_inode = NULL;
- vfs_inode = btrfs_iget_logging(ino, root);
- if (IS_ERR(vfs_inode)) {
- ret = PTR_ERR(vfs_inode);
+ curr_inode = btrfs_iget_logging(ino, root);
+ if (IS_ERR(curr_inode)) {
+ ret = PTR_ERR(curr_inode);
+ curr_inode = NULL;
break;
}
- curr_inode = BTRFS_I(vfs_inode);
}
out:
btrfs_free_path(path);
@@ -5661,7 +5691,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx)
{
struct btrfs_ino_list *ino_elem;
- struct inode *inode;
+ struct btrfs_inode *inode;
/*
* It's rare to have a lot of conflicting inodes, in practice it is not
@@ -5752,12 +5782,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans,
* inode in LOG_INODE_EXISTS mode and rename operations update the log,
* so that the log ends up with the new name and without the old name.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
return 0;
}
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ btrfs_add_delayed_iput(inode);
ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
if (!ino_elem)
@@ -5793,7 +5823,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
*/
while (!list_empty(&ctx->conflict_inodes)) {
struct btrfs_ino_list *curr;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
u64 parent;
@@ -5829,9 +5859,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* dir index key range logged for the directory. So we
* must make sure the deletion is recorded.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_ALL, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
continue;
@@ -5847,8 +5876,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* it again because if some other task logged the inode after
* that, we can avoid doing it again.
*/
- if (!need_log_inode(trans, BTRFS_I(inode))) {
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (!need_log_inode(trans, inode)) {
+ btrfs_add_delayed_iput(inode);
continue;
}
@@ -5859,8 +5888,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* well because during a rename we pin the log and update the
* log with the new name before we unpin it.
*/
- ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
break;
}
@@ -6351,7 +6380,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
list_for_each_entry(item, delayed_ins_list, log_list) {
struct btrfs_dir_item *dir_item;
- struct inode *di_inode;
+ struct btrfs_inode *di_inode;
struct btrfs_key key;
int log_mode = LOG_INODE_EXISTS;
@@ -6367,8 +6396,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
break;
}
- if (!need_log_inode(trans, BTRFS_I(di_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ if (!need_log_inode(trans, di_inode)) {
+ btrfs_add_delayed_iput(di_inode);
continue;
}
@@ -6376,12 +6405,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
log_mode = LOG_INODE_ALL;
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
+ ret = btrfs_log_inode(trans, di_inode, log_mode, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
+ ret = log_new_dir_dentries(trans, di_inode, ctx);
- btrfs_add_delayed_iput(BTRFS_I(di_inode));
+ btrfs_add_delayed_iput(di_inode);
if (ret)
break;
@@ -6605,6 +6634,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
btrfs_log_get_delayed_items(inode, &delayed_ins_list,
&delayed_del_list);
+ /*
+ * If we are fsyncing a file with 0 hard links, then commit the delayed
+ * inode because the last inode ref (or extref) item may still be in the
+ * subvolume tree and if we log it the file will still exist after a log
+ * replay. So commit the delayed inode to delete that last ref and we
+ * skip logging it.
+ */
+ if (inode->vfs_inode.i_nlink == 0) {
+ ret = btrfs_commit_inode_delayed_inode(inode);
+ if (ret)
+ goto out_unlock;
+ }
+
ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
path, dst_path, logged_isize,
inode_only, ctx,
@@ -6789,7 +6831,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
ptr = btrfs_item_ptr_offset(leaf, slot);
while (cur_offset < item_size) {
struct btrfs_key inode_key;
- struct inode *dir_inode;
+ struct btrfs_inode *dir_inode;
inode_key.type = BTRFS_INODE_ITEM_KEY;
inode_key.offset = 0;
@@ -6838,18 +6880,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
goto out;
}
- if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ if (!need_log_inode(trans, dir_inode)) {
+ btrfs_add_delayed_iput(dir_inode);
continue;
}
ctx->log_new_dentries = false;
- ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
- LOG_INODE_ALL, ctx);
+ ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx);
if (!ret && ctx->log_new_dentries)
- ret = log_new_dir_dentries(trans,
- BTRFS_I(dir_inode), ctx);
- btrfs_add_delayed_iput(BTRFS_I(dir_inode));
+ ret = log_new_dir_dentries(trans, dir_inode, ctx);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
@@ -6874,7 +6914,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
struct btrfs_key search_key;
- struct inode *inode;
+ struct btrfs_inode *inode;
u64 ino;
int ret = 0;
@@ -6889,11 +6929,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans,
if (IS_ERR(inode))
return PTR_ERR(inode);
- if (BTRFS_I(inode)->generation >= trans->transid &&
- need_log_inode(trans, BTRFS_I(inode)))
- ret = btrfs_log_inode(trans, BTRFS_I(inode),
- LOG_INODE_EXISTS, ctx);
- btrfs_add_delayed_iput(BTRFS_I(inode));
+ if (inode->generation >= trans->transid &&
+ need_log_inode(trans, inode))
+ ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx);
+ btrfs_add_delayed_iput(inode);
if (ret)
return ret;
@@ -7061,42 +7100,29 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
int ret = 0;
- bool log_dentries = false;
+ bool log_dentries;
- if (btrfs_test_opt(fs_info, NOTREELOG)) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_test_opt(fs_info, NOTREELOG))
+ return BTRFS_LOG_FORCE_COMMIT;
- if (btrfs_root_refs(&root->root_item) == 0) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return BTRFS_LOG_FORCE_COMMIT;
/*
* If we're logging an inode from a subvolume created in the current
* transaction we must force a commit since the root is not persisted.
*/
- if (btrfs_root_generation(&root->root_item) == trans->transid) {
- ret = BTRFS_LOG_FORCE_COMMIT;
- goto end_no_trans;
- }
+ if (btrfs_root_generation(&root->root_item) == trans->transid)
+ return BTRFS_LOG_FORCE_COMMIT;
- /*
- * Skip already logged inodes or inodes corresponding to tmpfiles
- * (since logging them is pointless, a link count of 0 means they
- * will never be accessible).
- */
- if ((btrfs_inode_in_log(inode, trans->transid) &&
- list_empty(&ctx->ordered_extents)) ||
- inode->vfs_inode.i_nlink == 0) {
- ret = BTRFS_NO_LOG_SYNC;
- goto end_no_trans;
- }
+ /* Skip already logged inodes and without new extents. */
+ if (btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents))
+ return BTRFS_NO_LOG_SYNC;
ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_no_trans;
+ return ret;
ret = btrfs_log_inode(trans, inode, inode_only, ctx);
if (ret)
@@ -7115,8 +7141,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
- log_dentries = true;
+ /*
+ * Track if we need to log dentries because ctx->log_new_dentries can
+ * be modified in the call chains below.
+ */
+ log_dentries = ctx->log_new_dentries;
/*
* On unlink we must make sure all our current and old parent directory
@@ -7171,8 +7200,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (log_dentries)
ret = log_new_dir_dentries(trans, inode, ctx);
- else
- ret = 0;
end_trans:
if (ret < 0) {
btrfs_set_log_full_commit(trans);
@@ -7182,7 +7209,7 @@ end_trans:
if (ret)
btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
-end_no_trans:
+
return ret;
}
@@ -7216,8 +7243,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
struct btrfs_key key;
- struct btrfs_key found_key;
- struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
struct walk_control wc = {
.process_func = process_one_buffer,
@@ -7247,10 +7272,13 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
key.objectid = BTRFS_TREE_LOG_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
+ struct btrfs_root *log;
+ struct btrfs_key found_key;
+
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
@@ -7279,6 +7307,12 @@ again:
true);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+ wc.replay_dest = NULL;
+ if (ret != -ENOENT) {
+ btrfs_put_root(log);
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
/*
* We didn't find the subvol, likely because it was
@@ -7291,36 +7325,36 @@ again:
* block from being modified, and we'll just bail for
* each subsequent pass.
*/
- if (ret == -ENOENT)
- ret = btrfs_pin_extent_for_log_replay(trans, log->node);
- btrfs_put_root(log);
-
- if (!ret)
- goto next;
- btrfs_abort_transaction(trans, ret);
- goto error;
+ ret = btrfs_pin_extent_for_log_replay(trans, log->node);
+ if (ret) {
+ btrfs_put_root(log);
+ btrfs_abort_transaction(trans, ret);
+ goto error;
+ }
+ goto next;
}
wc.replay_dest->log_root = log;
ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
- if (ret)
- /* The loop needs to continue due to the root refs */
+ if (ret) {
btrfs_abort_transaction(trans, ret);
- else
- ret = walk_log_tree(trans, log, &wc);
+ goto next;
+ }
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
- ret = fixup_inode_link_counts(trans, wc.replay_dest,
- path);
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ ret = walk_log_tree(trans, log, &wc);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
}
- if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+ if (wc.stage == LOG_WALK_REPLAY_ALL) {
struct btrfs_root *root = wc.replay_dest;
- btrfs_release_path(path);
-
+ ret = fixup_inode_link_counts(trans, wc.replay_dest, path);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
/*
* We have just replayed everything, and the highest
* objectid of fs roots probably has changed in case
@@ -7330,17 +7364,20 @@ again:
* could only happen during mount.
*/
ret = btrfs_init_root_free_objectid(root);
- if (ret)
+ if (ret) {
btrfs_abort_transaction(trans, ret);
+ goto next;
+ }
+ }
+next:
+ if (wc.replay_dest) {
+ wc.replay_dest->log_root = NULL;
+ btrfs_put_root(wc.replay_dest);
}
-
- wc.replay_dest->log_root = NULL;
- btrfs_put_root(wc.replay_dest);
btrfs_put_root(log);
if (ret)
goto error;
-next:
if (found_key.offset == 0)
break;
key.offset = found_key.offset - 1;
@@ -7471,6 +7508,8 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* full log sync.
* Also we don't need to worry with renames, since btrfs_rename() marks the log
* for full commit when renaming a subvolume.
+ *
+ * Must be called before creating the subvolume entry in its parent directory.
*/
void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans,
struct btrfs_inode *dir)
@@ -7507,6 +7546,9 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
bool log_pinned = false;
int ret;
+ btrfs_init_log_ctx(&ctx, inode);
+ ctx.logging_new_name = true;
+
/*
* this will force the logging code to walk the dentry chain
* up for the file
@@ -7538,6 +7580,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
ret = 0;
/*
+ * Now that we know we need to update the log, allocate the scratch eb
+ * for the context before joining a log transaction below, as this can
+ * take time and therefore we could delay log commits from other tasks.
+ */
+ btrfs_init_log_ctx_scratch_eb(&ctx);
+
+ /*
* If we are doing a rename (old_dir is not NULL) from a directory that
* was previously logged, make sure that on log replay we get the old
* dir entry deleted. This is needed because we will also log the new
@@ -7555,6 +7604,14 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
&old_dentry->d_name, 0, &fname);
if (ret)
goto out;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ fscrypt_free_filename(&fname);
+ goto out;
+ }
+
/*
* We have two inodes to update in the log, the old directory and
* the inode that got renamed, so we must pin the log to prevent
@@ -7568,19 +7625,13 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* mark the log for a full commit.
*/
if (WARN_ON_ONCE(ret < 0)) {
+ btrfs_free_path(path);
fscrypt_free_filename(&fname);
goto out;
}
log_pinned = true;
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- fscrypt_free_filename(&fname);
- goto out;
- }
-
/*
* Other concurrent task might be logging the old directory,
* as it can be triggered when logging other inode that had or
@@ -7612,9 +7663,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
goto out;
}
- btrfs_init_log_ctx(&ctx, inode);
- ctx.logging_new_name = true;
- btrfs_init_log_ctx_scratch_eb(&ctx);
/*
* We don't care about the return value. If we fail to log the new name
* then we know the next attempt to sync the log will fallback to a full
@@ -7623,7 +7671,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
* inconsistent state after a rename operation.
*/
btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
- free_extent_buffer(ctx.scratch_eb);
ASSERT(list_empty(&ctx.conflict_inodes));
out:
/*
@@ -7636,5 +7683,6 @@ out:
btrfs_set_log_full_commit(trans);
if (log_pinned)
btrfs_end_log_trans(root);
+ free_extent_buffer(ctx.scratch_eb);
}
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index 1ac2678fc4ca..9e8cb3b7c064 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -27,18 +27,29 @@ struct tree_mod_elem {
/* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
u64 generation;
- /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
- struct tree_mod_root old_root;
+ union {
+ /*
+ * This is used for the following op types:
+ *
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING
+ * BTRFS_MOD_LOG_KEY_REMOVE
+ * BTRFS_MOD_LOG_KEY_REPLACE
+ */
+ struct {
+ struct btrfs_disk_key key;
+ u64 blockptr;
+ } slot_change;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+ };
};
/*
@@ -164,6 +175,30 @@ static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
return 0;
}
+static inline bool skip_eb_logging(const struct extent_buffer *eb)
+{
+ const u64 owner = btrfs_header_owner(eb);
+
+ if (btrfs_header_level(eb) == 0)
+ return true;
+
+ /*
+ * Tree mod logging exists so that there's a consistent view of the
+ * extents and backrefs of inodes even if while a task is iterating over
+ * them other tasks are modifying subvolume trees and the extent tree
+ * (including running delayed refs). So we only need to log extent
+ * buffers from the extent tree and subvolume trees.
+ */
+
+ if (owner == BTRFS_EXTENT_TREE_OBJECTID)
+ return false;
+
+ if (btrfs_is_fstree(owner))
+ return false;
+
+ return true;
+}
+
/*
* Determines if logging can be omitted. Returns true if it can. Otherwise, it
* returns false with the tree_mod_log_lock acquired. The caller must hold
@@ -174,7 +209,7 @@ static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, const struct extent
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return true;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return true;
write_lock(&fs_info->tree_mod_log_lock);
@@ -192,7 +227,7 @@ static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
{
if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
return false;
- if (eb && btrfs_header_level(eb) == 0)
+ if (eb && skip_eb_logging(eb))
return false;
return true;
@@ -204,15 +239,17 @@ static struct tree_mod_elem *alloc_tree_mod_elem(const struct extent_buffer *eb,
{
struct tree_mod_elem *tm;
+ /* Can't be one of these types, due to union in struct tree_mod_elem. */
+ ASSERT(op != BTRFS_MOD_LOG_MOVE_KEYS);
+ ASSERT(op != BTRFS_MOD_LOG_ROOT_REPLACE);
+
tm = kzalloc(sizeof(*tm), GFP_NOFS);
if (!tm)
return NULL;
tm->logical = eb->start;
- if (op != BTRFS_MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
+ btrfs_node_key(eb, &tm->slot_change.key, slot);
+ tm->slot_change.blockptr = btrfs_node_blockptr(eb, slot);
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
@@ -830,8 +867,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
fallthrough;
case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
case BTRFS_MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
n++;
@@ -840,8 +877,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
break;
case BTRFS_MOD_LOG_KEY_REPLACE:
BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_key(eb, &tm->slot_change.key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->slot_change.blockptr);
btrfs_set_node_ptr_generation(eb, tm->slot,
tm->generation);
break;
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index fc59b57257d6..7e16a253fb35 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -129,21 +129,25 @@ void ulist_free(struct ulist *ulist)
kfree(ulist);
}
+static int ulist_node_val_key_cmp(const void *key, const struct rb_node *node)
+{
+ const u64 *val = key;
+ const struct ulist_node *unode = rb_entry(node, struct ulist_node, rb_node);
+
+ if (unode->val < *val)
+ return 1;
+ else if (unode->val > *val)
+ return -1;
+
+ return 0;
+}
+
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
- struct rb_node *n = ulist->root.rb_node;
- struct ulist_node *u = NULL;
-
- while (n) {
- u = rb_entry(n, struct ulist_node, rb_node);
- if (u->val < val)
- n = n->rb_right;
- else if (u->val > val)
- n = n->rb_left;
- else
- return u;
- }
- return NULL;
+ struct rb_node *node;
+
+ node = rb_find(&val, &ulist->root, ulist_node_val_key_cmp);
+ return rb_entry_safe(node, struct ulist_node, rb_node);
}
static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
@@ -155,25 +159,20 @@ static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node)
ulist->nnodes--;
}
+static int ulist_node_val_cmp(struct rb_node *new, const struct rb_node *existing)
+{
+ const struct ulist_node *unode = rb_entry(new, struct ulist_node, rb_node);
+
+ return ulist_node_val_key_cmp(&unode->val, existing);
+}
+
static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins)
{
- struct rb_node **p = &ulist->root.rb_node;
- struct rb_node *parent = NULL;
- struct ulist_node *cur = NULL;
-
- while (*p) {
- parent = *p;
- cur = rb_entry(parent, struct ulist_node, rb_node);
-
- if (cur->val < ins->val)
- p = &(*p)->rb_right;
- else if (cur->val > ins->val)
- p = &(*p)->rb_left;
- else
- return -EEXIST;
- }
- rb_link_node(&ins->rb_node, parent, p);
- rb_insert_color(&ins->rb_node, &ulist->root);
+ struct rb_node *node;
+
+ node = rb_find_add(&ins->rb_node, &ulist->root, ulist_node_val_cmp);
+ if (node)
+ return -EEXIST;
return 0;
}
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index e97ad824ae16..b7a96a005487 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode)
goto out;
}
inode->ro_flags &= ~BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc,
goto out;
}
inode->ro_flags |= BTRFS_INODE_RO_VERITY;
- btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
+ btrfs_sync_inode_flags_to_i_flags(inode);
ret = btrfs_update_inode(trans, inode);
if (ret)
goto end_trans;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0a0776489055..fa7a929a0461 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -18,7 +18,6 @@
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
-#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
@@ -214,10 +213,8 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
u64 flags = bg_flags;
u32 size_bp = size_buf;
- if (!flags) {
- strcpy(bp, "NONE");
+ if (!flags)
return;
- }
#define DESCRIBE_FLAG(flag, desc) \
do { \
@@ -403,8 +400,12 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
static void btrfs_free_device(struct btrfs_device *device)
{
WARN_ON(!list_empty(&device->post_commit_list));
- rcu_string_free(device->name);
- extent_io_tree_release(&device->alloc_state);
+ /*
+ * No need to call kfree_rcu() nor do RCU lock/unlock, nothing is
+ * reading the device name.
+ */
+ kfree(rcu_dereference_raw(device->name));
+ btrfs_extent_io_tree_release(&device->alloc_state);
btrfs_destroy_dev_zone_info(device);
kfree(device);
}
@@ -414,9 +415,10 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
struct btrfs_device *device;
WARN_ON(fs_devices->opened);
+ WARN_ON(fs_devices->holding);
while (!list_empty(&fs_devices->devices)) {
- device = list_entry(fs_devices->devices.next,
- struct btrfs_device, dev_list);
+ device = list_first_entry(&fs_devices->devices,
+ struct btrfs_device, dev_list);
list_del(&device->dev_list);
btrfs_free_device(device);
}
@@ -428,8 +430,8 @@ void __exit btrfs_cleanup_fs_uuids(void)
struct btrfs_fs_devices *fs_devices;
while (!list_empty(&fs_uuids)) {
- fs_devices = list_entry(fs_uuids.next,
- struct btrfs_fs_devices, fs_list);
+ fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices,
+ fs_list);
list_del(&fs_devices->fs_list);
free_fs_devices(fs_devices);
}
@@ -473,7 +475,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
struct block_device *bdev;
int ret;
- *bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
+ *bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
if (IS_ERR(*bdev_file)) {
ret = PTR_ERR(*bdev_file);
@@ -488,15 +490,15 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
if (holder) {
ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (ret) {
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
}
invalidate_bdev(bdev);
- *disk_super = btrfs_read_dev_super(bdev);
+ *disk_super = btrfs_read_disk_super(bdev, 0, false);
if (IS_ERR(*disk_super)) {
ret = PTR_ERR(*disk_super);
- fput(*bdev_file);
+ bdev_fput(*bdev_file);
goto error;
}
@@ -541,7 +543,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device
continue;
if (devt && devt != device->devt)
continue;
- if (fs_devices->opened) {
+ if (fs_devices->opened || fs_devices->holding) {
if (devt)
ret = -EBUSY;
break;
@@ -657,7 +659,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (!device->name)
return -EINVAL;
- ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+ ret = btrfs_get_bdev_and_sb(rcu_dereference_raw(device->name), flags, holder, 1,
&bdev_file, &disk_super);
if (ret)
return ret;
@@ -674,8 +676,8 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
if (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
- pr_err(
- "BTRFS: Invalid seeding and uuid-changed device detected\n");
+ btrfs_err(NULL,
+ "invalid seeding and uuid-changed device detected");
goto error_free_page;
}
@@ -701,7 +703,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
if (device->devt != device->bdev->bd_dev) {
btrfs_warn(NULL,
"device %s maj:min changed from %d:%d to %d:%d",
- device->name->str, MAJOR(device->devt),
+ rcu_dereference_raw(device->name), MAJOR(device->devt),
MINOR(device->devt), MAJOR(device->bdev->bd_dev),
MINOR(device->bdev->bd_dev));
@@ -720,7 +722,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
error_free_page:
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return -EINVAL;
}
@@ -733,82 +735,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}
-/*
- * We can have very weird soft links passed in.
- * One example is "/proc/self/fd/<fd>", which can be a soft link to
- * a block device.
- *
- * But it's never a good idea to use those weird names.
- * Here we check if the path (not following symlinks) is a good one inside
- * "/dev/".
- */
-static bool is_good_dev_path(const char *dev_path)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- bool is_good = false;
- int ret;
-
- if (!dev_path)
- goto out;
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf)
- goto out;
-
- /*
- * Do not follow soft link, just check if the original path is inside
- * "/dev/".
- */
- ret = kern_path(dev_path, 0, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path))
- goto out;
- if (strncmp(resolved_path, "/dev/", strlen("/dev/")))
- goto out;
- is_good = true;
-out:
- kfree(path_buf);
- path_put(&path);
- return is_good;
-}
-
-static int get_canonical_dev_path(const char *dev_path, char *canonical)
-{
- struct path path = { .mnt = NULL, .dentry = NULL };
- char *path_buf = NULL;
- char *resolved_path;
- int ret;
-
- if (!dev_path) {
- ret = -EINVAL;
- goto out;
- }
-
- path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!path_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
- if (ret)
- goto out;
- resolved_path = d_path(&path, path_buf, PATH_MAX);
- if (IS_ERR(resolved_path)) {
- ret = PTR_ERR(resolved_path);
- goto out;
- }
- ret = strscpy(canonical, resolved_path, PATH_MAX);
-out:
- kfree(path_buf);
- path_put(&path);
- return ret;
-}
-
static bool is_same_device(struct btrfs_device *device, const char *new_path)
{
struct path old = { .mnt = NULL, .dentry = NULL };
@@ -825,7 +751,7 @@ static bool is_same_device(struct btrfs_device *device, const char *new_path)
goto out;
rcu_read_lock();
- ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ ret = strscpy(old_path, rcu_dereference(device->name), PATH_MAX);
rcu_read_unlock();
if (ret < 0)
goto out;
@@ -858,11 +784,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
{
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = NULL;
- struct rcu_string *name;
+ const char *name;
u64 found_transid = btrfs_super_generation(disk_super);
u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_t path_devt;
- int error;
+ int ret;
bool same_fsid_diff_dev = false;
bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
@@ -874,11 +800,11 @@ static noinline struct btrfs_device *device_list_add(const char *path,
return ERR_PTR(-EAGAIN);
}
- error = lookup_bdev(path, &path_devt);
- if (error) {
+ ret = lookup_bdev(path, &path_devt);
+ if (ret) {
btrfs_err(NULL, "failed to lookup block device for path %s: %d",
- path, error);
- return ERR_PTR(error);
+ path, ret);
+ return ERR_PTR(ret);
}
fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);
@@ -895,7 +821,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (same_fsid_diff_dev) {
generate_random_uuid(fs_devices->fsid);
fs_devices->temp_fsid = true;
- pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
+ btrfs_info(NULL, "device %s (%d:%d) using temp-fsid %pU",
path, MAJOR(path_devt), MINOR(path_devt),
fs_devices->fsid);
}
@@ -966,6 +892,8 @@ static noinline struct btrfs_device *device_list_add(const char *path,
current->comm, task_pid_nr(current));
} else if (!device->name || !is_same_device(device, path)) {
+ const char *old_name;
+
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -1019,27 +947,31 @@ static noinline struct btrfs_device *device_list_add(const char *path,
if (device->bdev) {
if (device->devt != path_devt) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_warn_in_rcu(NULL,
+ btrfs_warn(NULL,
"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
path, devid, found_transid,
current->comm,
task_pid_nr(current));
return ERR_PTR(-EEXIST);
}
- btrfs_info_in_rcu(NULL,
+ btrfs_info(NULL,
"devid %llu device path %s changed to %s scanned by %s (%d)",
devid, btrfs_dev_name(device),
path, current->comm,
task_pid_nr(current));
}
- name = rcu_string_strdup(path, GFP_NOFS);
+ name = kstrdup(path, GFP_NOFS);
if (!name) {
mutex_unlock(&fs_devices->device_list_mutex);
return ERR_PTR(-ENOMEM);
}
- rcu_string_free(device->name);
+ rcu_read_lock();
+ old_name = rcu_dereference(device->name);
+ rcu_read_unlock();
rcu_assign_pointer(device->name, name);
+ kfree_rcu_mightsleep(old_name);
+
if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
fs_devices->missing_devices--;
clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
@@ -1088,7 +1020,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* uuid mutex so nothing we touch in here is going to disappear.
*/
if (orig_dev->name)
- dev_path = orig_dev->name->str;
+ dev_path = rcu_dereference_raw(orig_dev->name);
device = btrfs_alloc_device(NULL, &orig_dev->devid,
orig_dev->uuid, dev_path);
@@ -1146,7 +1078,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
continue;
if (device->bdev_file) {
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
device->bdev = NULL;
device->bdev_file = NULL;
fs_devices->open_devices--;
@@ -1193,7 +1125,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
invalidate_bdev(device->bdev);
}
- fput(device->bdev_file);
+ bdev_fput(device->bdev_file);
}
static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1225,7 +1157,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
device->fs_info = NULL;
atomic_set(&device->dev_stats_ccnt, 0);
- extent_io_tree_release(&device->alloc_state);
+ btrfs_extent_io_tree_release(&device->alloc_state);
/*
* Reset the flush error record. We might have a transient flush error
@@ -1273,7 +1205,7 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&uuid_mutex);
close_fs_devices(fs_devices);
- if (!fs_devices->opened) {
+ if (!fs_devices->opened && !fs_devices->holding) {
list_splice_init(&fs_devices->seed_list, &list);
/*
@@ -1401,48 +1333,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super)
put_page(page);
}
-static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
- u64 bytenr, u64 bytenr_orig)
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache)
{
- struct btrfs_super_block *disk_super;
+ struct btrfs_super_block *super;
struct page *page;
- void *p;
- pgoff_t index;
+ u64 bytenr, bytenr_orig;
+ struct address_space *mapping = bdev->bd_mapping;
+ int ret;
- /* make sure our super fits in the device */
- if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
- return ERR_PTR(-EINVAL);
+ bytenr_orig = btrfs_sb_offset(copy_num);
+ ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = -EINVAL;
+ return ERR_PTR(ret);
+ }
- /* make sure our super fits in the page */
- if (sizeof(*disk_super) > PAGE_SIZE)
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
return ERR_PTR(-EINVAL);
- /* make sure our super doesn't straddle pages on disk */
- index = bytenr >> PAGE_SHIFT;
- if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
- return ERR_PTR(-EINVAL);
+ if (drop_cache) {
+ /* This should only be called with the primary sb. */
+ ASSERT(copy_num == 0);
- /* pull in the page with our super */
- page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);
+ /*
+ * Drop the page of the primary superblock, so later read will
+ * always read from the device.
+ */
+ invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT,
+ (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
+ }
+ page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
if (IS_ERR(page))
return ERR_CAST(page);
- p = page_address(page);
-
- /* align our pointer to the offset of the super block */
- disk_super = p + offset_in_page(bytenr);
-
- if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
- btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- btrfs_release_disk_super(p);
+ super = page_address(page);
+ if (btrfs_super_magic(super) != BTRFS_MAGIC ||
+ btrfs_super_bytenr(super) != bytenr_orig) {
+ btrfs_release_disk_super(super);
return ERR_PTR(-EINVAL);
}
- if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
+ /*
+ * Make sure the last byte of label is properly NUL termiated. We use
+ * '%s' to print the label, if not properly NUL termiated we can access
+ * beyond the label.
+ */
+ if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1])
+ super->label[BTRFS_LABEL_SIZE - 1] = 0;
- return disk_super;
+ return super;
}
int btrfs_forget_devices(dev_t devt)
@@ -1480,7 +1422,7 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->bdev && (device->bdev->bd_dev == devt) &&
- strcmp(device->name->str, path) != 0) {
+ strcmp(rcu_dereference_raw(device->name), path) != 0) {
mutex_unlock(&fs_devices->device_list_mutex);
/* Do not skip registration. */
@@ -1506,30 +1448,17 @@ static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
* the device or return an error. Multi-device and seeding devices are registered
* in both cases.
*/
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
+struct btrfs_device *btrfs_scan_one_device(const char *path,
bool mount_arg_dev)
{
struct btrfs_super_block *disk_super;
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
- char *canonical_path = NULL;
- u64 bytenr;
dev_t devt;
- int ret;
lockdep_assert_held(&uuid_mutex);
- if (!is_good_dev_path(path)) {
- canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
- if (canonical_path) {
- ret = get_canonical_dev_path(path, canonical_path);
- if (ret < 0) {
- kfree(canonical_path);
- canonical_path = NULL;
- }
- }
- }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1540,24 +1469,11 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
- bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+ bdev_file = bdev_file_open_by_path(path, BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(bdev_file))
return ERR_CAST(bdev_file);
- /*
- * We would like to check all the super blocks, but doing so would
- * allow a mount to succeed after a mkfs from a different filesystem.
- * Currently, recovery from a bad primary btrfs superblock is done
- * using the userspace command 'btrfs check --super'.
- */
- ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
- if (ret) {
- device = ERR_PTR(ret);
- goto error_bdev_put;
- }
-
- disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
- btrfs_sb_offset(0));
+ disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false);
if (IS_ERR(disk_super)) {
device = ERR_CAST(disk_super);
goto error_bdev_put;
@@ -1565,7 +1481,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
devt = file_bdev(bdev_file)->bd_dev;
if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
- pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+ btrfs_debug(NULL, "skip registering single non-seed device %s (%d:%d)",
path, MAJOR(devt), MINOR(devt));
btrfs_free_stale_devices(devt, NULL);
@@ -1574,8 +1490,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(canonical_path ? : path, disk_super,
- &new_device_added);
+ device = device_list_add(path, disk_super, &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1583,8 +1498,7 @@ free_disk_super:
btrfs_release_disk_super(disk_super);
error_bdev_put:
- fput(bdev_file);
- kfree(canonical_path);
+ bdev_fput(bdev_file);
return device;
}
@@ -1600,9 +1514,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
lockdep_assert_held(&device->fs_info->chunk_mutex);
- if (find_first_extent_bit(&device->alloc_state, *start,
- &physical_start, &physical_end,
- CHUNK_ALLOCATED, NULL)) {
+ if (btrfs_find_first_extent_bit(&device->alloc_state, *start,
+ &physical_start, &physical_end,
+ CHUNK_ALLOCATED, NULL)) {
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
@@ -1617,6 +1531,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
static u64 dev_extent_search_start(struct btrfs_device *device)
{
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return BTRFS_DEVICE_RANGE_RESERVED;
case BTRFS_CHUNK_ALLOC_ZONED:
@@ -1626,8 +1543,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device)
* for superblock logging.
*/
return 0;
- default:
- BUG();
}
}
@@ -1640,7 +1555,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
int ret;
bool changed = false;
- ASSERT(IS_ALIGNED(*hole_start, zone_size));
+ ASSERT(IS_ALIGNED(*hole_start, zone_size),
+ "hole_start=%llu zone_size=%llu", *hole_start, zone_size);
while (*hole_size > 0) {
pos = btrfs_find_allocatable_zones(device, *hole_start,
@@ -1706,6 +1622,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
}
switch (device->fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
/* No extra check */
break;
@@ -1720,8 +1639,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
continue;
}
break;
- default:
- BUG();
}
break;
@@ -1798,8 +1715,8 @@ again:
path->skip_locking = 1;
key.objectid = device->devid;
- key.offset = search_start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = search_start;
ret = btrfs_search_backwards(root, &key, path);
if (ret < 0)
@@ -1891,7 +1808,9 @@ next:
else
ret = 0;
- ASSERT(max_hole_start + max_hole_size <= search_end);
+ ASSERT(max_hole_start + max_hole_size <= search_end,
+ "max_hole_start=%llu max_hole_size=%llu search_end=%llu",
+ max_hole_start, max_hole_size, search_end);
out:
btrfs_free_path(path);
*start = max_hole_start;
@@ -1918,8 +1837,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
key.objectid = device->devid;
- key.offset = start;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = start;
again:
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
@@ -2204,7 +2123,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
down_read(&fs_info->dev_replace.rwsem);
if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- ASSERT(num_devices > 1);
+ ASSERT(num_devices > 1, "num_devices=%llu", num_devices);
num_devices--;
}
up_read(&fs_info->dev_replace.rwsem);
@@ -2220,7 +2139,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
const u64 bytenr = btrfs_sb_offset(copy_num);
int ret;
- disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
+ disk_super = btrfs_read_disk_super(bdev, copy_num, false);
if (IS_ERR(disk_super))
return;
@@ -2253,7 +2172,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
/* Update ctime/mtime for device path for libblkid */
- update_dev_time(device->name->str);
+ update_dev_time(rcu_dereference_raw(device->name));
}
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
@@ -2293,7 +2212,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
}
if (btrfs_pinned_by_swapfile(fs_info, device)) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"cannot remove device %s (devid %llu) due to active swapfile",
btrfs_dev_name(device), device->devid);
return -ETXTBSY;
@@ -2383,7 +2302,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
* free the device.
*
* We cannot call btrfs_close_bdev() here because we're holding the sb
- * write lock, and fput() on the block device will pull in the
+ * write lock, and bdev_fput() on the block device will pull in the
* ->open_mutex on the block device and it's dependencies. Instead
* just flush the device and let the caller do the final bdev_release.
*/
@@ -2408,7 +2327,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
*/
if (cur_devices->num_devices == 0) {
list_del_init(&cur_devices->seed_list);
- ASSERT(cur_devices->opened == 1);
+ ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened);
cur_devices->opened--;
free_fs_devices(cur_devices);
}
@@ -2562,7 +2481,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
else
memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
btrfs_release_disk_super(disk_super);
- fput(bdev_file);
+ bdev_fput(bdev_file);
return 0;
}
@@ -2721,8 +2640,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
return -ENOMEM;
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = BTRFS_DEV_ITEM_KEY;
+ key.offset = 0;
while (1) {
btrfs_reserve_chunk_metadata(trans, false);
@@ -2794,7 +2713,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return -EROFS;
bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
- fs_info->bdev_holder, NULL);
+ fs_info->sb, &fs_holder_ops);
if (IS_ERR(bdev_file))
return PTR_ERR(bdev_file);
@@ -3010,7 +2929,7 @@ error_free_zone:
error_free_device:
btrfs_free_device(device);
error:
- fput(bdev_file);
+ bdev_fput(bdev_file);
if (locked) {
mutex_unlock(&uuid_mutex);
up_write(&sb->s_umount);
@@ -3119,8 +3038,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
return -ENOMEM;
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = chunk_offset;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk_offset;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
@@ -3338,7 +3257,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
- ASSERT(0);
+ DEBUG_WARN("errr %ld reading chunk map at offset %llu",
+ PTR_ERR(map), chunk_offset);
return PTR_ERR(map);
}
@@ -3370,6 +3290,12 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
device->bytes_used - dev_extent_len);
atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
btrfs_clear_space_info_full(fs_info);
+
+ if (list_empty(&device->post_commit_list)) {
+ list_add_tail(&device->post_commit_list,
+ &trans->transaction->dev_update_list);
+ }
+
mutex_unlock(&fs_info->chunk_mutex);
}
}
@@ -3419,8 +3345,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (ret == -ENOSPC) {
const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *space_info;
+
+ space_info = btrfs_find_space_info(fs_info, sys_flags);
+ if (!space_info) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
- sys_bg = btrfs_create_chunk(trans, sys_flags);
+ sys_bg = btrfs_create_chunk(trans, space_info, sys_flags);
if (IS_ERR(sys_bg)) {
ret = PTR_ERR(sys_bg);
btrfs_abort_transaction(trans, ret);
@@ -3478,7 +3412,8 @@ out:
return ret;
}
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
@@ -3508,7 +3443,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
- ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
btrfs_scrub_continue(fs_info);
if (ret) {
/*
@@ -3577,8 +3512,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -3618,7 +3553,8 @@ again:
btrfs_release_path(path);
if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset,
+ true);
if (ret == -ENOSPC)
failed++;
else
@@ -3880,26 +3816,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info)
* Balance filters. Return 1 if chunk should be filtered out
* (should not be balanced).
*/
-static int chunk_profiles_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->profiles & chunk_type)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used;
u64 user_thresh_min;
u64 user_thresh_max;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3917,18 +3852,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
user_thresh_max = mult_perc(cache->length, bargs->usage_max);
if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
- u64 chunk_offset, struct btrfs_balance_args *bargs)
+static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_block_group *cache;
u64 chunk_used, user_thresh;
- int ret = 1;
+ bool ret = true;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = cache->used;
@@ -3941,15 +3876,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
user_thresh = mult_perc(cache->length, bargs->usage);
if (chunk_used < user_thresh)
- ret = 0;
+ ret = false;
btrfs_put_block_group(cache);
return ret;
}
-static int chunk_devid_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3958,10 +3892,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf,
for (i = 0; i < num_stripes; i++) {
stripe = btrfs_stripe_nr(chunk, i);
if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static u64 calc_data_stripes(u64 type, int num_stripes)
@@ -3974,9 +3908,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes)
}
/* [pstart, pend) */
-static int chunk_drange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
struct btrfs_stripe *stripe;
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3987,7 +3920,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
int i;
if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
- return 0;
+ return false;
type = btrfs_chunk_type(leaf, chunk);
factor = calc_data_stripes(type, num_stripes);
@@ -4003,56 +3936,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf,
if (stripe_offset < bargs->pend &&
stripe_offset + stripe_length > bargs->pstart)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
/* [vstart, vend) */
-static int chunk_vrange_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- u64 chunk_offset,
- struct btrfs_balance_args *bargs)
+static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
{
if (chunk_offset < bargs->vend &&
chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
/* at least part of the chunk is inside this vrange */
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_stripes_range_filter(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk,
- struct btrfs_balance_args *bargs)
+static bool chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
{
int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
if (bargs->stripes_min <= num_stripes
&& num_stripes <= bargs->stripes_max)
- return 0;
+ return false;
- return 1;
+ return true;
}
-static int chunk_soft_convert_filter(u64 chunk_type,
- struct btrfs_balance_args *bargs)
+static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs)
{
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
- return 0;
+ return false;
chunk_type = chunk_to_extended(chunk_type) &
BTRFS_EXTENDED_PROFILE_MASK;
if (bargs->target == chunk_type)
- return 1;
+ return true;
- return 0;
+ return false;
}
-static int should_balance_chunk(struct extent_buffer *leaf,
- struct btrfs_chunk *chunk, u64 chunk_offset)
+static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk,
+ u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4062,7 +3992,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
- return 0;
+ return false;
}
if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
@@ -4075,46 +4005,46 @@ static int should_balance_chunk(struct extent_buffer *leaf,
/* profiles filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
chunk_profiles_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/* usage filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
chunk_devid_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* drange filter, makes sense only with devid filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
chunk_drange_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* vrange filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
- return 0;
+ return false;
}
/* stripes filter */
if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
chunk_stripes_range_filter(leaf, chunk, bargs)) {
- return 0;
+ return false;
}
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
- return 0;
+ return false;
}
/*
@@ -4122,7 +4052,7 @@ static int should_balance_chunk(struct extent_buffer *leaf,
*/
if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
if (bargs->limit == 0)
- return 0;
+ return false;
else
bargs->limit--;
} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
@@ -4132,12 +4062,12 @@ static int should_balance_chunk(struct extent_buffer *leaf,
* about the count of all chunks that satisfy the filters.
*/
if (bargs->limit_max == 0)
- return 0;
+ return false;
else
bargs->limit_max--;
}
- return 1;
+ return true;
}
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
@@ -4184,8 +4114,8 @@ again:
bctl->sys.limit = limit_sys;
}
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = (u64)-1;
while (1) {
if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
@@ -4289,7 +4219,7 @@ again:
}
}
- ret = btrfs_relocate_chunk(fs_info, found_key.offset);
+ ret = btrfs_relocate_chunk(fs_info, found_key.offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
@@ -4752,7 +4682,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
spin_lock(&fs_info->super_lock);
- ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
+ ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED,
+ "exclusive_operation=%d", fs_info->exclusive_operation);
fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
spin_unlock(&fs_info->super_lock);
/*
@@ -5001,8 +4932,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
again:
key.objectid = device->devid;
- key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
+ key.offset = (u64)-1;
do {
mutex_lock(&fs_info->reclaim_bgs_lock);
@@ -5056,7 +4987,7 @@ again:
goto done;
}
- ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+ ret = btrfs_relocate_chunk(fs_info, chunk_offset, true);
mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
@@ -5088,8 +5019,8 @@ again:
mutex_lock(&fs_info->chunk_mutex);
/* Clear all state bits beyond the shrunk device size */
- clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
- CHUNK_STATE_MASK);
+ btrfs_clear_extent_bit(&device->alloc_state, new_size, (u64)-1,
+ CHUNK_STATE_MASK, NULL);
btrfs_device_set_disk_total_bytes(device, new_size);
if (list_empty(&device->post_commit_list))
@@ -5216,6 +5147,8 @@ struct alloc_chunk_ctl {
u64 stripe_size;
u64 chunk_size;
int ndevs;
+ /* Space_info the block group is going to belong. */
+ struct btrfs_space_info *space_info;
};
static void init_alloc_chunk_ctl_policy_regular(
@@ -5289,14 +5222,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = 0;
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
break;
case BTRFS_CHUNK_ALLOC_ZONED:
init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
break;
- default:
- BUG();
}
}
@@ -5435,7 +5369,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
* It should hold because:
* dev_extent_min == dev_extent_want == zone_size * dev_stripes
*/
- ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
+ ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min,
+ "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs,
+ devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min);
ctl->stripe_size = zone_size;
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
@@ -5448,7 +5384,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->dev_stripes);
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
+ ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size,
+ "stripe_size=%llu data_stripes=%d max_chunk_size=%llu",
+ ctl->stripe_size, data_stripes, ctl->max_chunk_size);
}
ctl->chunk_size = ctl->stripe_size * data_stripes;
@@ -5481,12 +5419,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
switch (fs_devices->chunk_alloc_policy) {
+ default:
+ btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy);
+ fallthrough;
case BTRFS_CHUNK_ALLOC_REGULAR:
return decide_stripe_size_regular(ctl, devices_info);
case BTRFS_CHUNK_ALLOC_ZONED:
return decide_stripe_size_zoned(ctl, devices_info);
- default:
- BUG();
}
}
@@ -5496,9 +5435,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
+ btrfs_set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -5508,10 +5447,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in
struct btrfs_io_stripe *stripe = &map->stripes[i];
struct btrfs_device *device = stripe->dev;
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + map->stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
+ btrfs_clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
}
}
@@ -5618,7 +5556,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
+ block_group = btrfs_make_block_group(trans, ctl->space_info, type, start,
+ ctl->chunk_size);
if (IS_ERR(block_group)) {
btrfs_remove_chunk_map(info, map);
return block_group;
@@ -5644,7 +5583,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type)
+ struct btrfs_space_info *space_info,
+ u64 type)
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -5656,7 +5596,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
lockdep_assert_held(&info->chunk_mutex);
if (!alloc_profile_is_valid(type, 0)) {
- ASSERT(0);
+ DEBUG_WARN("invalid alloc profile for type %llu", type);
return ERR_PTR(-EINVAL);
}
@@ -5668,12 +5608,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(info, "invalid chunk type 0x%llx requested", type);
- ASSERT(0);
+ DEBUG_WARN();
return ERR_PTR(-EINVAL);
}
ctl.start = find_next_chunk(info);
ctl.type = type;
+ ctl.space_info = space_info;
init_alloc_chunk_ctl(fs_devices, &ctl);
devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
@@ -5817,7 +5758,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
struct btrfs_fs_info *fs_info = trans->fs_info;
u64 alloc_profile;
struct btrfs_block_group *meta_bg;
+ struct btrfs_space_info *meta_space_info;
struct btrfs_block_group *sys_bg;
+ struct btrfs_space_info *sys_space_info;
/*
* When adding a new device for sprouting, the seed device is read-only
@@ -5841,12 +5784,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
*/
alloc_profile = btrfs_metadata_alloc_profile(fs_info);
- meta_bg = btrfs_create_chunk(trans, alloc_profile);
+ meta_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!meta_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile);
if (IS_ERR(meta_bg))
return PTR_ERR(meta_bg);
alloc_profile = btrfs_system_alloc_profile(fs_info);
- sys_bg = btrfs_create_chunk(trans, alloc_profile);
+ sys_space_info = btrfs_find_space_info(fs_info, alloc_profile);
+ if (!sys_space_info) {
+ DEBUG_WARN();
+ return -EINVAL;
+ }
+ sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile);
if (IS_ERR(sys_bg))
return PTR_ERR(sys_bg);
@@ -6046,7 +5999,7 @@ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_s
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
- int dev_replace_is_ongoing)
+ bool dev_replace_is_ongoing)
{
const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
int i;
@@ -6055,8 +6008,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
int tolerance;
struct btrfs_device *srcdev;
- ASSERT((map->type &
- (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
+ ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)),
+ "type=%llu", map->type);
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
num_stripes = map->sub_stripes;
@@ -6357,7 +6310,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
}
/* We can only have at most 2 extra nr_stripes (for DUP). */
- ASSERT(nr_extra_stripes <= 2);
+ ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes);
/*
* For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
* replace.
@@ -6368,7 +6321,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];
/* Only DUP can have two extra stripes. */
- ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);
+ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP,
+ "map_type=%llu", bioc->map_type);
/*
* Swap the last stripe stripes and reduce @nr_extra_stripes.
@@ -6395,7 +6349,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
*/
io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(io_geom->stripe_offset < U32_MAX);
+ ASSERT(io_geom->stripe_offset < U32_MAX,
+ "stripe_offset=%llu", io_geom->stripe_offset);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6413,8 +6368,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
- ASSERT(io_geom->raid56_full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset,
+ "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu",
+ io_geom->raid56_full_stripe_start, full_stripe_len, offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset,
+ "raid56_full_stripe_start=%llu offset=%llu",
+ io_geom->raid56_full_stripe_start, offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
@@ -6580,7 +6539,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
{
int data_stripes = nr_data_stripes(map);
- ASSERT(io_geom->mirror_num <= 1);
+ ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num);
/* Just grab the data stripe directly. */
io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
io_geom->stripe_nr /= data_stripes;
@@ -6648,7 +6607,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
int num_copies;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
- int dev_replace_is_ongoing = 0;
+ bool dev_replace_is_ongoing = false;
u16 num_alloc_stripes;
u64 max_len;
@@ -6953,7 +6912,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
atomic_set(&dev->dev_stats_ccnt, 0);
btrfs_device_data_ordered_init(dev);
- extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
+ btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);
if (devid)
tmp = *devid;
@@ -6974,9 +6933,9 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
generate_random_uuid(dev->uuid);
if (path) {
- struct rcu_string *name;
+ const char *name;
- name = rcu_string_strdup(path, GFP_KERNEL);
+ name = kstrdup(path, GFP_KERNEL);
if (!name) {
btrfs_free_device(dev);
return ERR_PTR(-ENOMEM);
@@ -7155,6 +7114,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
map->start, map->chunk_len, ret);
+ btrfs_free_chunk_map(map);
}
return ret;
@@ -7200,8 +7160,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
fs_devices = find_fsid(fsid, NULL);
if (!fs_devices) {
- if (!btrfs_test_opt(fs_info, DEGRADED))
+ if (!btrfs_test_opt(fs_info, DEGRADED)) {
+ btrfs_err(fs_info,
+ "failed to find fsid %pU when attempting to open seed devices",
+ fsid);
return ERR_PTR(-ENOENT);
+ }
fs_devices = alloc_fs_devices(fsid);
if (IS_ERR(fs_devices))
@@ -7220,7 +7184,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
if (IS_ERR(fs_devices))
return fs_devices;
- ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
+ ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->sb);
if (ret) {
free_fs_devices(fs_devices);
return ERR_PTR(ret);
@@ -7534,8 +7498,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
*/
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
- key.offset = 0;
key.type = 0;
+ key.offset = 0;
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *node = path->nodes[1];
@@ -7752,7 +7716,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
return -ENOMEM;
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"error %d while searching for dev_stats item for device %s",
ret, btrfs_dev_name(device));
goto out;
@@ -7763,7 +7727,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"delete too small dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
goto out;
@@ -7777,7 +7741,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- btrfs_warn_in_rcu(fs_info,
+ btrfs_warn(fs_info,
"insert dev_stats item for device %s failed %d",
btrfs_dev_name(device), ret);
goto out;
@@ -7840,7 +7804,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
if (!dev->dev_stats_valid)
return;
- btrfs_err_rl_in_rcu(dev->fs_info,
+ btrfs_err_rl(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7860,7 +7824,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- btrfs_info_in_rcu(dev->fs_info,
+ btrfs_info(dev->fs_info,
"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
btrfs_dev_name(dev),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7920,7 +7884,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
- ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
+ ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state);
if (list_empty(&trans->dev_update_list))
return;
@@ -7984,7 +7948,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
/*
- * Very old mkfs.btrfs (before v4.1) will not respect the reserved
+ * Very old mkfs.btrfs (before v4.15) will not respect the reserved
* space. Although kernel can handle it without problem, better to warn
* the users.
*/
@@ -8236,7 +8200,7 @@ static int relocating_repair_kthread(void *data)
btrfs_info(fs_info,
"zoned: relocating block group %llu to repair IO failure",
target);
- ret = btrfs_relocate_chunk(fs_info, target);
+ ret = btrfs_relocate_chunk(fs_info, target, true);
out:
if (cache)
@@ -8289,7 +8253,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc,
logical < stripe_start + BTRFS_STRIPE_LEN)
break;
}
- ASSERT(i < data_stripes);
+ ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes);
smap->dev = bioc->stripes[i].dev;
smap->physical = bioc->stripes[i].physical +
((logical - bioc->full_stripe_logical) &
@@ -8318,7 +8282,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
int mirror_ret = mirror_num;
int ret;
- ASSERT(mirror_num > 0);
+ ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num);
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
&bioc, smap, &mirror_ret);
@@ -8326,7 +8290,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
return ret;
/* The map range should not cross stripe boundary. */
- ASSERT(map_length >= length);
+ ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length);
/* Already mapped to single stripe. */
if (!bioc)
@@ -8338,7 +8302,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
goto out;
}
- ASSERT(mirror_num <= bioc->num_stripes);
+ ASSERT(mirror_num <= bioc->num_stripes,
+ "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes);
smap->dev = bioc->stripes[mirror_num - 1].dev;
smap->physical = bioc->stripes[mirror_num - 1].physical;
out:
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 120f65e21eeb..a56e873a3029 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -7,6 +7,7 @@
#define BTRFS_VOLUMES_H
#include <linux/blk_types.h>
+#include <linux/blkdev.h>
#include <linux/sizes.h>
#include <linux/atomic.h>
#include <linux/sort.h>
@@ -18,14 +19,16 @@
#include <linux/completion.h>
#include <linux/rbtree.h>
#include <uapi/linux/btrfs.h>
+#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
-#include "rcu-string.h"
+#include "extent-io-tree.h"
struct block_device;
struct bdev_handle;
struct btrfs_fs_info;
struct btrfs_block_group;
struct btrfs_trans_handle;
+struct btrfs_transaction;
struct btrfs_zoned_device_info;
#define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
@@ -110,7 +113,8 @@ struct btrfs_device {
struct btrfs_fs_devices *fs_devices;
struct btrfs_fs_info *fs_info;
- struct rcu_string __rcu *name;
+ /* Device path or NULL if missing. */
+ const char __rcu *name;
u64 generation;
@@ -418,6 +422,16 @@ struct btrfs_fs_devices {
/* Count fs-devices opened. */
int opened;
+ /*
+ * Counter of the processes that are holding this fs_devices but not
+ * yet opened.
+ * This is for mounting handling, as we can only open the fs_devices
+ * after a super block is created. But we cannot take uuid_mutex
+ * during sget_fc(), thus we have to hold the fs_devices (meaning it
+ * cannot be released) until a super block is returned.
+ */
+ int holding;
+
/* Set when we find or add a device that doesn't have the nonrot flag set. */
bool rotating;
/* Devices support TRIM/discard commands. */
@@ -469,7 +483,6 @@ struct btrfs_io_stripe {
struct btrfs_device *dev;
/* Block mapping. */
u64 physical;
- u64 length;
bool rst_search_commit_root;
/* For the endio handler. */
struct btrfs_io_context *bioc;
@@ -664,7 +677,7 @@ enum btrfs_map_op {
BTRFS_MAP_GET_READ_MIRRORS,
};
-static inline enum btrfs_map_op btrfs_op(struct bio *bio)
+static inline enum btrfs_map_op btrfs_op(const struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_WRITE:
@@ -711,12 +724,12 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
- u64 type);
+ struct btrfs_space_info *space_info,
+ u64 type);
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
-struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
- bool mount_arg_dev);
+struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev);
int btrfs_forget_devices(dev_t devt);
void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
@@ -750,7 +763,8 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
-int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+ bool verbose);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset);
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
@@ -782,6 +796,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_inf
struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
u64 logical, u64 length);
void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
+ int copy_num, bool drop_cache);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
@@ -840,7 +856,26 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device)
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
- return rcu_str_deref(device->name);
+ return rcu_dereference(device->name);
+}
+
+static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol)
+{
+ WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol);
+}
+
+static inline void btrfs_fs_devices_inc_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding >= 0);
+ fs_devices->holding++;
+}
+
+static inline void btrfs_fs_devices_dec_holding(struct btrfs_fs_devices *fs_devices)
+{
+ lockdep_assert_held(&uuid_mutex);
+ ASSERT(fs_devices->holding > 0);
+ fs_devices->holding--;
}
void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3e0edbcf73e1..79fb1614bd0c 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -510,14 +510,15 @@ static int btrfs_initxattrs(struct inode *inode,
*/
nofs_flag = memalloc_nofs_save();
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
- strlen(xattr->name) + 1, GFP_KERNEL);
+ const size_t name_len = XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1;
+
+ name = kmalloc(name_len, GFP_KERNEL);
if (!name) {
ret = -ENOMEM;
break;
}
- strcpy(name, XATTR_SECURITY_PREFIX);
- strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ scnprintf(name, name_len, "%s%s", XATTR_SECURITY_PREFIX, xattr->name);
if (strcmp(name, XATTR_NAME_CAPS) == 0)
clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 8dc4cf49f6f0..0ce10e4ec836 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -6,6 +6,8 @@
#ifndef BTRFS_XATTR_H
#define BTRFS_XATTR_H
+#include <linux/types.h>
+
struct dentry;
struct inode;
struct qstr;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c9e92c6941ec..5292cd341f70 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -94,6 +94,45 @@ fail:
return ERR_PTR(-ENOMEM);
}
+/*
+ * Helper for S390x with hardware zlib compression support.
+ *
+ * That hardware acceleration requires a buffer size larger than a single page
+ * to get ideal performance, thus we need to do the memory copy rather than
+ * use the page cache directly as input buffer.
+ */
+static int copy_data_into_buffer(struct address_space *mapping,
+ struct workspace *workspace, u64 filepos,
+ unsigned long length)
+{
+ u64 cur = filepos;
+
+ /* It's only for hardware accelerated zlib code. */
+ ASSERT(zlib_deflate_dfltcc_enabled());
+
+ while (cur < filepos + length) {
+ struct folio *folio;
+ void *data_in;
+ unsigned int offset;
+ unsigned long copy_length;
+ int ret;
+
+ ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio);
+ if (ret < 0)
+ return ret;
+
+ offset = offset_in_folio(folio, cur);
+ copy_length = min(folio_size(folio) - offset,
+ filepos + length - cur);
+
+ data_in = kmap_local_folio(folio, offset);
+ memcpy(workspace->buf + cur - filepos, data_in, copy_length);
+ kunmap_local(data_in);
+ cur += copy_length;
+ }
+ return 0;
+}
+
int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
u64 start, struct folio **folios, unsigned long *out_folios,
unsigned long *total_in, unsigned long *total_out)
@@ -105,8 +144,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
int nr_folios = 0;
struct folio *in_folio = NULL;
struct folio *out_folio = NULL;
- unsigned long bytes_left;
- unsigned int in_buf_folios;
unsigned long len = *total_out;
unsigned long nr_dest_folios = *out_folios;
const unsigned long max_out = nr_dest_folios * PAGE_SIZE;
@@ -150,36 +187,22 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
* the workspace buffer if required.
*/
if (workspace->strm.avail_in == 0) {
- bytes_left = len - workspace->strm.total_in;
- in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE),
- workspace->buf_size / PAGE_SIZE);
- if (in_buf_folios > 1) {
- int i;
-
- /* S390 hardware acceleration path, not subpage. */
- ASSERT(!btrfs_is_subpage(
- inode_to_fs_info(mapping->host),
- mapping));
- for (i = 0; i < in_buf_folios; i++) {
- if (data_in) {
- kunmap_local(data_in);
- folio_put(in_folio);
- data_in = NULL;
- }
- ret = btrfs_compress_filemap_get_folio(mapping,
- start, &in_folio);
- if (ret < 0)
- goto out;
- data_in = kmap_local_folio(in_folio, 0);
- copy_page(workspace->buf + i * PAGE_SIZE,
- data_in);
- start += PAGE_SIZE;
- }
+ unsigned long bytes_left = len - workspace->strm.total_in;
+ unsigned int copy_length = min(bytes_left, workspace->buf_size);
+
+ /*
+ * This can only happen when hardware zlib compression is
+ * enabled.
+ */
+ if (copy_length > PAGE_SIZE) {
+ ret = copy_data_into_buffer(mapping, workspace,
+ start, copy_length);
+ if (ret < 0)
+ goto out;
+ start += copy_length;
workspace->strm.next_in = workspace->buf;
- workspace->strm.avail_in = min(bytes_left,
- in_buf_folios << PAGE_SHIFT);
+ workspace->strm.avail_in = copy_length;
} else {
- unsigned int pg_off;
unsigned int cur_len;
if (data_in) {
@@ -191,9 +214,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- data_in = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ data_in = kmap_local_folio(in_folio,
+ offset_in_folio(in_folio, start));
start += cur_len;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len;
@@ -463,6 +486,7 @@ out:
const struct btrfs_compress_op btrfs_zlib_compress = {
.workspace_manager = &wsm,
+ .min_level = 1,
.max_level = 9,
.default_level = BTRFS_ZLIB_DEFAULT_LEVEL,
};
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 73e0aa9fc08a..ea662036f441 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -9,7 +9,6 @@
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
-#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
@@ -17,6 +16,8 @@
#include "fs.h"
#include "accessors.h"
#include "bio.h"
+#include "transaction.h"
+#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@@ -42,6 +43,9 @@
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/* Default number of max active zones when the device has no limits. */
+#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
+
/*
* Minimum of active zones we need:
*
@@ -263,9 +267,9 @@ static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
copy_zone_info_cb, zones);
if (ret < 0) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read zone %llu on %s (devid %llu)",
- pos, rcu_str_deref(device->name),
+ pos, rcu_dereference(device->name),
device->devid);
return ret;
}
@@ -395,16 +399,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
/* We reject devices with a zone size larger than 8GB */
if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu larger than supported maximum %llu",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: zone size %llu smaller than supported minimum %u",
- rcu_str_deref(device->name),
+ rcu_dereference(device->name),
zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
ret = -EINVAL;
goto out;
@@ -416,11 +420,14 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
- max_active_zones = bdev_max_active_zones(bdev);
+ max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
+ bdev_max_open_zones(bdev));
+ if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
+ max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
- rcu_str_deref(device->name), max_active_zones,
+ rcu_dereference(device->name), max_active_zones,
BTRFS_MIN_ACTIVE_ZONES);
ret = -EINVAL;
goto out;
@@ -460,9 +467,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_cache = vcalloc(zone_info->nr_zones,
sizeof(struct blk_zone));
if (!zone_info->zone_cache) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to allocate zone cache for %s",
- rcu_str_deref(device->name));
+ rcu_dereference(device->name));
ret = -ENOMEM;
goto out;
}
@@ -497,9 +504,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
}
if (nreported != zone_info->nr_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"inconsistent number of zones on %s (%u/%u)",
- rcu_str_deref(device->name), nreported,
+ rcu_dereference(device->name), nreported,
zone_info->nr_zones);
ret = -EIO;
goto out;
@@ -507,9 +514,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (max_active_zones) {
if (nactive > max_active_zones) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: %u active zones on %s exceeds max_active_zones %u",
- nactive, rcu_str_deref(device->name),
+ nactive, rcu_dereference(device->name),
max_active_zones);
ret = -EIO;
goto out;
@@ -538,7 +545,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
goto out;
if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: failed to read super block log zone info at devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -556,7 +563,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
ret = sb_write_pointer(device->bdev,
&zone_info->sb_zones[sb_pos], &sb_wp);
if (ret != -ENOENT && ret) {
- btrfs_err_in_rcu(device->fs_info,
+ btrfs_err(device->fs_info,
"zoned: super block log zone corrupted devid %llu zone %u",
device->devid, sb_zone);
ret = -EUCLEAN;
@@ -575,9 +582,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
emulated = "emulated ";
}
- btrfs_info_in_rcu(fs_info,
+ btrfs_info(fs_info,
"%s block device %s, %u %szones of %llu bytes",
- model, rcu_str_deref(device->name), zone_info->nr_zones,
+ model, rcu_dereference(device->name), zone_info->nr_zones,
emulated, zone_info->zone_size);
return 0;
@@ -989,7 +996,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
}
/* All the zones are FULL. Should not reach here. */
- ASSERT(0);
+ DEBUG_WARN("unexpected state, all zones full");
return -EIO;
}
@@ -1182,10 +1189,10 @@ int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
continue;
/* Free regions should be empty */
- btrfs_warn_in_rcu(
+ btrfs_warn(
device->fs_info,
"zoned: resetting device %s (devid %llu) zone %llu for allocation",
- rcu_str_deref(device->name), device->devid, pos >> shift);
+ rcu_dereference(device->name), device->devid, pos >> shift);
WARN_ON_ONCE(1);
ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
@@ -1277,7 +1284,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct btrfs_chunk_map *map)
+ struct btrfs_chunk_map *map, bool new)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device;
@@ -1307,6 +1314,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
return 0;
}
+ ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));
+
/* This zone will be used for allocation, so mark this zone non-empty. */
btrfs_dev_clear_zone_empty(device, info->physical);
@@ -1319,6 +1328,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
* to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
+
+ if (new) {
+ sector_t capacity;
+
+ capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
+ up_read(&dev_replace->rwsem);
+ info->alloc_offset = 0;
+ info->capacity = capacity << SECTOR_SHIFT;
+
+ return 0;
+ }
+
nofs_flag = memalloc_nofs_save();
ret = btrfs_get_dev_zone(device, info->physical, &zone);
memalloc_nofs_restore(nofs_flag);
@@ -1331,9 +1352,9 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
}
if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
- zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
+ zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
device->devid);
up_read(&dev_replace->rwsem);
return -EIO;
@@ -1344,10 +1365,10 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
- btrfs_err_in_rcu(fs_info,
+ btrfs_err(fs_info,
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
(info->physical >> device->zone_info->zone_size_shift),
- rcu_str_deref(device->name), device->devid);
+ rcu_dereference(device->name), device->devid);
info->alloc_offset = WP_MISSING_DEV;
break;
case BLK_ZONE_COND_EMPTY:
@@ -1389,7 +1410,8 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1412,6 +1434,13 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
zone_info[1].physical);
return -EIO;
}
+
+ if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
+ zone_info[0].alloc_offset = last_alloc;
+
+ if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
+ zone_info[1].alloc_offset = last_alloc;
+
if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
btrfs_err(bg->fs_info,
"zoned: write pointer offset mismatch of zones in DUP profile");
@@ -1432,7 +1461,8 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
int i;
@@ -1447,10 +1477,12 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);
for (i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ zone_info[i].alloc_offset = last_alloc;
+
if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
!btrfs_test_opt(fs_info, DEGRADED)) {
btrfs_err(fs_info,
@@ -1480,7 +1512,8 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1491,10 +1524,29 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ u64 stripe_nr, full_stripe_nr;
+ u64 stripe_offset;
+ int stripe_index;
+
+ stripe_nr = div64_u64(last_alloc, map->stripe_size);
+ stripe_offset = stripe_nr * map->stripe_size;
+ full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
+ div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);
+
+ zone_info[i].alloc_offset =
+ full_stripe_nr * map->stripe_size;
+
+ if (stripe_index > i)
+ zone_info[i].alloc_offset += map->stripe_size;
+ else if (stripe_index == i)
+ zone_info[i].alloc_offset +=
+ (last_alloc - stripe_offset);
+ }
+
if (test_bit(0, active) != test_bit(i, active)) {
if (!btrfs_zone_activate(bg))
return -EIO;
@@ -1512,7 +1564,8 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
struct btrfs_chunk_map *map,
struct zone_info *zone_info,
- unsigned long *active)
+ unsigned long *active,
+ u64 last_alloc)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@@ -1523,8 +1576,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
}
for (int i = 0; i < map->num_stripes; i++) {
- if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
- zone_info[i].alloc_offset == WP_CONVENTIONAL)
+ if (zone_info[i].alloc_offset == WP_MISSING_DEV)
continue;
if (test_bit(0, active) != test_bit(i, active)) {
@@ -1535,6 +1587,29 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
}
+ if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
+ u64 stripe_nr, full_stripe_nr;
+ u64 stripe_offset;
+ int stripe_index;
+
+ stripe_nr = div64_u64(last_alloc, map->stripe_size);
+ stripe_offset = stripe_nr * map->stripe_size;
+ full_stripe_nr = div_u64(stripe_nr,
+ map->num_stripes / map->sub_stripes);
+ div_u64_rem(stripe_nr,
+ (map->num_stripes / map->sub_stripes),
+ &stripe_index);
+
+ zone_info[i].alloc_offset =
+ full_stripe_nr * map->stripe_size;
+
+ if (stripe_index > (i / map->sub_stripes))
+ zone_info[i].alloc_offset += map->stripe_size;
+ else if (stripe_index == (i / map->sub_stripes))
+ zone_info[i].alloc_offset +=
+ (last_alloc - stripe_offset);
+ }
+
if ((i % map->sub_stripes) == 0) {
bg->zone_capacity += zone_info[i].capacity;
bg->alloc_offset += zone_info[i].alloc_offset;
@@ -1588,7 +1663,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
for (i = 0; i < map->num_stripes; i++) {
- ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
+ ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
if (ret)
goto out;
@@ -1623,18 +1698,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
break;
case BTRFS_BLOCK_GROUP_DUP:
- ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
+ last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID1:
case BTRFS_BLOCK_GROUP_RAID1C3:
case BTRFS_BLOCK_GROUP_RAID1C4:
- ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid1(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID0:
- ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid0(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID10:
- ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
+ ret = btrfs_load_block_group_raid10(cache, map, zone_info,
+ active, last_alloc);
break;
case BTRFS_BLOCK_GROUP_RAID5:
case BTRFS_BLOCK_GROUP_RAID6:
@@ -1659,7 +1738,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
* stripe.
*/
cache->alloc_offset = cache->zone_capacity;
- ret = 0;
}
out:
@@ -1784,12 +1862,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
ordered->disk_bytenr = logical;
write_lock(&em_tree->lock);
- em = search_extent_mapping(em_tree, ordered->file_offset,
- ordered->num_bytes);
+ em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
+ ordered->num_bytes);
/* The em should be a new COW extent, thus it should not have an offset. */
ASSERT(em->offset == 0);
em->disk_bytenr = logical;
- free_extent_map(em);
+ btrfs_free_extent_map(em);
write_unlock(&em_tree->lock);
}
@@ -1799,8 +1877,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
struct btrfs_ordered_extent *new;
if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
- split_extent_map(ordered->inode, ordered->file_offset,
- ordered->num_bytes, len, logical))
+ btrfs_split_extent_map(ordered->inode, ordered->file_offset,
+ ordered->num_bytes, len, logical))
return false;
new = btrfs_split_ordered_extent(ordered, len);
@@ -2097,10 +2175,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- /* No space left */
- if (btrfs_zoned_bg_is_full(block_group)) {
- ret = false;
- goto out_unlock;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
+ /* The caller should check if the block group is full. */
+ if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
+ ret = false;
+ goto out_unlock;
+ }
+ } else {
+ /* Since it is already written, it should have been active. */
+ WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@@ -2111,6 +2194,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
physical = map->stripes[i].physical;
zinfo = device->zone_info;
+ if (!device->bdev)
+ continue;
+
if (zinfo->max_active_zones == 0)
continue;
@@ -2155,27 +2241,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
- struct radix_tree_iter iter;
struct extent_buffer *eb;
- void __rcu **slot;
+ unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
- radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
- block_group->start >> fs_info->sectorsize_bits) {
- eb = radix_tree_deref_slot(slot);
- if (!eb)
- continue;
- if (radix_tree_deref_retry(eb)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
+ xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
if (eb->start < block_group->start)
continue;
if (eb->start >= end)
break;
-
- slot = radix_tree_iter_resume(slot, &iter);
rcu_read_unlock();
wait_on_extent_buffer_writeback(eb);
rcu_read_lock();
@@ -2183,6 +2257,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
+static int call_zone_finish(struct btrfs_block_group *block_group,
+ struct btrfs_io_stripe *stripe)
+{
+ struct btrfs_device *device = stripe->dev;
+ const u64 physical = stripe->physical;
+ struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ int ret;
+
+ if (!device->bdev)
+ return 0;
+
+ if (zinfo->max_active_zones == 0)
+ return 0;
+
+ if (btrfs_dev_is_sequential(device, physical)) {
+ unsigned int nofs_flags;
+
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
+ physical >> SECTOR_SHIFT,
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
+
+ if (ret)
+ return ret;
+ }
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ zinfo->reserved_active_zones++;
+ btrfs_dev_clear_active_zone(device, physical);
+
+ return 0;
+}
+
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@@ -2267,28 +2375,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_device *device = map->stripes[i].dev;
- const u64 physical = map->stripes[i].physical;
- struct btrfs_zoned_device_info *zinfo = device->zone_info;
- unsigned int nofs_flags;
-
- if (zinfo->max_active_zones == 0)
- continue;
-
- nofs_flags = memalloc_nofs_save();
- ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
- physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT);
- memalloc_nofs_restore(nofs_flags);
+ ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
-
- if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
- zinfo->reserved_active_zones++;
- btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@@ -2325,6 +2417,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
if (!btrfs_is_zoned(fs_info))
return true;
+ if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags))
+ return false;
+
/* Check if there is a device with active zones left */
mutex_lock(&fs_info->chunk_mutex);
spin_lock(&fs_info->zone_active_bgs_lock);
@@ -2417,7 +2512,7 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
/* For the work */
btrfs_get_block_group(bg);
- atomic_inc(&eb->refs);
+ refcount_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
queue_work(system_unbound_wq, &bg->zone_finish_work);
@@ -2433,6 +2528,104 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
spin_unlock(&fs_info->relocation_bg_lock);
}
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
+ struct btrfs_space_info *space_info = data_sinfo;
+ struct btrfs_trans_handle *trans;
+ struct btrfs_block_group *bg;
+ struct list_head *bg_list;
+ u64 alloc_flags;
+ bool first = true;
+ bool did_chunk_alloc = false;
+ int index;
+ int ret;
+
+ if (!btrfs_is_zoned(fs_info))
+ return;
+
+ if (fs_info->data_reloc_bg)
+ return;
+
+ if (sb_rdonly(fs_info->sb))
+ return;
+
+ alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
+ index = btrfs_bg_flags_to_raid_index(alloc_flags);
+
+ /* Scan the data space_info to find empty block groups. Take the second one. */
+again:
+ bg_list = &space_info->block_groups[index];
+ list_for_each_entry(bg, bg_list, list) {
+ if (bg->alloc_offset != 0)
+ continue;
+
+ if (first) {
+ first = false;
+ continue;
+ }
+
+ if (space_info == data_sinfo) {
+ /* Migrate the block group to the data relocation space_info. */
+ struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
+ int factor;
+
+ ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ factor = btrfs_bg_type_to_factor(bg->flags);
+
+ down_write(&space_info->groups_sem);
+ list_del_init(&bg->list);
+ /* We can assume this as we choose the second empty one. */
+ ASSERT(!list_empty(&space_info->block_groups[index]));
+ up_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ space_info->total_bytes -= bg->length;
+ space_info->disk_total -= bg->length * factor;
+ /* There is no allocation ever happened. */
+ ASSERT(bg->used == 0);
+ ASSERT(bg->zone_unusable == 0);
+ /* No super block in a block group on the zoned setup. */
+ ASSERT(bg->bytes_super == 0);
+ spin_unlock(&space_info->lock);
+
+ bg->space_info = reloc_sinfo;
+ if (reloc_sinfo->block_group_kobjs[index] == NULL)
+ btrfs_sysfs_add_block_group_type(bg);
+
+ btrfs_add_bg_to_space_info(fs_info, bg);
+ }
+
+ fs_info->data_reloc_bg = bg->start;
+ set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
+ btrfs_zone_activate(bg);
+
+ return;
+ }
+
+ if (did_chunk_alloc)
+ return;
+
+ trans = btrfs_join_transaction(fs_info->tree_root);
+ if (IS_ERR(trans))
+ return;
+
+ /* Allocate new BG in the data relocation space_info. */
+ space_info = data_sinfo->sub_group[0];
+ ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
+ ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
+ btrfs_end_transaction(trans);
+ if (ret == 1) {
+ /*
+ * We allocated a new block group in the data relocation space_info. We
+ * can take that one.
+ */
+ first = false;
+ did_chunk_alloc = true;
+ goto again;
+ }
+}
+
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
@@ -2455,8 +2648,8 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
+ u64 total = btrfs_super_total_bytes(fs_info->super_copy);
u64 used = 0;
- u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
@@ -2469,7 +2662,6 @@ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
if (!device->bdev)
continue;
- total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -2523,7 +2715,7 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->alloc_offset == 0 ||
- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
+ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
spin_unlock(&block_group->lock);
continue;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 9672bf4c3335..6e11533b8e14 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -88,6 +88,7 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
+void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
@@ -241,6 +242,8 @@ static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
+static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { }
+
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5232b56d5892..ff0292615e1f 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -24,13 +24,14 @@
#include "super.h"
#define ZSTD_BTRFS_MAX_WINDOWLOG 17
-#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG)
+#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG)
#define ZSTD_BTRFS_DEFAULT_LEVEL 3
+#define ZSTD_BTRFS_MIN_LEVEL -15
#define ZSTD_BTRFS_MAX_LEVEL 15
/* 307s to avoid pathologically clashing with transaction commit */
#define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ)
-static zstd_parameters zstd_get_btrfs_parameters(unsigned int level,
+static zstd_parameters zstd_get_btrfs_parameters(int level,
size_t src_len)
{
zstd_parameters params = zstd_get_params(level, src_len);
@@ -45,13 +46,14 @@ struct workspace {
void *mem;
size_t size;
char *buf;
- unsigned int level;
- unsigned int req_level;
+ int level;
+ int req_level;
unsigned long last_used; /* jiffies */
struct list_head list;
struct list_head lru_list;
zstd_in_buffer in_buf;
zstd_out_buffer out_buf;
+ zstd_parameters params;
};
/*
@@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
return container_of(list, struct workspace, list);
}
-void zstd_free_workspace(struct list_head *ws);
-struct list_head *zstd_alloc_workspace(unsigned int level);
+static inline int clip_level(int level)
+{
+ return max(0, level - 1);
+}
/*
* Timer callback to free unused workspaces.
@@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_for_each_prev_safe(pos, next, &wsm.lru_list) {
struct workspace *victim = container_of(pos, struct workspace,
lru_list);
- unsigned int level;
+ int level;
if (time_after(victim->last_used, reclaim_threshold))
break;
@@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
list_del(&victim->list);
zstd_free_workspace(&victim->list);
- if (list_empty(&wsm.idle_ws[level - 1]))
- clear_bit(level - 1, &wsm.active_map);
+ if (list_empty(&wsm.idle_ws[level]))
+ clear_bit(level, &wsm.active_map);
}
@@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
static void zstd_calc_ws_mem_sizes(void)
{
size_t max_size = 0;
- unsigned int level;
+ int level;
- for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) {
+ if (level == 0)
+ continue;
zstd_parameters params =
zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT);
size_t level_size =
@@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void)
zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT));
max_size = max_t(size_t, max_size, level_size);
- zstd_ws_mem_sizes[level - 1] = max_size;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ zstd_ws_mem_sizes[clip_level(level)] = max_size;
}
}
@@ -193,8 +200,7 @@ void zstd_init_workspace_manager(void)
ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL);
if (IS_ERR(ws)) {
- pr_warn(
- "BTRFS: cannot preallocate zstd compression workspace\n");
+ btrfs_warn(NULL, "cannot preallocate zstd compression workspace");
} else {
set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map);
list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]);
@@ -218,7 +224,7 @@ void zstd_cleanup_workspace_manager(void)
}
spin_unlock_bh(&wsm.lock);
- del_timer_sync(&wsm.timer);
+ timer_delete_sync(&wsm.timer);
}
/*
@@ -233,11 +239,11 @@ void zstd_cleanup_workspace_manager(void)
* offer the opportunity to reclaim the workspace in favor of allocating an
* appropriately sized one in the future.
*/
-static struct list_head *zstd_find_workspace(unsigned int level)
+static struct list_head *zstd_find_workspace(int level)
{
struct list_head *ws;
struct workspace *workspace;
- int i = level - 1;
+ int i = clip_level(level);
spin_lock_bh(&wsm.lock);
for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
@@ -247,7 +253,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
list_del_init(ws);
/* keep its place if it's a lower level using this */
workspace->req_level = level;
- if (level == workspace->level)
+ if (clip_level(level) == workspace->level)
list_del(&workspace->lru_list);
if (list_empty(&wsm.idle_ws[i]))
clear_bit(i, &wsm.active_map);
@@ -270,7 +276,7 @@ static struct list_head *zstd_find_workspace(unsigned int level)
* attempt to allocate a new workspace. If we fail to allocate one due to
* memory pressure, go to sleep waiting for the max level workspace to free up.
*/
-struct list_head *zstd_get_workspace(unsigned int level)
+struct list_head *zstd_get_workspace(int level)
{
struct list_head *ws;
unsigned int nofs_flag;
@@ -319,7 +325,7 @@ void zstd_put_workspace(struct list_head *ws)
spin_lock_bh(&wsm.lock);
/* A node is only taken off the lru if we are the corresponding level */
- if (workspace->req_level == workspace->level) {
+ if (clip_level(workspace->req_level) == workspace->level) {
/* Hide a max level workspace from reclaim */
if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) {
INIT_LIST_HEAD(&workspace->lru_list);
@@ -332,13 +338,13 @@ void zstd_put_workspace(struct list_head *ws)
}
}
- set_bit(workspace->level - 1, &wsm.active_map);
- list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
+ set_bit(workspace->level, &wsm.active_map);
+ list_add(&workspace->list, &wsm.idle_ws[workspace->level]);
workspace->req_level = 0;
spin_unlock_bh(&wsm.lock);
- if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
+ if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL))
cond_wake_up(&wsm.wait);
}
@@ -351,7 +357,7 @@ void zstd_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *zstd_alloc_workspace(unsigned int level)
+struct list_head *zstd_alloc_workspace(int level)
{
struct workspace *workspace;
@@ -359,8 +365,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level)
if (!workspace)
return ERR_PTR(-ENOMEM);
- workspace->size = zstd_ws_mem_sizes[level - 1];
- workspace->level = level;
+ /* Use level 1 workspace size for all the fast mode negative levels. */
+ workspace->size = zstd_ws_mem_sizes[clip_level(level)];
+ workspace->level = clip_level(level);
workspace->req_level = level;
workspace->last_used = jiffies;
workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN);
@@ -393,17 +400,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
const unsigned long nr_dest_folios = *out_folios;
const u64 orig_end = start + len;
unsigned long max_out = nr_dest_folios * PAGE_SIZE;
- unsigned int pg_off;
unsigned int cur_len;
- zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level,
- len);
+ workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len);
*out_folios = 0;
*total_out = 0;
*total_in = 0;
/* Initialize the stream */
- stream = zstd_init_cstream(&params, len, workspace->mem,
+ stream = zstd_init_cstream(&workspace->params, len, workspace->mem,
workspace->size);
if (unlikely(!stream)) {
struct btrfs_inode *inode = BTRFS_I(mapping->host);
@@ -420,9 +425,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
@@ -506,9 +510,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio);
if (ret < 0)
goto out;
- pg_off = offset_in_page(start);
- cur_len = btrfs_calc_input_length(orig_end, start);
- workspace->in_buf.src = kmap_local_folio(in_folio, pg_off);
+ cur_len = btrfs_calc_input_length(in_folio, orig_end, start);
+ workspace->in_buf.src = kmap_local_folio(in_folio,
+ offset_in_folio(in_folio, start));
workspace->in_buf.pos = 0;
workspace->in_buf.size = cur_len;
}
@@ -717,6 +721,7 @@ finish:
const struct btrfs_compress_op btrfs_zstd_compress = {
/* ZSTD uses own workspace manager */
.workspace_manager = NULL,
+ .min_level = ZSTD_BTRFS_MIN_LEVEL,
.max_level = ZSTD_BTRFS_MAX_LEVEL,
.default_level = ZSTD_BTRFS_DEFAULT_LEVEL,
};
diff --git a/fs/buffer.c b/fs/buffer.c
index cc8452f60251..6a8752f7bbed 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -157,8 +157,8 @@ static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
*/
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
- __end_buffer_read_notouch(bh, uptodate);
put_bh(bh);
+ __end_buffer_read_notouch(bh, uptodate);
}
EXPORT_SYMBOL(end_buffer_read_sync);
@@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
}
EXPORT_SYMBOL(end_buffer_write_sync);
-/*
- * Various filesystems appear to want __find_get_block to be non-blocking.
- * But it's the page lock which protects the buffers. To get around this,
- * we get exclusion from try_to_free_buffers with the blockdev mapping's
- * i_private_lock.
- *
- * Hack idea: for the blockdev mapping, i_private_lock contention
- * may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take i_private_lock.
- */
static struct buffer_head *
-__find_get_block_slow(struct block_device *bdev, sector_t block)
+__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
struct address_space *bd_mapping = bdev->bd_mapping;
const int blkbits = bd_mapping->host->i_blkbits;
@@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->i_private_lock);
+ /*
+ * Folio lock protects the buffers. Callers that cannot block
+ * will fallback to serializing vs try_to_free_buffers() via
+ * the i_private_lock.
+ */
+ if (atomic)
+ spin_lock(&bd_mapping->i_private_lock);
+ else
+ folio_lock(folio);
+
head = folio_buffers(folio);
if (!head)
goto out_unlock;
+ /*
+ * Upon a noref migration, the folio lock serializes here;
+ * otherwise bail.
+ */
+ if (test_bit_acquire(BH_Migrate, &head->b_state)) {
+ WARN_ON(!atomic);
+ goto out_unlock;
+ }
+
bh = head;
do {
if (!buffer_mapped(bh))
@@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->i_private_lock);
+ if (atomic)
+ spin_unlock(&bd_mapping->i_private_lock);
+ else
+ folio_unlock(folio);
folio_put(folio);
out:
return ret;
@@ -286,7 +297,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
struct postprocess_bh_ctx {
@@ -411,7 +421,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
- return;
}
/*
@@ -656,7 +665,9 @@ EXPORT_SYMBOL(generic_buffers_fsync);
void write_boundary_block(struct block_device *bdev,
sector_t bblock, unsigned blocksize)
{
- struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
+ struct buffer_head *bh;
+
+ bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
if (bh) {
if (buffer_dirty(bh))
write_dirty_buffer(bh, 0);
@@ -1109,27 +1120,26 @@ static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- /* Size must be multiple of hard sectorsize */
- if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
- (size < 512 || size > PAGE_SIZE))) {
- printk(KERN_ERR "getblk(): invalid block size %d requested\n",
- size);
- printk(KERN_ERR "logical block size: %d\n",
- bdev_logical_block_size(bdev));
+ bool blocking = gfpflags_allow_blocking(gfp);
- dump_stack();
+ if (WARN_ON_ONCE(!IS_ALIGNED(size, bdev_logical_block_size(bdev)))) {
+ printk(KERN_ERR "getblk(): block size %d not aligned to logical block size %d\n",
+ size, bdev_logical_block_size(bdev));
return NULL;
}
for (;;) {
struct buffer_head *bh;
- bh = __find_get_block(bdev, block, size);
- if (bh)
- return bh;
-
if (!grow_buffers(bdev, block, size, gfp))
return NULL;
+
+ if (blocking)
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
+ if (bh)
+ return bh;
}
}
@@ -1207,10 +1217,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh)
/* FIXME: do we need to set this in both places? */
if (bh->b_folio && bh->b_folio->mapping)
mapping_set_error(bh->b_folio->mapping, -EIO);
- if (bh->b_assoc_map) {
+ if (bh->b_assoc_map)
mapping_set_error(bh->b_assoc_map, -EIO);
- errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
- }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);
@@ -1386,16 +1394,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
/*
* Perform a pagecache lookup for the matching buffer. If it's there, refresh
* it in the LRU and mark it as accessed. If it is not present then return
- * NULL
+ * NULL. Atomic context callers may also return NULL if the buffer is being
+ * migrated; similarly the page is not marked accessed either.
*/
-struct buffer_head *
-__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+static struct buffer_head *
+find_get_block_common(struct block_device *bdev, sector_t block,
+ unsigned size, bool atomic)
{
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
/* __find_get_block_slow will mark the page accessed */
- bh = __find_get_block_slow(bdev, block);
+ bh = __find_get_block_slow(bdev, block, atomic);
if (bh)
bh_lru_install(bh);
} else
@@ -1403,8 +1413,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
return bh;
}
+
+struct buffer_head *
+__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
+{
+ return find_get_block_common(bdev, block, size, true);
+}
EXPORT_SYMBOL(__find_get_block);
+/* same as __find_get_block() but allows sleeping contexts */
+struct buffer_head *
+__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
+ unsigned size)
+{
+ return find_get_block_common(bdev, block, size, false);
+}
+EXPORT_SYMBOL(__find_get_block_nonatomic);
+
/**
* bdev_getblk - Get a buffer_head in a block device's buffer cache.
* @bdev: The block device.
@@ -1422,7 +1447,12 @@ EXPORT_SYMBOL(__find_get_block);
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
- struct buffer_head *bh = __find_get_block(bdev, block, size);
+ struct buffer_head *bh;
+
+ if (gfpflags_allow_blocking(gfp))
+ bh = __find_get_block_nonatomic(bdev, block, size);
+ else
+ bh = __find_get_block(bdev, block, size);
might_alloc(gfp);
if (bh)
@@ -1578,8 +1608,8 @@ static void discard_buffer(struct buffer_head * bh)
bh->b_bdev = NULL;
b_state = READ_ONCE(bh->b_state);
do {
- } while (!try_cmpxchg(&bh->b_state, &b_state,
- b_state & ~BUFFER_FLAGS_DISCARD));
+ } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state,
+ b_state & ~BUFFER_FLAGS_DISCARD));
unlock_buffer(bh);
}
@@ -1644,7 +1674,6 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
filemap_release_folio(folio, 0);
out:
folio_clear_mappedtodisk(folio);
- return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2166,7 +2195,7 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(__block_write_begin);
-static void __block_commit_write(struct folio *folio, size_t from, size_t to)
+void block_commit_write(struct folio *folio, size_t from, size_t to)
{
size_t block_start, block_end;
bool partial = false;
@@ -2204,6 +2233,7 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
if (!partial)
folio_mark_uptodate(folio);
}
+EXPORT_SYMBOL(block_commit_write);
/*
* block_write_begin takes care of the basic task of block allocation and
@@ -2235,9 +2265,8 @@ int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
}
EXPORT_SYMBOL(block_write_begin);
-int block_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int block_write_end(loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio)
{
size_t start = pos - folio_pos(folio);
@@ -2262,21 +2291,21 @@ int block_write_end(struct file *file, struct address_space *mapping,
flush_dcache_folio(folio);
/* This could be a short (even 0-length) commit */
- __block_commit_write(folio, start, start + copied);
+ block_commit_write(folio, start, start + copied);
return copied;
}
EXPORT_SYMBOL(block_write_end);
-int generic_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+int generic_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool i_size_changed = false;
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* No need to use i_size_read() here, the i_size cannot change under us
@@ -2361,9 +2390,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
- struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ struct buffer_head *bh, *head, *prev = NULL;
size_t blocksize;
- int nr, i;
int fully_mapped = 1;
bool page_error = false;
loff_t limit = i_size_read(inode);
@@ -2372,16 +2400,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
limit = inode->i_sb->s_maxbytes;
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
iblock = div_u64(folio_pos(folio), blocksize);
lblock = div_u64(limit + blocksize - 1, blocksize);
bh = head;
- nr = 0;
- i = 0;
do {
if (buffer_uptodate(bh))
@@ -2398,7 +2422,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
page_error = true;
}
if (!buffer_mapped(bh)) {
- folio_zero_range(folio, i * blocksize,
+ folio_zero_range(folio, bh_offset(bh),
blocksize);
if (!err)
set_buffer_uptodate(bh);
@@ -2411,40 +2435,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
if (buffer_uptodate(bh))
continue;
}
- arr[nr++] = bh;
- } while (i++, iblock++, (bh = bh->b_this_page) != head);
- if (fully_mapped)
- folio_set_mappedtodisk(folio);
-
- if (!nr) {
- /*
- * All buffers are uptodate or get_block() returned an
- * error when trying to map them - we can finish the read.
- */
- folio_end_read(folio, !page_error);
- return 0;
- }
-
- /* Stage two: lock the buffers */
- for (i = 0; i < nr; i++) {
- bh = arr[i];
lock_buffer(bh);
+ if (buffer_uptodate(bh)) {
+ unlock_buffer(bh);
+ continue;
+ }
+
mark_buffer_async_read(bh);
- }
+ if (prev)
+ submit_bh(REQ_OP_READ, prev);
+ prev = bh;
+ } while (iblock++, (bh = bh->b_this_page) != head);
+
+ if (fully_mapped)
+ folio_set_mappedtodisk(folio);
/*
- * Stage 3: start the IO. Check for uptodateness
- * inside the buffer lock in case another process reading
- * the underlying blockdev brought it uptodate (the sct fix).
+ * All buffers are uptodate or get_block() returned an error
+ * when trying to map them - we must finish the read because
+ * end_buffer_async_read() will never be called on any buffer
+ * in this folio.
*/
- for (i = 0; i < nr; i++) {
- bh = arr[i];
- if (buffer_uptodate(bh))
- end_buffer_async_read(bh, 1);
- else
- submit_bh(REQ_OP_READ, bh);
- }
+ if (prev)
+ submit_bh(REQ_OP_READ, prev);
+ else
+ folio_end_read(folio, !page_error);
+
return 0;
}
EXPORT_SYMBOL(block_read_full_folio);
@@ -2477,7 +2494,8 @@ out:
}
EXPORT_SYMBOL(generic_cont_expand_simple);
-static int cont_expand_zero(struct file *file, struct address_space *mapping,
+static int cont_expand_zero(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, loff_t *bytes)
{
struct inode *inode = mapping->host;
@@ -2501,12 +2519,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = PAGE_SIZE - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2534,12 +2552,12 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
}
len = offset - zerofrom;
- err = aops->write_begin(file, mapping, curpos, len,
+ err = aops->write_begin(iocb, mapping, curpos, len,
&folio, &fsdata);
if (err)
goto out;
folio_zero_range(folio, offset_in_folio(folio, curpos), len);
- err = aops->write_end(file, mapping, curpos, len, len,
+ err = aops->write_end(iocb, mapping, curpos, len, len,
folio, fsdata);
if (err < 0)
goto out;
@@ -2554,17 +2572,16 @@ out:
* For moronic filesystems that do not allow holes in file.
* We may have to extend the file.
*/
-int cont_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata,
- get_block_t *get_block, loff_t *bytes)
+int cont_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata, get_block_t *get_block, loff_t *bytes)
{
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
unsigned int zerofrom;
int err;
- err = cont_expand_zero(file, mapping, pos, bytes);
+ err = cont_expand_zero(iocb, mapping, pos, bytes);
if (err)
return err;
@@ -2578,13 +2595,6 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
}
EXPORT_SYMBOL(cont_write_begin);
-void block_commit_write(struct page *page, unsigned from, unsigned to)
-{
- struct folio *folio = page_folio(page);
- __block_commit_write(folio, from, to);
-}
-EXPORT_SYMBOL(block_commit_write);
-
/*
* block_page_mkwrite() is not allowed to change the file size as it gets
* called from a page fault handler when a page is first dirtied. Hence we must
@@ -2593,7 +2603,7 @@ EXPORT_SYMBOL(block_commit_write);
* holes and correct delalloc and unwritten extent mapping on filesystems that
* support these features.
*
- * We are not allowed to take the i_mutex here so we have to play games to
+ * We are not allowed to take the i_rwsem here so we have to play games to
* protect against truncate races as the page could now be beyond EOF. Because
* truncate writes the inode size before removing pages, once we have the
* page lock we can determine safely if the page is beyond EOF. If it is not
@@ -2630,7 +2640,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
if (unlikely(ret))
goto out_unlock;
- __block_commit_write(folio, 0, end);
+ block_commit_write(folio, 0, end);
folio_mark_dirty(folio);
folio_wait_stable(folio);
@@ -2713,7 +2723,7 @@ unlock:
EXPORT_SYMBOL(block_truncate_page);
/*
- * The generic ->writepage function for buffer-backed address_spaces
+ * The generic write folio function for buffer-backed address_spaces
*/
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
void *get_block)
@@ -2733,7 +2743,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
/*
* The folio straddles i_size. It must be zeroed out on each and every
- * writepage invocation because it may be mmapped. "A file is mapped
+ * writeback invocation because it may be mmapped. "A file is mapped
* in multiples of the page size. For a file that is not a multiple of
* the page size, the remaining memory is zeroed when mapped, and
* writes to that region are not written out to the file."
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 38c236e38cef..b62cd3e9a18e 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -71,7 +71,6 @@ struct cachefiles_object {
int debug_id;
spinlock_t lock;
refcount_t ref;
- u8 d_name_len; /* Length of filename */
enum cachefiles_content content_info:8; /* Info about content presence */
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 92058ae43488..3e0576d9db1d 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -63,7 +63,7 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret)
ret = -ESTALE;
}
- ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ ki->term_func(ki->term_func_priv, ret);
}
cachefiles_put_kiocb(ki);
@@ -188,7 +188,7 @@ in_progress:
presubmission_error:
if (term_func)
- term_func(term_func_priv, ret < 0 ? ret : skipped, false);
+ term_func(term_func_priv, ret < 0 ? ret : skipped);
return ret;
}
@@ -271,7 +271,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
if (ki->term_func)
- ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ ki->term_func(ki->term_func_priv, ret);
cachefiles_put_kiocb(ki);
}
@@ -301,7 +301,7 @@ int __cachefiles_write(struct cachefiles_object *object,
ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
if (!ki) {
if (term_func)
- term_func(term_func_priv, -ENOMEM, false);
+ term_func(term_func_priv, -ENOMEM);
return -ENOMEM;
}
@@ -347,8 +347,6 @@ int __cachefiles_write(struct cachefiles_object *object,
default:
ki->was_async = false;
cachefiles_write_complete(&ki->iocb, ret);
- if (ret > 0)
- ret = 0;
break;
}
@@ -366,7 +364,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
{
if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
if (term_func)
- term_func(term_func_priv, -ENOBUFS, false);
+ term_func(term_func_priv, -ENOBUFS);
trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite);
return -ENOBUFS;
}
@@ -665,7 +663,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
pre = CACHEFILES_DIO_BLOCK_SIZE - off;
if (pre >= len) {
fscache_count_dio_misfit();
- netfs_write_subrequest_terminated(subreq, len, false);
+ netfs_write_subrequest_terminated(subreq, len);
return;
}
subreq->transferred += pre;
@@ -691,7 +689,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
len -= post;
if (len == 0) {
fscache_count_dio_misfit();
- netfs_write_subrequest_terminated(subreq, post, false);
+ netfs_write_subrequest_terminated(subreq, post);
return;
}
iov_iter_truncate(&subreq->io_iter, len);
@@ -703,7 +701,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq)
&start, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0) {
- netfs_write_subrequest_terminated(subreq, ret, false);
+ netfs_write_subrequest_terminated(subreq, ret);
return;
}
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index bf935e25bdbe..aae86af48ed5 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -8,7 +8,7 @@
#include <linux/slab.h>
#include "internal.h"
-static const char cachefiles_charmap[64] =
+static const char cachefiles_charmap[64] __nonstring =
"0123456789" /* 0 - 9 */
"abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */
"ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */
@@ -132,7 +132,6 @@ bool cachefiles_cook_key(struct cachefiles_object *object)
success:
name[len] = 0;
object->d_name = name;
- object->d_name_len = len;
_leave(" = %s", object->d_name);
return true;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7cf59713f0f7..91dfd0231877 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -98,7 +98,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
retry:
ret = cachefiles_inject_read_error();
if (ret == 0)
- subdir = lookup_one_len(dirname, dir, strlen(dirname));
+ subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir);
else
subdir = ERR_PTR(ret);
trace_cachefiles_lookup(NULL, dir, subdir);
@@ -130,16 +130,18 @@ retry:
goto mkdir_error;
ret = cachefiles_inject_write_error();
if (ret == 0)
- ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
- if (ret < 0) {
+ subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700);
+ else
+ subdir = ERR_PTR(ret);
+ if (IS_ERR(subdir)) {
trace_cachefiles_vfs_error(NULL, d_inode(dir), ret,
cachefiles_trace_mkdir_error);
goto mkdir_error;
}
trace_cachefiles_mkdir(dir, subdir);
- if (unlikely(d_unhashed(subdir))) {
- cachefiles_put_directory(subdir);
+ if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) {
+ dput(subdir);
goto retry;
}
ASSERT(d_backing_inode(subdir));
@@ -195,7 +197,8 @@ mark_error:
mkdir_error:
inode_unlock(d_inode(dir));
- dput(subdir);
+ if (!IS_ERR(subdir))
+ dput(subdir);
pr_err("mkdir %s failed with error %d\n", dirname, ret);
return ERR_PTR(ret);
@@ -335,7 +338,7 @@ try_again:
return -EIO;
}
- grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer));
+ grave = lookup_one(&nop_mnt_idmap, &QSTR(nbuffer), cache->graveyard);
if (IS_ERR(grave)) {
unlock_rename(cache->graveyard, dir);
trace_cachefiles_vfs_error(object, d_inode(cache->graveyard),
@@ -385,10 +388,10 @@ try_again:
} else {
struct renamedata rd = {
.old_mnt_idmap = &nop_mnt_idmap,
- .old_dir = d_inode(dir),
+ .old_parent = dir,
.old_dentry = rep,
.new_mnt_idmap = &nop_mnt_idmap,
- .new_dir = d_inode(cache->graveyard),
+ .new_parent = cache->graveyard,
.new_dentry = grave,
};
trace_cachefiles_rename(object, d_inode(rep)->i_ino, why);
@@ -627,8 +630,8 @@ bool cachefiles_look_up_object(struct cachefiles_object *object)
/* Look up path "cache/vol/fanout/file". */
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_positive_unlocked(object->d_name, fan,
- object->d_name_len);
+ dentry = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
trace_cachefiles_lookup(object, fan, dentry);
@@ -680,7 +683,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
inode_lock_nested(d_inode(fan), I_MUTEX_PARENT);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
@@ -699,7 +702,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
dput(dentry);
ret = cachefiles_inject_read_error();
if (ret == 0)
- dentry = lookup_one_len(object->d_name, fan, object->d_name_len);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan);
else
dentry = ERR_PTR(ret);
if (IS_ERR(dentry)) {
@@ -748,7 +751,7 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache,
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- victim = lookup_one_len(filename, dir, strlen(filename));
+ victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir);
if (IS_ERR(victim))
goto lookup_error;
if (d_is_negative(victim))
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index fe3de9ad57bf..a7ed86fa98bb 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -83,10 +83,8 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret) {
- ret = len;
+ if (ret > 0)
kiocb->ki_pos += ret;
- }
out:
fput(file);
@@ -317,8 +315,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_free_id;
}
- anon_file->file = anon_inode_getfile("[cachefiles]",
- &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ anon_file->file = anon_inode_getfile_fmode("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object,
+ O_WRONLY, FMODE_PWRITE | FMODE_LSEEK);
if (IS_ERR(anon_file->file)) {
ret = PTR_ERR(anon_file->file);
goto err_put_fd;
@@ -333,8 +332,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_put_file;
}
- anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
-
load = (void *)req->msg.data;
load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 7249d70e1a43..3e7def3d31c1 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -3,7 +3,7 @@ config CEPH_FS
tristate "Ceph distributed file system"
depends on INET
select CEPH_LIB
- select LIBCRC32C
+ select CRC32
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f5224a566b69..8b202d789e93 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -82,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
{
struct inode *inode = mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
struct ceph_inode_info *ci;
struct ceph_snap_context *snapc;
@@ -92,6 +93,8 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio)
return false;
}
+ atomic64_inc(&mdsc->dirty_folios);
+
ci = ceph_inode(inode);
/* dirty the head */
@@ -235,6 +238,7 @@ static void finish_netfs_read(struct ceph_osd_request *req)
if (sparse && err > 0)
err = ceph_sparse_ext_map_end(op);
if (err < subreq->len &&
+ subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
if (IS_ENCRYPTED(inode) && err > 0) {
@@ -278,7 +282,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
size_t len;
int mode;
- if (rreq->origin != NETFS_DIO_READ)
+ if (rreq->origin != NETFS_UNBUFFERED_READ &&
+ rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@@ -404,6 +409,15 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
struct page **pages;
size_t page_off;
+ /*
+ * FIXME: io_iter.count needs to be corrected to aligned
+ * length. Otherwise, iov_iter_get_pages_alloc2() operates
+ * with the initial unaligned length value. As a result,
+ * ceph_msg_data_cursor_init() triggers BUG_ON() in the case
+ * if msg->sparse_read_total > msg->data_length.
+ */
+ subreq->io_iter.count = len;
+
err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
if (err < 0) {
doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
@@ -536,7 +550,7 @@ static void ceph_set_page_fscache(struct page *page)
folio_start_private_2(page_folio(page)); /* [DEPRECATED] */
}
-static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async)
+static void ceph_fscache_write_terminated(void *priv, ssize_t error)
{
struct inode *inode = priv;
@@ -568,7 +582,36 @@ struct ceph_writeback_ctl
u64 truncate_size;
u32 truncate_seq;
bool size_stable;
+
bool head_snapc;
+ struct ceph_snap_context *snapc;
+ struct ceph_snap_context *last_snapc;
+
+ bool done;
+ bool should_loop;
+ bool range_whole;
+ pgoff_t start_index;
+ pgoff_t index;
+ pgoff_t end;
+ xa_mark_t tag;
+
+ pgoff_t strip_unit_end;
+ unsigned int wsize;
+ unsigned int nr_folios;
+ unsigned int max_pages;
+ unsigned int locked_pages;
+
+ int op_idx;
+ int num_ops;
+ u64 offset;
+ u64 len;
+
+ struct folio_batch fbatch;
+ unsigned int processed_in_fbatch;
+
+ bool from_pool;
+ struct page **pages;
+ struct page **data_pages;
};
/*
@@ -666,22 +709,23 @@ static u64 get_writepages_data_length(struct inode *inode,
}
/*
- * Write a single page, but leave the page locked.
+ * Write a folio, but leave it locked.
*
* If we get a write error, mark the mapping for error, but still adjust the
- * dirty page accounting (i.e., page is no longer dirty).
+ * dirty page accounting (i.e., folio is no longer dirty).
*/
-static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+static int write_folio_nounlock(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct folio *folio = page_folio(page);
- struct inode *inode = page->mapping->host;
+ struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
struct ceph_snap_context *snapc, *oldest;
- loff_t page_off = page_offset(page);
+ loff_t page_off = folio_pos(folio);
int err;
- loff_t len = thp_size(page);
+ loff_t len = folio_size(folio);
loff_t wlen;
struct ceph_writeback_ctl ceph_wbc;
struct ceph_osd_client *osdc = &fsc->client->osdc;
@@ -689,27 +733,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
bool caching = ceph_is_cache_enabled(inode);
struct page *bounce_page = NULL;
- doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page,
- page->index);
+ doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio,
+ folio->index);
if (ceph_inode_is_shutdown(inode))
return -EIO;
/* verify this is a writeable snap context */
- snapc = page_snap_context(page);
+ snapc = page_snap_context(&folio->page);
if (!snapc) {
- doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode),
- page);
+ doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode),
+ folio);
return 0;
}
oldest = get_oldest_context(inode, &ceph_wbc, snapc);
if (snapc->seq > oldest->seq) {
- doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n",
- ceph_vinop(inode), page, snapc);
+ doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n",
+ ceph_vinop(inode), folio, snapc);
/* we should only noop if called by kswapd */
WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest);
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return 0;
}
ceph_put_snap_context(oldest);
@@ -726,8 +770,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
len = ceph_wbc.i_size - page_off;
wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
- doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n",
- ceph_vinop(inode), page, page->index, page_off, wlen, snapc,
+ doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n",
+ ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc,
snapc->seq);
if (atomic_long_inc_return(&fsc->writeback_count) >
@@ -740,32 +784,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
ceph_wbc.truncate_seq,
ceph_wbc.truncate_size, true);
if (IS_ERR(req)) {
- redirty_page_for_writepage(wbc, page);
+ folio_redirty_for_writepage(wbc, folio);
return PTR_ERR(req);
}
if (wlen < len)
len = wlen;
- set_page_writeback(page);
+ folio_start_writeback(folio);
if (caching)
- ceph_set_page_fscache(page);
+ ceph_set_page_fscache(&folio->page);
ceph_fscache_write_to_cache(inode, page_off, len, caching);
if (IS_ENCRYPTED(inode)) {
- bounce_page = fscrypt_encrypt_pagecache_blocks(page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
CEPH_FSCRYPT_BLOCK_SIZE, 0,
GFP_NOFS);
if (IS_ERR(bounce_page)) {
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
ceph_osdc_put_request(req);
return PTR_ERR(bounce_page);
}
}
/* it may be a short write due to an object boundary */
- WARN_ON_ONCE(len > thp_size(page));
+ WARN_ON_ONCE(len > folio_size(folio));
osd_req_op_extent_osd_data_pages(req, 0,
bounce_page ? &bounce_page : &page, wlen, 0,
false, false);
@@ -791,25 +835,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
if (err == -ERESTARTSYS) {
/* killed by SIGKILL */
doutc(cl, "%llx.%llx interrupted page %p\n",
- ceph_vinop(inode), page);
- redirty_page_for_writepage(wbc, page);
- end_page_writeback(page);
+ ceph_vinop(inode), folio);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_end_writeback(folio);
return err;
}
if (err == -EBLOCKLISTED)
fsc->blocklisted = true;
- doutc(cl, "%llx.%llx setting page/mapping error %d %p\n",
- ceph_vinop(inode), err, page);
+ doutc(cl, "%llx.%llx setting mapping error %d %p\n",
+ ceph_vinop(inode), err, folio);
mapping_set_error(&inode->i_data, err);
wbc->pages_skipped++;
} else {
doutc(cl, "%llx.%llx cleaned page %p\n",
- ceph_vinop(inode), page);
+ ceph_vinop(inode), folio);
err = 0; /* vfs expects us to return 0 */
}
- oldest = detach_page_private(page);
+ oldest = folio_detach_private(folio);
WARN_ON_ONCE(oldest != snapc);
- end_page_writeback(page);
+ folio_end_writeback(folio);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */
@@ -820,32 +864,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
return err;
}
-static int ceph_writepage(struct page *page, struct writeback_control *wbc)
-{
- int err;
- struct inode *inode = page->mapping->host;
- BUG_ON(!inode);
- ihold(inode);
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- ceph_inode_to_fs_client(inode)->write_congested) {
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
-
- folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
-
- err = writepage_nounlock(page, wbc);
- if (err == -ERESTARTSYS) {
- /* direct memory reclaimer was killed by SIGKILL. return 0
- * to prevent caller from setting mapping/page error */
- err = 0;
- }
- unlock_page(page);
- iput(inode);
- return err;
-}
-
/*
* async writeback completion handler.
*
@@ -865,6 +883,7 @@ static void writepages_finish(struct ceph_osd_request *req)
struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping;
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
unsigned int len = 0;
bool remove_page;
@@ -920,6 +939,12 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_put_snap_context(detach_page_private(page));
end_page_writeback(page);
+
+ if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) {
+ wake_up_all(&mdsc->flush_end_wq);
+ WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0);
+ }
+
doutc(cl, "unlocking %p\n", page);
if (remove_page)
@@ -949,36 +974,13 @@ static void writepages_finish(struct ceph_osd_request *req)
ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
-/*
- * initiate async writeback
- */
-static int ceph_writepages_start(struct address_space *mapping,
- struct writeback_control *wbc)
+static inline
+bool is_forced_umount(struct address_space *mapping)
{
struct inode *inode = mapping->host;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
struct ceph_client *cl = fsc->client;
- struct ceph_vino vino = ceph_vino(inode);
- pgoff_t index, start_index, end = -1;
- struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
- struct folio_batch fbatch;
- int rc = 0;
- unsigned int wsize = i_blocksize(inode);
- struct ceph_osd_request *req = NULL;
- struct ceph_writeback_ctl ceph_wbc;
- bool should_loop, range_whole = false;
- bool done = false;
- bool caching = ceph_is_cache_enabled(inode);
- xa_mark_t tag;
-
- if (wbc->sync_mode == WB_SYNC_NONE &&
- fsc->write_congested)
- return 0;
-
- doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
- wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
- (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (ceph_inode_is_shutdown(inode)) {
if (ci->i_wrbuffer_ref > 0) {
@@ -987,388 +989,733 @@ static int ceph_writepages_start(struct address_space *mapping,
ceph_vinop(inode), ceph_ino(inode));
}
mapping_set_error(mapping, -EIO);
- return -EIO; /* we're in a forced umount, don't write! */
+ return true;
}
+
+ return false;
+}
+
+static inline
+unsigned int ceph_define_write_size(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ unsigned int wsize = i_blocksize(inode);
+
if (fsc->mount_options->wsize < wsize)
wsize = fsc->mount_options->wsize;
- folio_batch_init(&fbatch);
+ return wsize;
+}
- start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
- index = start_index;
+static inline
+void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_init(&ceph_wbc->fbatch);
+ ceph_wbc->processed_in_fbatch = 0;
+}
+
+static inline
+void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc)
+{
+ folio_batch_release(&ceph_wbc->fbatch);
+ ceph_folio_batch_init(ceph_wbc);
+}
+
+static inline
+void ceph_init_writeback_ctl(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ ceph_wbc->snapc = NULL;
+ ceph_wbc->last_snapc = NULL;
+
+ ceph_wbc->strip_unit_end = 0;
+ ceph_wbc->wsize = ceph_define_write_size(mapping);
+
+ ceph_wbc->nr_folios = 0;
+ ceph_wbc->max_pages = 0;
+ ceph_wbc->locked_pages = 0;
+
+ ceph_wbc->done = false;
+ ceph_wbc->should_loop = false;
+ ceph_wbc->range_whole = false;
+
+ ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) {
- tag = PAGECACHE_TAG_TOWRITE;
+ ceph_wbc->tag = PAGECACHE_TAG_TOWRITE;
} else {
- tag = PAGECACHE_TAG_DIRTY;
+ ceph_wbc->tag = PAGECACHE_TAG_DIRTY;
}
-retry:
+
+ ceph_wbc->op_idx = -1;
+ ceph_wbc->num_ops = 0;
+ ceph_wbc->offset = 0;
+ ceph_wbc->len = 0;
+ ceph_wbc->from_pool = false;
+
+ ceph_folio_batch_init(ceph_wbc);
+
+ ceph_wbc->pages = NULL;
+ ceph_wbc->data_pages = NULL;
+}
+
+static inline
+int ceph_define_writeback_range(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+
/* find oldest snap context with dirty data */
- snapc = get_oldest_context(inode, &ceph_wbc, NULL);
- if (!snapc) {
+ ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL);
+ if (!ceph_wbc->snapc) {
/* hmm, why does writepages get called when there
is no dirty data? */
doutc(cl, " no snap context with dirty data?\n");
- goto out;
+ return -ENODATA;
}
- doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc,
- snapc->seq, snapc->num_snaps);
- should_loop = false;
- if (ceph_wbc.head_snapc && snapc != last_snapc) {
+ doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n",
+ ceph_wbc->snapc, ceph_wbc->snapc->seq,
+ ceph_wbc->snapc->num_snaps);
+
+ ceph_wbc->should_loop = false;
+
+ if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) {
/* where to start/end? */
if (wbc->range_cyclic) {
- index = start_index;
- end = -1;
- if (index > 0)
- should_loop = true;
- doutc(cl, " cyclic, start at %lu\n", index);
+ ceph_wbc->index = ceph_wbc->start_index;
+ ceph_wbc->end = -1;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
+ doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index);
} else {
- index = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
+ ceph_wbc->index = wbc->range_start >> PAGE_SHIFT;
+ ceph_wbc->end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
- range_whole = true;
- doutc(cl, " not cyclic, %lu to %lu\n", index, end);
+ ceph_wbc->range_whole = true;
+ doutc(cl, " not cyclic, %lu to %lu\n",
+ ceph_wbc->index, ceph_wbc->end);
}
- } else if (!ceph_wbc.head_snapc) {
+ } else if (!ceph_wbc->head_snapc) {
/* Do not respect wbc->range_{start,end}. Dirty pages
* in that range can be associated with newer snapc.
* They are not writeable until we write all dirty pages
* associated with 'snapc' get written */
- if (index > 0)
- should_loop = true;
+ if (ceph_wbc->index > 0)
+ ceph_wbc->should_loop = true;
doutc(cl, " non-head snapc, range whole\n");
}
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, index, end);
+ ceph_put_snap_context(ceph_wbc->last_snapc);
+ ceph_wbc->last_snapc = ceph_wbc->snapc;
- ceph_put_snap_context(last_snapc);
- last_snapc = snapc;
+ return 0;
+}
- while (!done && index <= end) {
- int num_ops = 0, op_idx;
- unsigned i, nr_folios, max_pages, locked_pages = 0;
- struct page **pages = NULL, **data_pages;
- struct page *page;
- pgoff_t strip_unit_end = 0;
- u64 offset = 0, len = 0;
- bool from_pool = false;
+static inline
+bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end;
+}
- max_pages = wsize >> PAGE_SHIFT;
+static inline
+bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned index)
+{
+ return index < ceph_wbc->nr_folios &&
+ ceph_wbc->locked_pages < ceph_wbc->max_pages;
+}
-get_more_pages:
- nr_folios = filemap_get_folios_tag(mapping, &index,
- end, tag, &fbatch);
- doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios);
- if (!nr_folios && !locked_pages)
- break;
- for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
- struct folio *folio = fbatch.folios[i];
+static
+int ceph_check_page_before_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_snap_context *pgsnapc;
- page = &folio->page;
- doutc(cl, "? %p idx %lu\n", page, page->index);
- if (locked_pages == 0)
- lock_page(page); /* first page */
- else if (!trylock_page(page))
- break;
+ /* only dirty folios, or our accounting breaks */
+ if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) {
+ doutc(cl, "!dirty or !mapping %p\n", folio);
+ return -ENODATA;
+ }
- /* only dirty pages, or our accounting breaks */
- if (unlikely(!PageDirty(page)) ||
- unlikely(page->mapping != mapping)) {
- doutc(cl, "!dirty or !mapping %p\n", page);
- unlock_page(page);
- continue;
- }
- /* only if matching snap context */
- pgsnapc = page_snap_context(page);
- if (pgsnapc != snapc) {
- doutc(cl, "page snapc %p %lld != oldest %p %lld\n",
- pgsnapc, pgsnapc->seq, snapc, snapc->seq);
- if (!should_loop &&
- !ceph_wbc.head_snapc &&
- wbc->sync_mode != WB_SYNC_NONE)
- should_loop = true;
- unlock_page(page);
- continue;
+ /* only if matching snap context */
+ pgsnapc = page_snap_context(&folio->page);
+ if (pgsnapc != ceph_wbc->snapc) {
+ doutc(cl, "folio snapc %p %lld != oldest %p %lld\n",
+ pgsnapc, pgsnapc->seq,
+ ceph_wbc->snapc, ceph_wbc->snapc->seq);
+
+ if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc &&
+ wbc->sync_mode != WB_SYNC_NONE)
+ ceph_wbc->should_loop = true;
+
+ return -ENODATA;
+ }
+
+ if (folio_pos(folio) >= ceph_wbc->i_size) {
+ doutc(cl, "folio at %lu beyond eof %llu\n",
+ folio->index, ceph_wbc->i_size);
+
+ if ((ceph_wbc->size_stable ||
+ folio_pos(folio) >= i_size_read(inode)) &&
+ folio_clear_dirty_for_io(folio))
+ folio_invalidate(folio, 0, folio_size(folio));
+
+ return -ENODATA;
+ }
+
+ if (ceph_wbc->strip_unit_end &&
+ (folio->index > ceph_wbc->strip_unit_end)) {
+ doutc(cl, "end of strip unit %p\n", folio);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static inline
+void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc,
+ unsigned int max_pages)
+{
+ ceph_wbc->pages = kmalloc_array(max_pages,
+ sizeof(*ceph_wbc->pages),
+ GFP_NOFS);
+ if (!ceph_wbc->pages) {
+ ceph_wbc->from_pool = true;
+ ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
+ BUG_ON(!ceph_wbc->pages);
+ }
+}
+
+static inline
+void ceph_allocate_page_array(struct address_space *mapping,
+ struct ceph_writeback_ctl *ceph_wbc,
+ struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ u64 objnum;
+ u64 objoff;
+ u32 xlen;
+
+ /* prepare async write request */
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_calc_file_object_mapping(&ci->i_layout,
+ ceph_wbc->offset, ceph_wbc->wsize,
+ &objnum, &objoff, &xlen);
+
+ ceph_wbc->num_ops = 1;
+ ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT);
+
+ BUG_ON(ceph_wbc->pages);
+ ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen);
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages);
+
+ ceph_wbc->len = 0;
+}
+
+static inline
+bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc,
+ const struct folio *folio)
+{
+ return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT;
+}
+
+static inline
+bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc)
+{
+ return ceph_wbc->num_ops >=
+ (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS);
+}
+
+static inline
+bool is_write_congestion_happened(struct ceph_fs_client *fsc)
+{
+ return atomic_long_inc_return(&fsc->writeback_count) >
+ CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb);
+}
+
+static inline int move_dirty_folio_in_page_array(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct page **pages = ceph_wbc->pages;
+ unsigned int index = ceph_wbc->locked_pages;
+ gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS;
+
+ if (IS_ENCRYPTED(inode)) {
+ pages[index] = fscrypt_encrypt_pagecache_blocks(folio,
+ PAGE_SIZE,
+ 0,
+ gfp_flags);
+ if (IS_ERR(pages[index])) {
+ if (PTR_ERR(pages[index]) == -EINVAL) {
+ pr_err_client(cl, "inode->i_blkbits=%hhu\n",
+ inode->i_blkbits);
}
- if (page_offset(page) >= ceph_wbc.i_size) {
- doutc(cl, "folio at %lu beyond eof %llu\n",
- folio->index, ceph_wbc.i_size);
- if ((ceph_wbc.size_stable ||
- folio_pos(folio) >= i_size_read(inode)) &&
- folio_clear_dirty_for_io(folio))
- folio_invalidate(folio, 0,
- folio_size(folio));
+
+ /* better not fail on first page! */
+ BUG_ON(ceph_wbc->locked_pages == 0);
+
+ pages[index] = NULL;
+ return PTR_ERR(pages[index]);
+ }
+ } else {
+ pages[index] = &folio->page;
+ }
+
+ ceph_wbc->locked_pages++;
+
+ return 0;
+}
+
+static
+int ceph_process_folio_batch(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct folio *folio = NULL;
+ unsigned i;
+ int rc = 0;
+
+ for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) {
+ folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ doutc(cl, "? %p idx %lu, folio_test_writeback %#x, "
+ "folio_test_dirty %#x, folio_test_locked %#x\n",
+ folio, folio->index, folio_test_writeback(folio),
+ folio_test_dirty(folio),
+ folio_test_locked(folio));
+
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
+ continue;
+ }
+
+ if (ceph_wbc->locked_pages == 0)
+ folio_lock(folio);
+ else if (!folio_trylock(folio))
+ break;
+
+ rc = ceph_check_page_before_write(mapping, wbc,
+ ceph_wbc, folio);
+ if (rc == -ENODATA) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ } else if (rc == -E2BIG) {
+ rc = 0;
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ break;
+ }
+
+ if (!folio_clear_dirty_for_io(folio)) {
+ doutc(cl, "%p !folio_clear_dirty_for_io\n", folio);
+ folio_unlock(folio);
+ ceph_wbc->fbatch.folios[i] = NULL;
+ continue;
+ }
+
+ /*
+ * We have something to write. If this is
+ * the first locked page this time through,
+ * calculate max possible write size and
+ * allocate a page array
+ */
+ if (ceph_wbc->locked_pages == 0) {
+ ceph_allocate_page_array(mapping, ceph_wbc, folio);
+ } else if (!is_folio_index_contiguous(ceph_wbc, folio)) {
+ if (is_num_ops_too_big(ceph_wbc)) {
+ folio_redirty_for_writepage(wbc, folio);
folio_unlock(folio);
- continue;
- }
- if (strip_unit_end && (page->index > strip_unit_end)) {
- doutc(cl, "end of strip unit %p\n", page);
- unlock_page(page);
break;
}
- if (folio_test_writeback(folio) ||
- folio_test_private_2(folio) /* [DEPRECATED] */) {
- if (wbc->sync_mode == WB_SYNC_NONE) {
- doutc(cl, "%p under writeback\n", folio);
- folio_unlock(folio);
- continue;
- }
- doutc(cl, "waiting on writeback %p\n", folio);
- folio_wait_writeback(folio);
- folio_wait_private_2(folio); /* [DEPRECATED] */
- }
- if (!clear_page_dirty_for_io(page)) {
- doutc(cl, "%p !clear_page_dirty_for_io\n", page);
- unlock_page(page);
- continue;
- }
+ ceph_wbc->num_ops++;
+ ceph_wbc->offset = (u64)folio_pos(folio);
+ ceph_wbc->len = 0;
+ }
- /*
- * We have something to write. If this is
- * the first locked page this time through,
- * calculate max possinle write size and
- * allocate a page array
- */
- if (locked_pages == 0) {
- u64 objnum;
- u64 objoff;
- u32 xlen;
-
- /* prepare async write request */
- offset = (u64)page_offset(page);
- ceph_calc_file_object_mapping(&ci->i_layout,
- offset, wsize,
- &objnum, &objoff,
- &xlen);
- len = xlen;
-
- num_ops = 1;
- strip_unit_end = page->index +
- ((len - 1) >> PAGE_SHIFT);
-
- BUG_ON(pages);
- max_pages = calc_pages_for(0, (u64)len);
- pages = kmalloc_array(max_pages,
- sizeof(*pages),
- GFP_NOFS);
- if (!pages) {
- from_pool = true;
- pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
- BUG_ON(!pages);
- }
-
- len = 0;
- } else if (page->index !=
- (offset + len) >> PAGE_SHIFT) {
- if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
- CEPH_OSD_MAX_OPS)) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- break;
- }
-
- num_ops++;
- offset = (u64)page_offset(page);
- len = 0;
- }
+ /* note position of first page in fbatch */
+ doutc(cl, "%llx.%llx will write folio %p idx %lu\n",
+ ceph_vinop(inode), folio, folio->index);
- /* note position of first page in fbatch */
- doutc(cl, "%llx.%llx will write page %p idx %lu\n",
- ceph_vinop(inode), page, page->index);
-
- if (atomic_long_inc_return(&fsc->writeback_count) >
- CONGESTION_ON_THRESH(
- fsc->mount_options->congestion_kb))
- fsc->write_congested = true;
-
- if (IS_ENCRYPTED(inode)) {
- pages[locked_pages] =
- fscrypt_encrypt_pagecache_blocks(page,
- PAGE_SIZE, 0,
- locked_pages ? GFP_NOWAIT : GFP_NOFS);
- if (IS_ERR(pages[locked_pages])) {
- if (PTR_ERR(pages[locked_pages]) == -EINVAL)
- pr_err_client(cl,
- "inode->i_blkbits=%hhu\n",
- inode->i_blkbits);
- /* better not fail on first page! */
- BUG_ON(locked_pages == 0);
- pages[locked_pages] = NULL;
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- break;
- }
- ++locked_pages;
- } else {
- pages[locked_pages++] = page;
- }
+ fsc->write_congested = is_write_congestion_happened(fsc);
- fbatch.folios[i] = NULL;
- len += thp_size(page);
+ rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc,
+ folio);
+ if (rc) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ break;
}
- /* did we get anything? */
- if (!locked_pages)
- goto release_folios;
- if (i) {
- unsigned j, n = 0;
- /* shift unused page to beginning of fbatch */
- for (j = 0; j < nr_folios; j++) {
- if (!fbatch.folios[j])
- continue;
- if (n < j)
- fbatch.folios[n] = fbatch.folios[j];
- n++;
- }
- fbatch.nr = n;
+ ceph_wbc->fbatch.folios[i] = NULL;
+ ceph_wbc->len += folio_size(folio);
+ }
- if (nr_folios && i == nr_folios &&
- locked_pages < max_pages) {
- doutc(cl, "reached end fbatch, trying for more\n");
- folio_batch_release(&fbatch);
- goto get_more_pages;
- }
+ ceph_wbc->processed_in_fbatch = i;
+
+ return rc;
+}
+
+static inline
+void ceph_shift_unused_folios_left(struct folio_batch *fbatch)
+{
+ unsigned j, n = 0;
+
+ /* shift unused page to beginning of fbatch */
+ for (j = 0; j < folio_batch_count(fbatch); j++) {
+ if (!fbatch->folios[j])
+ continue;
+
+ if (n < j) {
+ fbatch->folios[n] = fbatch->folios[j];
}
+ n++;
+ }
+
+ fbatch->nr = n;
+}
+
+static
+int ceph_submit_write(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_vino vino = ceph_vino(inode);
+ struct ceph_osd_request *req = NULL;
+ struct page *page = NULL;
+ bool caching = ceph_is_cache_enabled(inode);
+ u64 offset;
+ u64 len;
+ unsigned i;
+
new_request:
- offset = ceph_fscrypt_page_offset(pages[0]);
- len = wsize;
+ offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]);
+ len = ceph_wbc->wsize;
+ req = ceph_osdc_new_request(&fsc->client->osdc,
+ &ci->i_layout, vino,
+ offset, &len, 0, ceph_wbc->num_ops,
+ CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc, ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size, false);
+ if (IS_ERR(req)) {
req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0, num_ops,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size, false);
- if (IS_ERR(req)) {
- req = ceph_osdc_new_request(&fsc->client->osdc,
- &ci->i_layout, vino,
- offset, &len, 0,
- min(num_ops,
- CEPH_OSD_SLAB_OPS),
- CEPH_OSD_OP_WRITE,
- CEPH_OSD_FLAG_WRITE,
- snapc, ceph_wbc.truncate_seq,
- ceph_wbc.truncate_size, true);
- BUG_ON(IS_ERR(req));
+ &ci->i_layout, vino,
+ offset, &len, 0,
+ min(ceph_wbc->num_ops,
+ CEPH_OSD_SLAB_OPS),
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE,
+ ceph_wbc->snapc,
+ ceph_wbc->truncate_seq,
+ ceph_wbc->truncate_size,
+ true);
+ BUG_ON(IS_ERR(req));
+ }
+
+ page = ceph_wbc->pages[ceph_wbc->locked_pages - 1];
+ BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) {
+ struct folio *folio = ceph_wbc->fbatch.folios[i];
+
+ if (!folio)
+ continue;
+
+ page = &folio->page;
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
}
- BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
- thp_size(pages[locked_pages - 1]) - offset);
- if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
- rc = -EIO;
- goto release_folios;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+
+ if (!page)
+ continue;
+
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
}
- req->r_callback = writepages_finish;
- req->r_inode = inode;
-
- /* Format the osd request message and submit the write */
- len = 0;
- data_pages = pages;
- op_idx = 0;
- for (i = 0; i < locked_pages; i++) {
- struct page *page = ceph_fscrypt_pagecache_page(pages[i]);
-
- u64 cur_offset = page_offset(page);
- /*
- * Discontinuity in page range? Ceph can handle that by just passing
- * multiple extents in the write op.
- */
- if (offset + len != cur_offset) {
- /* If it's full, stop here */
- if (op_idx + 1 == req->r_num_ops)
- break;
-
- /* Kick off an fscache write with what we have so far. */
- ceph_fscache_write_to_cache(inode, offset, len, caching);
-
- /* Start a new extent */
- osd_req_op_extent_dup_last(req, op_idx,
- cur_offset - offset);
- doutc(cl, "got pages at %llu~%llu\n", offset,
- len);
- osd_req_op_extent_osd_data_pages(req, op_idx,
- data_pages, len, 0,
- from_pool, false);
- osd_req_op_extent_update(req, op_idx, len);
-
- len = 0;
- offset = cur_offset;
- data_pages = pages + i;
- op_idx++;
- }
- set_page_writeback(page);
- if (caching)
- ceph_set_page_fscache(page);
- len += thp_size(page);
+ ceph_osdc_put_request(req);
+ return -EIO;
+ }
+
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+
+ /* Format the osd request message and submit the write */
+ len = 0;
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ ceph_wbc->op_idx = 0;
+ for (i = 0; i < ceph_wbc->locked_pages; i++) {
+ u64 cur_offset;
+
+ page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]);
+ cur_offset = page_offset(page);
+
+ /*
+ * Discontinuity in page range? Ceph can handle that by just passing
+ * multiple extents in the write op.
+ */
+ if (offset + len != cur_offset) {
+ /* If it's full, stop here */
+ if (ceph_wbc->op_idx + 1 == req->r_num_ops)
+ break;
+
+ /* Kick off an fscache write with what we have so far. */
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ /* Start a new extent */
+ osd_req_op_extent_dup_last(req, ceph_wbc->op_idx,
+ cur_offset - offset);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages,
+ len, 0,
+ ceph_wbc->from_pool,
+ false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ len = 0;
+ offset = cur_offset;
+ ceph_wbc->data_pages = ceph_wbc->pages + i;
+ ceph_wbc->op_idx++;
}
- ceph_fscache_write_to_cache(inode, offset, len, caching);
-
- if (ceph_wbc.size_stable) {
- len = min(len, ceph_wbc.i_size - offset);
- } else if (i == locked_pages) {
- /* writepages_finish() clears writeback pages
- * according to the data length, so make sure
- * data length covers all locked pages */
- u64 min_len = len + 1 - thp_size(page);
- len = get_writepages_data_length(inode, pages[i - 1],
- offset);
- len = max(len, min_len);
+
+ set_page_writeback(page);
+
+ if (caching)
+ ceph_set_page_fscache(page);
+
+ len += thp_size(page);
+ }
+
+ ceph_fscache_write_to_cache(inode, offset, len, caching);
+
+ if (ceph_wbc->size_stable) {
+ len = min(len, ceph_wbc->i_size - offset);
+ } else if (i == ceph_wbc->locked_pages) {
+ /* writepages_finish() clears writeback pages
+ * according to the data length, so make sure
+ * data length covers all locked pages */
+ u64 min_len = len + 1 - thp_size(page);
+ len = get_writepages_data_length(inode,
+ ceph_wbc->pages[i - 1],
+ offset);
+ len = max(len, min_len);
+ }
+
+ if (IS_ENCRYPTED(inode))
+ len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+
+ doutc(cl, "got pages at %llu~%llu\n", offset, len);
+
+ if (IS_ENCRYPTED(inode) &&
+ ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) {
+ pr_warn_client(cl,
+ "bad encrypted write offset=%lld len=%llu\n",
+ offset, len);
+ }
+
+ osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx,
+ ceph_wbc->data_pages, len,
+ 0, ceph_wbc->from_pool, false);
+ osd_req_op_extent_update(req, ceph_wbc->op_idx, len);
+
+ BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops);
+
+ ceph_wbc->from_pool = false;
+ if (i < ceph_wbc->locked_pages) {
+ BUG_ON(ceph_wbc->num_ops <= req->r_num_ops);
+ ceph_wbc->num_ops -= req->r_num_ops;
+ ceph_wbc->locked_pages -= i;
+
+ /* allocate new pages array for next request */
+ ceph_wbc->data_pages = ceph_wbc->pages;
+ __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages);
+ memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ memset(ceph_wbc->data_pages + i, 0,
+ ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages));
+ } else {
+ BUG_ON(ceph_wbc->num_ops != req->r_num_ops);
+ /* request message now owns the pages array */
+ ceph_wbc->pages = NULL;
+ }
+
+ req->r_mtime = inode_get_mtime(inode);
+ ceph_osdc_start_request(&fsc->client->osdc, req);
+ req = NULL;
+
+ wbc->nr_to_write -= i;
+ if (ceph_wbc->pages)
+ goto new_request;
+
+ return 0;
+}
+
+static
+void ceph_wait_until_current_writes_complete(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct ceph_writeback_ctl *ceph_wbc)
+{
+ struct page *page;
+ unsigned i, nr;
+
+ if (wbc->sync_mode != WB_SYNC_NONE &&
+ ceph_wbc->start_index == 0 && /* all dirty pages were checked */
+ !ceph_wbc->head_snapc) {
+ ceph_wbc->index = 0;
+
+ while ((ceph_wbc->index <= ceph_wbc->end) &&
+ (nr = filemap_get_folios_tag(mapping,
+ &ceph_wbc->index,
+ (pgoff_t)-1,
+ PAGECACHE_TAG_WRITEBACK,
+ &ceph_wbc->fbatch))) {
+ for (i = 0; i < nr; i++) {
+ page = &ceph_wbc->fbatch.folios[i]->page;
+ if (page_snap_context(page) != ceph_wbc->snapc)
+ continue;
+ wait_on_page_writeback(page);
+ }
+
+ folio_batch_release(&ceph_wbc->fbatch);
+ cond_resched();
}
- if (IS_ENCRYPTED(inode))
- len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
+ struct ceph_client *cl = fsc->client;
+ struct ceph_writeback_ctl ceph_wbc;
+ int rc = 0;
- doutc(cl, "got pages at %llu~%llu\n", offset, len);
+ if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested)
+ return 0;
- if (IS_ENCRYPTED(inode) &&
- ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
- pr_warn_client(cl,
- "bad encrypted write offset=%lld len=%llu\n",
- offset, len);
-
- osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
- 0, from_pool, false);
- osd_req_op_extent_update(req, op_idx, len);
-
- BUG_ON(op_idx + 1 != req->r_num_ops);
-
- from_pool = false;
- if (i < locked_pages) {
- BUG_ON(num_ops <= req->r_num_ops);
- num_ops -= req->r_num_ops;
- locked_pages -= i;
-
- /* allocate new pages array for next request */
- data_pages = pages;
- pages = kmalloc_array(locked_pages, sizeof(*pages),
- GFP_NOFS);
- if (!pages) {
- from_pool = true;
- pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
- BUG_ON(!pages);
+ doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode),
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ if (is_forced_umount(mapping)) {
+ /* we're in a forced umount, don't write! */
+ return -EIO;
+ }
+
+ ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc);
+
+ if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
+ rc = -EIO;
+ goto out;
+ }
+
+retry:
+ rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc);
+ if (rc == -ENODATA) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ rc = 0;
+ goto dec_osd_stopping_blocker;
+ }
+
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end);
+
+ while (!has_writeback_done(&ceph_wbc)) {
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT;
+
+get_more_pages:
+ ceph_folio_batch_reinit(&ceph_wbc);
+
+ ceph_wbc.nr_folios = filemap_get_folios_tag(mapping,
+ &ceph_wbc.index,
+ ceph_wbc.end,
+ ceph_wbc.tag,
+ &ceph_wbc.fbatch);
+ doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n",
+ ceph_wbc.tag, ceph_wbc.nr_folios);
+
+ if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages)
+ break;
+
+process_folio_batch:
+ rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
+
+ /* did we get anything? */
+ if (!ceph_wbc.locked_pages)
+ goto release_folios;
+
+ if (ceph_wbc.processed_in_fbatch) {
+ ceph_shift_unused_folios_left(&ceph_wbc.fbatch);
+
+ if (folio_batch_count(&ceph_wbc.fbatch) == 0 &&
+ ceph_wbc.locked_pages < ceph_wbc.max_pages) {
+ doutc(cl, "reached end fbatch, trying for more\n");
+ goto get_more_pages;
}
- memcpy(pages, data_pages + i,
- locked_pages * sizeof(*pages));
- memset(data_pages + i, 0,
- locked_pages * sizeof(*pages));
- } else {
- BUG_ON(num_ops != req->r_num_ops);
- index = pages[i - 1]->index + 1;
- /* request message now owns the pages array */
- pages = NULL;
}
- req->r_mtime = inode_get_mtime(inode);
- ceph_osdc_start_request(&fsc->client->osdc, req);
- req = NULL;
+ rc = ceph_submit_write(mapping, wbc, &ceph_wbc);
+ if (rc)
+ goto release_folios;
+
+ ceph_wbc.locked_pages = 0;
+ ceph_wbc.strip_unit_end = 0;
- wbc->nr_to_write -= i;
- if (pages)
- goto new_request;
+ if (folio_batch_count(&ceph_wbc.fbatch) > 0) {
+ ceph_wbc.nr_folios =
+ folio_batch_count(&ceph_wbc.fbatch);
+ goto process_folio_batch;
+ }
/*
* We stop writing back only if we are not doing
@@ -1377,61 +1724,44 @@ new_request:
* we tagged for writeback prior to entering this loop.
*/
if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
- done = true;
+ ceph_wbc.done = true;
release_folios:
doutc(cl, "folio_batch release on %d folios (%p)\n",
- (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL);
- folio_batch_release(&fbatch);
+ (int)ceph_wbc.fbatch.nr,
+ ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL);
+ folio_batch_release(&ceph_wbc.fbatch);
}
- if (should_loop && !done) {
+ if (ceph_wbc.should_loop && !ceph_wbc.done) {
/* more to do; loop back to beginning of file */
doutc(cl, "looping back to beginning of file\n");
- end = start_index - 1; /* OK even when start_index == 0 */
+ /* OK even when start_index == 0 */
+ ceph_wbc.end = ceph_wbc.start_index - 1;
/* to write dirty pages associated with next snapc,
* we need to wait until current writes complete */
- if (wbc->sync_mode != WB_SYNC_NONE &&
- start_index == 0 && /* all dirty pages were checked */
- !ceph_wbc.head_snapc) {
- struct page *page;
- unsigned i, nr;
- index = 0;
- while ((index <= end) &&
- (nr = filemap_get_folios_tag(mapping, &index,
- (pgoff_t)-1,
- PAGECACHE_TAG_WRITEBACK,
- &fbatch))) {
- for (i = 0; i < nr; i++) {
- page = &fbatch.folios[i]->page;
- if (page_snap_context(page) != snapc)
- continue;
- wait_on_page_writeback(page);
- }
- folio_batch_release(&fbatch);
- cond_resched();
- }
- }
+ ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc);
- start_index = 0;
- index = 0;
+ ceph_wbc.start_index = 0;
+ ceph_wbc.index = 0;
goto retry;
}
- if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = ceph_wbc.index;
+
+dec_osd_stopping_blocker:
+ ceph_dec_osd_stopping_blocker(fsc->mdsc);
out:
- ceph_osdc_put_request(req);
- ceph_put_snap_context(last_snapc);
+ ceph_put_snap_context(ceph_wbc.last_snapc);
doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode),
rc);
+
return rc;
}
-
-
/*
* See if a given @snapc is either writeable, or already written.
*/
@@ -1447,56 +1777,56 @@ static int context_is_writeable_or_written(struct inode *inode,
/**
* ceph_find_incompatible - find an incompatible context and return it
- * @page: page being dirtied
+ * @folio: folio being dirtied
*
- * We are only allowed to write into/dirty a page if the page is
+ * We are only allowed to write into/dirty a folio if the folio is
* clean, or already dirty within the same snap context. Returns a
* conflicting context if there is one, NULL if there isn't, or a
* negative error code on other errors.
*
- * Must be called with page lock held.
+ * Must be called with folio lock held.
*/
static struct ceph_snap_context *
-ceph_find_incompatible(struct page *page)
+ceph_find_incompatible(struct folio *folio)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct ceph_client *cl = ceph_inode_to_client(inode);
struct ceph_inode_info *ci = ceph_inode(inode);
if (ceph_inode_is_shutdown(inode)) {
- doutc(cl, " %llx.%llx page %p is shutdown\n",
- ceph_vinop(inode), page);
+ doutc(cl, " %llx.%llx folio %p is shutdown\n",
+ ceph_vinop(inode), folio);
return ERR_PTR(-ESTALE);
}
for (;;) {
struct ceph_snap_context *snapc, *oldest;
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- snapc = page_snap_context(page);
+ snapc = page_snap_context(&folio->page);
if (!snapc || snapc == ci->i_head_snapc)
break;
/*
- * this page is already dirty in another (older) snap
+ * this folio is already dirty in another (older) snap
* context! is it writeable now?
*/
oldest = get_oldest_context(inode, NULL, NULL);
if (snapc->seq > oldest->seq) {
/* not writeable -- return it for the caller to deal with */
ceph_put_snap_context(oldest);
- doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n",
- ceph_vinop(inode), page, snapc);
+ doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n",
+ ceph_vinop(inode), folio, snapc);
return ceph_get_snap_context(snapc);
}
ceph_put_snap_context(oldest);
- /* yay, writeable, do it now (without dropping page lock) */
- doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n",
- ceph_vinop(inode), page, snapc);
- if (clear_page_dirty_for_io(page)) {
- int r = writepage_nounlock(page, NULL);
+ /* yay, writeable, do it now (without dropping folio lock) */
+ doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n",
+ ceph_vinop(inode), folio, snapc);
+ if (folio_clear_dirty_for_io(folio)) {
+ int r = write_folio_nounlock(folio, NULL);
if (r < 0)
return ERR_PTR(r);
}
@@ -1511,7 +1841,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_snap_context *snapc;
- snapc = ceph_find_incompatible(folio_page(*foliop, 0));
+ snapc = ceph_find_incompatible(*foliop);
if (snapc) {
int r;
@@ -1534,10 +1864,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned
* We are only allowed to write into/dirty the page if the page is
* clean, or already dirty within the same snap context.
*/
-static int ceph_write_begin(struct file *file, struct address_space *mapping,
+static int ceph_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
int r;
@@ -1555,10 +1887,12 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
* we don't do anything in here that simple_write_end doesn't do
* except adjust dirty page accounting
*/
-static int ceph_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
+static int ceph_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_client *cl = ceph_inode_to_client(inode);
bool check_cap = false;
@@ -1594,7 +1928,6 @@ out:
const struct address_space_operations ceph_aops = {
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .writepage = ceph_writepage,
.writepages = ceph_writepages_start,
.write_begin = ceph_write_begin,
.write_end = ceph_write_end,
@@ -1602,6 +1935,7 @@ const struct address_space_operations ceph_aops = {
.invalidate_folio = ceph_invalidate_folio,
.release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
+ .migrate_folio = filemap_migrate_folio,
};
static void ceph_block_sigs(sigset_t *oldset)
@@ -1718,8 +2052,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_cap_flush *prealloc_cf;
- struct page *page = vmf->page;
- loff_t off = page_offset(page);
+ struct folio *folio = page_folio(vmf->page);
+ loff_t off = folio_pos(folio);
loff_t size = i_size_read(inode);
size_t len;
int want, got, err;
@@ -1736,10 +2070,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
ceph_block_sigs(&oldset);
- if (off + thp_size(page) <= size)
- len = thp_size(page);
+ if (off + folio_size(folio) <= size)
+ len = folio_size(folio);
else
- len = offset_in_thp(page, size);
+ len = offset_in_folio(folio, size);
doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n",
ceph_vinop(inode), off, len, size);
@@ -1756,30 +2090,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode),
off, len, ceph_cap_string(got));
- /* Update time before taking page lock */
+ /* Update time before taking folio lock */
file_update_time(vma->vm_file);
inode_inc_iversion_raw(inode);
do {
struct ceph_snap_context *snapc;
- lock_page(page);
+ folio_lock(folio);
- if (page_mkwrite_check_truncate(page, inode) < 0) {
- unlock_page(page);
+ if (folio_mkwrite_check_truncate(folio, inode) < 0) {
+ folio_unlock(folio);
ret = VM_FAULT_NOPAGE;
break;
}
- snapc = ceph_find_incompatible(page);
+ snapc = ceph_find_incompatible(folio);
if (!snapc) {
- /* success. we'll keep the page locked. */
- set_page_dirty(page);
+ /* success. we'll keep the folio locked. */
+ folio_mark_dirty(folio);
ret = VM_FAULT_LOCKED;
break;
}
- unlock_page(page);
+ folio_unlock(folio);
if (IS_ERR(snapc)) {
ret = VM_FAULT_SIGBUS;
@@ -2000,13 +2334,13 @@ static const struct vm_operations_struct ceph_vmops = {
.page_mkwrite = ceph_page_mkwrite,
};
-int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+int ceph_mmap_prepare(struct vm_area_desc *desc)
{
- struct address_space *mapping = file->f_mapping;
+ struct address_space *mapping = desc->file->f_mapping;
if (!mapping->a_ops->read_folio)
return -ENOEXEC;
- vma->vm_ops = &ceph_vmops;
+ desc->vm_ops = &ceph_vmops;
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a8d8b56cf9d2..b1a8ff612c41 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4957,24 +4957,20 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ int len = dentry->d_name.len;
doutc(cl, "%p mds%d seq %d\n", dentry, mds,
(int)di->lease_seq);
rel->dname_seq = cpu_to_le32(di->lease_seq);
__ceph_mdsc_drop_dentry_lease(dentry);
+ memcpy(*p, dentry->d_name.name, len);
spin_unlock(&dentry->d_lock);
if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) {
- int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p);
-
- if (ret2 < 0)
- return ret2;
-
- rel->dname_len = cpu_to_le32(ret2);
- *p += ret2;
- } else {
- rel->dname_len = cpu_to_le32(dentry->d_name.len);
- memcpy(*p, dentry->d_name.name, dentry->d_name.len);
- *p += dentry->d_name.len;
+ len = ceph_encode_encrypted_dname(dir, *p, len);
+ if (len < 0)
+ return len;
}
+ rel->dname_len = cpu_to_le32(len);
+ *p += len;
} else {
spin_unlock(&dentry->d_lock);
}
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index 3b3c4d8d401e..cab722619207 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -215,35 +215,31 @@ static struct inode *parse_longname(const struct inode *parent,
struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = NULL;
struct ceph_vino vino = { .snap = CEPH_NOSNAP };
- char *inode_number;
- char *name_end;
- int orig_len = *name_len;
+ char *name_end, *inode_number;
int ret = -EIO;
-
+ /* NUL-terminate */
+ char *str __free(kfree) = kmemdup_nul(name, *name_len, GFP_KERNEL);
+ if (!str)
+ return ERR_PTR(-ENOMEM);
/* Skip initial '_' */
- name++;
- name_end = strrchr(name, '_');
+ str++;
+ name_end = strrchr(str, '_');
if (!name_end) {
- doutc(cl, "failed to parse long snapshot name: %s\n", name);
+ doutc(cl, "failed to parse long snapshot name: %s\n", str);
return ERR_PTR(-EIO);
}
- *name_len = (name_end - name);
+ *name_len = (name_end - str);
if (*name_len <= 0) {
pr_err_client(cl, "failed to parse long snapshot name\n");
return ERR_PTR(-EIO);
}
/* Get the inode number */
- inode_number = kmemdup_nul(name_end + 1,
- orig_len - *name_len - 2,
- GFP_KERNEL);
- if (!inode_number)
- return ERR_PTR(-ENOMEM);
+ inode_number = name_end + 1;
ret = kstrtou64(inode_number, 10, &vino.ino);
if (ret) {
- doutc(cl, "failed to parse inode number: %s\n", name);
- dir = ERR_PTR(ret);
- goto out;
+ doutc(cl, "failed to parse inode number: %s\n", str);
+ return ERR_PTR(ret);
}
/* And finally the inode */
@@ -254,42 +250,29 @@ static struct inode *parse_longname(const struct inode *parent,
if (IS_ERR(dir))
doutc(cl, "can't find inode %s (%s)\n", inode_number, name);
}
-
-out:
- kfree(inode_number);
return dir;
}
-int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
- char *buf)
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int elen)
{
struct ceph_client *cl = ceph_inode_to_client(parent);
struct inode *dir = parent;
- struct qstr iname;
+ char *p = buf;
u32 len;
- int name_len;
- int elen;
+ int name_len = elen;
int ret;
u8 *cryptbuf = NULL;
- iname.name = d_name->name;
- name_len = d_name->len;
-
/* Handle the special case of snapshot names that start with '_' */
- if ((ceph_snap(dir) == CEPH_SNAPDIR) && (name_len > 0) &&
- (iname.name[0] == '_')) {
- dir = parse_longname(parent, iname.name, &name_len);
+ if (ceph_snap(dir) == CEPH_SNAPDIR && *p == '_') {
+ dir = parse_longname(parent, p, &name_len);
if (IS_ERR(dir))
return PTR_ERR(dir);
- iname.name++; /* skip initial '_' */
+ p++; /* skip initial '_' */
}
- iname.len = name_len;
- if (!fscrypt_has_encryption_key(dir)) {
- memcpy(buf, d_name->name, d_name->len);
- elen = d_name->len;
+ if (!fscrypt_has_encryption_key(dir))
goto out;
- }
/*
* Convert cleartext d_name to ciphertext. If result is longer than
@@ -297,7 +280,7 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
*
* See: fscrypt_setup_filename
*/
- if (!fscrypt_fname_encrypted_size(dir, iname.len, NAME_MAX, &len)) {
+ if (!fscrypt_fname_encrypted_size(dir, name_len, NAME_MAX, &len)) {
elen = -ENAMETOOLONG;
goto out;
}
@@ -310,7 +293,9 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
goto out;
}
- ret = fscrypt_fname_encrypt(dir, &iname, cryptbuf, len);
+ ret = fscrypt_fname_encrypt(dir,
+ &(struct qstr)QSTR_INIT(p, name_len),
+ cryptbuf, len);
if (ret) {
elen = ret;
goto out;
@@ -331,18 +316,13 @@ int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
}
/* base64 encode the encrypted name */
- elen = ceph_base64_encode(cryptbuf, len, buf);
- doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, buf);
+ elen = ceph_base64_encode(cryptbuf, len, p);
+ doutc(cl, "base64-encoded ciphertext name = %.*s\n", elen, p);
/* To understand the 240 limit, see CEPH_NOHASH_NAME_MAX comments */
WARN_ON(elen > 240);
- if ((elen > 0) && (dir != parent)) {
- char tmp_buf[NAME_MAX];
-
- elen = snprintf(tmp_buf, sizeof(tmp_buf), "_%.*s_%ld",
- elen, buf, dir->i_ino);
- memcpy(buf, tmp_buf, elen);
- }
+ if (dir != parent) // leading _ is already there; append _<inum>
+ elen += 1 + sprintf(p + elen, "_%ld", dir->i_ino);
out:
kfree(cryptbuf);
@@ -355,14 +335,6 @@ out:
return elen;
}
-int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
- char *buf)
-{
- WARN_ON_ONCE(!fscrypt_has_encryption_key(parent));
-
- return ceph_encode_encrypted_dname(parent, &dentry->d_name, buf);
-}
-
/**
* ceph_fname_to_usr - convert a filename for userland presentation
* @fname: ceph_fname to be converted
@@ -516,15 +488,13 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags)
+ unsigned int offs, u64 lblk_num)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
doutc(cl, "%p %llx.%llx len %u offs %u blk %llu\n", inode,
ceph_vinop(inode), len, offs, lblk_num);
- return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num,
- gfp_flags);
+ return fscrypt_encrypt_block_inplace(inode, page, len, offs, lblk_num);
}
/**
@@ -642,9 +612,8 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
* @page: pointer to page array
* @off: offset into the file that the data starts
* @len: max length to encrypt
- * @gfp: gfp flags to use for allocation
*
- * Decrypt an array of cleartext pages and return the amount of
+ * Encrypt an array of cleartext pages and return the amount of
* data encrypted. Any data in the page prior to the start of the
* first complete block in the read is ignored. Any incomplete
* crypto blocks at the end of the array are ignored.
@@ -652,7 +621,7 @@ int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
* Returns the length of the encrypted data or a negative errno.
*/
int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
- int len, gfp_t gfp)
+ int len)
{
int i, num_blocks;
u64 baseblk = off >> CEPH_FSCRYPT_BLOCK_SHIFT;
@@ -673,7 +642,7 @@ int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
fret = ceph_fscrypt_encrypt_block_inplace(inode, page[pgidx],
CEPH_FSCRYPT_BLOCK_SIZE, pgoffs,
- baseblk + i, gfp);
+ baseblk + i);
if (fret < 0) {
if (ret == 0)
ret = fret;
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index d0768239a1c9..23612b2e9837 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -102,10 +102,7 @@ int ceph_fscrypt_prepare_context(struct inode *dir, struct inode *inode,
struct ceph_acl_sec_ctx *as);
void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
struct ceph_acl_sec_ctx *as);
-int ceph_encode_encrypted_dname(struct inode *parent, struct qstr *d_name,
- char *buf);
-int ceph_encode_encrypted_fname(struct inode *parent, struct dentry *dentry,
- char *buf);
+int ceph_encode_encrypted_dname(struct inode *parent, char *buf, int len);
static inline int ceph_fname_alloc_buffer(struct inode *parent,
struct fscrypt_str *fname)
@@ -155,15 +152,14 @@ int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
unsigned int offs, u64 lblk_num);
int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags);
+ unsigned int offs, u64 lblk_num);
int ceph_fscrypt_decrypt_pages(struct inode *inode, struct page **page,
u64 off, int len);
int ceph_fscrypt_decrypt_extents(struct inode *inode, struct page **page,
u64 off, struct ceph_sparse_extent *map,
u32 ext_cnt);
int ceph_fscrypt_encrypt_pages(struct inode *inode, struct page **page, u64 off,
- int len, gfp_t gfp);
+ int len);
static inline struct page *ceph_fscrypt_pagecache_page(struct page *page)
{
@@ -194,17 +190,10 @@ static inline void ceph_fscrypt_as_ctx_to_req(struct ceph_mds_request *req,
{
}
-static inline int ceph_encode_encrypted_dname(struct inode *parent,
- struct qstr *d_name, char *buf)
-{
- memcpy(buf, d_name->name, d_name->len);
- return d_name->len;
-}
-
-static inline int ceph_encode_encrypted_fname(struct inode *parent,
- struct dentry *dentry, char *buf)
+static inline int ceph_encode_encrypted_dname(struct inode *parent, char *buf,
+ int len)
{
- return -EOPNOTSUPP;
+ return len;
}
static inline int ceph_fname_alloc_buffer(struct inode *parent,
@@ -246,8 +235,7 @@ static inline int ceph_fscrypt_decrypt_block_inplace(const struct inode *inode,
static inline int ceph_fscrypt_encrypt_block_inplace(const struct inode *inode,
struct page *page, unsigned int len,
- unsigned int offs, u64 lblk_num,
- gfp_t gfp_flags)
+ unsigned int offs, u64 lblk_num)
{
return 0;
}
@@ -269,7 +257,7 @@ static inline int ceph_fscrypt_decrypt_extents(struct inode *inode,
static inline int ceph_fscrypt_encrypt_pages(struct inode *inode,
struct page **page, u64 off,
- int len, gfp_t gfp)
+ int len)
{
return 0;
}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index fdf9dc15eafa..fdd404fc8112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -412,7 +412,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
{
- char name[100];
+ char name[NAME_MAX];
doutc(fsc->client, "begin\n");
fsc->debugfs_congestion_kb =
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 62e99e65250d..8478e7e75df6 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
if (ptr_pos >= i_size_read(dir))
return NULL;
- if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) {
+ if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) {
ceph_readdir_cache_release(cache_ctl);
- cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
- if (!cache_ctl->page) {
- doutc(cl, " page %lu not found\n", ptr_pgoff);
+ cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff);
+ if (IS_ERR(cache_ctl->folio)) {
+ cache_ctl->folio = NULL;
+ doutc(cl, " folio %lu not found\n", ptr_pgoff);
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
- i_rwsem, no need to use page lock */
- unlock_page(cache_ctl->page);
- cache_ctl->dentries = kmap(cache_ctl->page);
+ i_rwsem, no need to use folio lock */
+ folio_unlock(cache_ctl->folio);
+ cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0);
}
cache_ctl->index = idx & idx_mask;
@@ -422,17 +423,16 @@ more:
req->r_inode_drop = CEPH_CAP_FILE_EXCL;
}
if (dfi->last_name) {
- struct qstr d_name = { .name = dfi->last_name,
- .len = strlen(dfi->last_name) };
+ int len = strlen(dfi->last_name);
req->r_path2 = kzalloc(NAME_MAX + 1, GFP_KERNEL);
if (!req->r_path2) {
ceph_mdsc_put_request(req);
return -ENOMEM;
}
+ memcpy(req->r_path2, dfi->last_name, len);
- err = ceph_encode_encrypted_dname(inode, &d_name,
- req->r_path2);
+ err = ceph_encode_encrypted_dname(inode, req->r_path2, len);
if (err < 0) {
ceph_mdsc_put_request(req);
return err;
@@ -1092,19 +1092,20 @@ out:
return err;
}
-static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb);
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_mds_request *req;
struct ceph_acl_sec_ctx as_ctx = {};
+ struct dentry *ret;
int err;
int op;
err = ceph_wait_on_conflict_unlink(dentry);
if (err)
- return err;
+ return ERR_PTR(err);
if (ceph_snap(dir) == CEPH_SNAPDIR) {
/* mkdir .snap/foo is a MKSNAP */
@@ -1116,32 +1117,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
ceph_vinop(dir), dentry, dentry, mode);
op = CEPH_MDS_OP_MKDIR;
} else {
- err = -EROFS;
+ ret = ERR_PTR(-EROFS);
goto out;
}
if (op == CEPH_MDS_OP_MKDIR &&
ceph_quota_is_max_files_exceeded(dir)) {
- err = -EDQUOT;
+ ret = ERR_PTR(-EDQUOT);
goto out;
}
if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) &&
!fscrypt_has_encryption_key(dir)) {
- err = -ENOKEY;
+ ret = ERR_PTR(-ENOKEY);
goto out;
}
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req)) {
- err = PTR_ERR(req);
+ ret = ERR_CAST(req);
goto out;
}
mode |= S_IFDIR;
req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx);
if (IS_ERR(req->r_new_inode)) {
- err = PTR_ERR(req->r_new_inode);
+ ret = ERR_CAST(req->r_new_inode);
req->r_new_inode = NULL;
goto out_req;
}
@@ -1165,15 +1166,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir,
!req->r_reply_info.head->is_target &&
!req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
+ ret = ERR_PTR(err);
out_req:
+ if (!IS_ERR(ret) && req->r_dentry != dentry)
+ /* Some other dentry was spliced in */
+ ret = dget(req->r_dentry);
ceph_mdsc_put_request(req);
out:
- if (!err)
+ if (!IS_ERR(ret)) {
+ if (ret)
+ dentry = ret;
ceph_init_inode_acls(d_inode(dentry), &as_ctx);
- else
+ } else {
d_drop(dentry);
+ }
ceph_release_acl_sec_ctx(&as_ctx);
- return err;
+ return ret;
}
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 150076ced937..b2f2af104679 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -33,12 +33,19 @@ struct ceph_nfs_snapfh {
u32 hash;
} __attribute__ ((packed));
+#define BYTES_PER_U32 (sizeof(u32))
+#define CEPH_FH_BASIC_SIZE \
+ (sizeof(struct ceph_nfs_fh) / BYTES_PER_U32)
+#define CEPH_FH_WITH_PARENT_SIZE \
+ (sizeof(struct ceph_nfs_confh) / BYTES_PER_U32)
+#define CEPH_FH_SNAPPED_INODE_SIZE \
+ (sizeof(struct ceph_nfs_snapfh) / BYTES_PER_U32)
+
static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
- static const int snap_handle_length =
- sizeof(struct ceph_nfs_snapfh) >> 2;
+ static const int snap_handle_length = CEPH_FH_SNAPPED_INODE_SIZE;
struct ceph_nfs_snapfh *sfh = (void *)rawfh;
u64 snapid = ceph_snap(inode);
int ret;
@@ -88,10 +95,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
struct ceph_client *cl = ceph_inode_to_client(inode);
- static const int handle_length =
- sizeof(struct ceph_nfs_fh) >> 2;
- static const int connected_handle_length =
- sizeof(struct ceph_nfs_confh) >> 2;
+ static const int handle_length = CEPH_FH_BASIC_SIZE;
+ static const int connected_handle_length = CEPH_FH_WITH_PARENT_SIZE;
int type;
if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -308,7 +313,7 @@ static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
if (fh_type != FILEID_INO32_GEN &&
fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
- if (fh_len < sizeof(*fh) / 4)
+ if (fh_len < sizeof(*fh) / BYTES_PER_U32)
return NULL;
doutc(fsc->client, "%llx\n", fh->ino);
@@ -427,7 +432,7 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
if (fh_type != FILEID_INO32_GEN_PARENT)
return NULL;
- if (fh_len < sizeof(*cfh) / 4)
+ if (fh_len < sizeof(*cfh) / BYTES_PER_U32)
return NULL;
doutc(fsc->client, "%llx\n", cfh->parent_ino);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 851d70200c6b..c02f100f8552 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1992,8 +1992,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
if (IS_ENCRYPTED(inode)) {
ret = ceph_fscrypt_encrypt_pages(inode, pages,
- write_pos, write_len,
- GFP_KERNEL);
+ write_pos, write_len);
if (ret < 0) {
doutc(cl, "encryption failed with %d\n", ret);
ceph_release_page_vector(pages, num_pages);
@@ -2530,19 +2529,19 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
}
-static inline void ceph_zero_partial_page(
- struct inode *inode, loff_t offset, unsigned size)
+static inline void ceph_zero_partial_page(struct inode *inode,
+ loff_t offset, size_t size)
{
- struct page *page;
- pgoff_t index = offset >> PAGE_SHIFT;
+ struct folio *folio;
- page = find_lock_page(inode->i_mapping, index);
- if (page) {
- wait_on_page_writeback(page);
- zero_user(page, offset & (PAGE_SIZE - 1), size);
- unlock_page(page);
- put_page(page);
- }
+ folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ folio_wait_writeback(folio);
+ folio_zero_range(folio, offset_in_folio(folio, offset), size);
+ folio_unlock(folio);
+ folio_put(folio);
}
static void ceph_zero_pagecache_range(struct inode *inode, loff_t offset,
@@ -2616,7 +2615,7 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
s32 stripe_unit = ci->i_layout.stripe_unit;
s32 stripe_count = ci->i_layout.stripe_count;
s32 object_size = ci->i_layout.object_size;
- u64 object_set_size = object_size * stripe_count;
+ u64 object_set_size = (u64) object_size * stripe_count;
u64 nearly, t;
/* round offset up to next period boundary */
@@ -3171,7 +3170,7 @@ const struct file_operations ceph_file_fops = {
.llseek = ceph_llseek,
.read_iter = ceph_read_iter,
.write_iter = ceph_write_iter,
- .mmap = ceph_mmap,
+ .mmap_prepare = ceph_mmap_prepare,
.fsync = ceph_fsync,
.lock = ceph_lock,
.setlease = simple_nosetlease,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7dd6c2275085..fc543075b827 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1845,10 +1845,9 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
- if (ctl->page) {
- kunmap(ctl->page);
- put_page(ctl->page);
- ctl->page = NULL;
+ if (ctl->folio) {
+ folio_release_kmap(ctl->folio, ctl->dentries);
+ ctl->folio = NULL;
}
}
@@ -1862,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
- if (!ctl->page || pgoff != ctl->page->index) {
+ if (!ctl->folio || pgoff != ctl->folio->index) {
ceph_readdir_cache_release(ctl);
+ fgf_t fgf = FGP_LOCK;
+
if (idx == 0)
- ctl->page = grab_cache_page(&dir->i_data, pgoff);
- else
- ctl->page = find_lock_page(&dir->i_data, pgoff);
- if (!ctl->page) {
+ fgf |= FGP_ACCESSED | FGP_CREAT;
+
+ ctl->folio = __filemap_get_folio(&dir->i_data, pgoff,
+ fgf, mapping_gfp_mask(&dir->i_data));
+ if (IS_ERR(ctl->folio)) {
+ int err = PTR_ERR(ctl->folio);
+
+ ctl->folio = NULL;
ctl->index = -1;
- return idx == 0 ? -ENOMEM : 0;
+ return idx == 0 ? err : 0;
}
/* reading/filling the cache are serialized by
- * i_rwsem, no need to use page lock */
- unlock_page(ctl->page);
- ctl->dentries = kmap(ctl->page);
+ * i_rwsem, no need to use folio lock */
+ folio_unlock(ctl->folio);
+ ctl->dentries = kmap_local_folio(ctl->folio, 0);
if (idx == 0)
memset(ctl->dentries, 0, PAGE_SIZE);
}
@@ -2362,7 +2367,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* Try to writeback the dirty pagecaches */
if (issued & (CEPH_CAP_FILE_BUFFER)) {
- loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1;
+ loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1;
ret = filemap_write_and_wait_range(inode->i_mapping,
orig_pos, lend);
@@ -2431,8 +2436,7 @@ static int fill_fscrypt_truncate(struct inode *inode,
/* encrypt the last block */
ret = ceph_fscrypt_encrypt_block_inplace(inode, page,
CEPH_FSCRYPT_BLOCK_SIZE,
- 0, block,
- GFP_KERNEL);
+ 0, block);
if (ret)
goto out;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d7a06b9aaef6..0f497c39ff82 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2766,8 +2766,8 @@ retry:
}
if (fscrypt_has_encryption_key(d_inode(parent))) {
- len = ceph_encode_encrypted_fname(d_inode(parent),
- cur, buf);
+ len = ceph_encode_encrypted_dname(d_inode(parent),
+ buf, len);
if (len < 0) {
dput(parent);
dput(cur);
@@ -2948,12 +2948,12 @@ static struct ceph_mds_request_head_legacy *
find_legacy_request_head(void *p, u64 features)
{
bool legacy = !(features & CEPH_FEATURE_FS_BTIME);
- struct ceph_mds_request_head_old *ohead;
+ struct ceph_mds_request_head *head;
if (legacy)
return (struct ceph_mds_request_head_legacy *)p;
- ohead = (struct ceph_mds_request_head_old *)p;
- return (struct ceph_mds_request_head_legacy *)&ohead->oldest_client_tid;
+ head = (struct ceph_mds_request_head *)p;
+ return (struct ceph_mds_request_head_legacy *)&head->oldest_client_tid;
}
/*
@@ -3023,7 +3023,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
if (legacy)
len = sizeof(struct ceph_mds_request_head_legacy);
else if (request_head_version == 1)
- len = sizeof(struct ceph_mds_request_head_old);
+ len = offsetofend(struct ceph_mds_request_head, args);
else if (request_head_version == 2)
len = offsetofend(struct ceph_mds_request_head, ext_num_fwd);
else
@@ -3107,11 +3107,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_session *session,
msg->hdr.version = cpu_to_le16(3);
p = msg->front.iov_base + sizeof(*lhead);
} else if (request_head_version == 1) {
- struct ceph_mds_request_head_old *ohead = msg->front.iov_base;
+ struct ceph_mds_request_head *nhead = msg->front.iov_base;
msg->hdr.version = cpu_to_le16(4);
- ohead->version = cpu_to_le16(1);
- p = msg->front.iov_base + sizeof(*ohead);
+ nhead->version = cpu_to_le16(1);
+ p = msg->front.iov_base + offsetofend(struct ceph_mds_request_head, args);
} else if (request_head_version == 2) {
struct ceph_mds_request_head *nhead = msg->front.iov_base;
@@ -3268,7 +3268,7 @@ static int __prepare_send_request(struct ceph_mds_session *session,
* so we limit to retry at most 256 times.
*/
if (req->r_attempts) {
- old_max_retry = sizeof_field(struct ceph_mds_request_head_old,
+ old_max_retry = sizeof_field(struct ceph_mds_request_head,
num_retry);
old_max_retry = 1 << (old_max_retry * BITS_PER_BYTE);
if ((old_version && req->r_attempts >= old_max_retry) ||
@@ -5489,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->stopping_lock);
atomic_set(&mdsc->stopping_blockers, 0);
init_completion(&mdsc->stopping_waiter);
+ atomic64_set(&mdsc->dirty_folios, 0);
+ init_waitqueue_head(&mdsc->flush_end_wq);
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->quotarealms_inodes = RB_ROOT;
@@ -5693,18 +5695,18 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc,
*
* All the other cases --> mismatch
*/
+ bool path_matched = true;
char *first = strstr(_tpath, auth->match.path);
- if (first != _tpath) {
- if (free_tpath)
- kfree(_tpath);
- return 0;
+ if (first != _tpath ||
+ (tlen > len && _tpath[len] != '/')) {
+ path_matched = false;
}
- if (tlen > len && _tpath[len] != '/') {
- if (free_tpath)
- kfree(_tpath);
+ if (free_tpath)
+ kfree(_tpath);
+
+ if (!path_matched)
return 0;
- }
}
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 7c9fee9e80d4..3e2a6fa7c19a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -458,6 +458,9 @@ struct ceph_mds_client {
atomic_t stopping_blockers;
struct completion stopping_waiter;
+ atomic64_t dirty_folios;
+ wait_queue_head_t flush_end_wq;
+
atomic64_t quotarealms_count; /* # realms with quota */
/*
* We keep a list of inodes we don't see in the mountpoint but that we
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 4344e1f11806..c3eb651862c5 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1033,8 +1033,7 @@ void ceph_umount_begin(struct super_block *sb)
struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
doutc(fsc->client, "starting forced umount\n");
- if (!fsc)
- return;
+
fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
__ceph_umount_begin(fsc);
}
@@ -1220,13 +1219,14 @@ static int ceph_set_super(struct super_block *s, struct fs_context *fc)
fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */
s->s_op = &ceph_super_ops;
- s->s_d_op = &ceph_dentry_ops;
+ set_default_d_op(s, &ceph_dentry_ops);
s->s_export_op = &ceph_export_ops;
s->s_time_gran = 1;
s->s_time_min = 0;
s->s_time_max = U32_MAX;
s->s_flags |= SB_NODIRATIME | SB_NOATIME;
+ s->s_magic = CEPH_SUPER_MAGIC;
ceph_fscrypt_set_ops(s);
@@ -1563,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s)
*/
sync_filesystem(s);
+ if (atomic64_read(&mdsc->dirty_folios) > 0) {
+ wait_queue_head_t *wq = &mdsc->flush_end_wq;
+ long timeleft = wait_event_killable_timeout(*wq,
+ atomic64_read(&mdsc->dirty_folios) <= 0,
+ fsc->client->options->mount_timeout);
+ if (!timeleft) /* timed out */
+ pr_warn_client(cl, "umount timed out, %ld\n", timeleft);
+ else if (timeleft < 0) /* killed */
+ pr_warn_client(cl, "umount was killed, %ld\n", timeleft);
+ }
+
spin_lock(&mdsc->stopping_lock);
mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING;
wait = !!atomic_read(&mdsc->stopping_blockers);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7fa1e7be50e4..cf176aab0f82 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf)
}
struct ceph_readdir_cache_control {
- struct page *page;
+ struct folio *folio;
struct dentry **dentries;
int index;
};
@@ -1286,7 +1286,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern const struct netfs_request_ops ceph_netfs_ops;
-extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+int ceph_mmap_prepare(struct vm_area_desc *desc);
extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index a3e2dfeedfbf..ca9990017265 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -166,8 +166,8 @@ err_out:
return error;
}
-static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *de, umode_t mode)
+static struct dentry *coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *de, umode_t mode)
{
struct inode *inode;
struct coda_vattr attrs;
@@ -177,14 +177,14 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct CodaFid newfid;
if (is_root_inode(dir) && coda_iscontrol(name, len))
- return -EPERM;
+ return ERR_PTR(-EPERM);
attrs.va_mode = mode;
- error = venus_mkdir(dir->i_sb, coda_i2f(dir),
+ error = venus_mkdir(dir->i_sb, coda_i2f(dir),
name, len, &newfid, &attrs);
if (error)
goto err_out;
-
+
inode = coda_iget(dir->i_sb, &newfid, &attrs);
if (IS_ERR(inode)) {
error = PTR_ERR(inode);
@@ -195,10 +195,10 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir,
coda_dir_inc_nlink(dir);
coda_dir_update_mtime(dir);
d_instantiate(de, inode);
- return 0;
+ return NULL;
err_out:
d_drop(de);
- return error;
+ return ERR_PTR(error);
}
/* try to make de an entry in dir_inodde linked to source_de */
@@ -429,17 +429,9 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
cfi = coda_ftoc(coda_file);
host_file = cfi->cfi_container;
- if (host_file->f_op->iterate_shared) {
- struct inode *host_inode = file_inode(host_file);
- ret = -ENOENT;
- if (!IS_DEADDIR(host_inode)) {
- inode_lock_shared(host_inode);
- ret = host_file->f_op->iterate_shared(host_file, ctx);
- file_accessed(host_file);
- inode_unlock_shared(host_inode);
- }
+ ret = iterate_dir(host_file, ctx);
+ if (ret != -ENOTDIR)
return ret;
- }
/* Venus: we must read Venus dirents from a file */
return coda_venus_readdir(coda_file, ctx);
}
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 148856a582a9..a390b5d21196 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -160,7 +160,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
size_t count;
int ret;
- if (!host_file->f_op->mmap)
+ if (!can_mmap_file(host_file))
return -ENODEV;
if (WARN_ON(coda_file != vma->vm_file))
@@ -199,10 +199,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
spin_unlock(&cii->c_lock);
vma->vm_file = get_file(host_file);
- ret = call_mmap(vma->vm_file, vma);
+ ret = vfs_mmap(vma->vm_file, vma);
if (ret) {
- /* if call_mmap fails, our caller will put host_file so we
+ /* if vfs_mmap fails, our caller will put host_file so we
* should drop the reference to the coda_file that we got.
*/
fput(coda_file);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 6896fce122e1..08450d006016 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -230,7 +230,7 @@ static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = 12;
sb->s_magic = CODA_SUPER_MAGIC;
sb->s_op = &coda_super_operations;
- sb->s_d_op = &coda_dentry_operations;
+ set_default_d_op(sb, &coda_dentry_operations);
sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig
index 272b64456999..1fcd761fe7be 100644
--- a/fs/configfs/Kconfig
+++ b/fs/configfs/Kconfig
@@ -1,7 +1,6 @@
# SPDX-License-Identifier: GPL-2.0-only
config CONFIGFS_FS
tristate "Userspace-driven configuration filesystem"
- select SYSFS
help
configfs is a RAM-based filesystem that provides the converse
of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 7d10278db30d..f327fbb9a0ca 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -67,7 +67,6 @@ static void configfs_d_iput(struct dentry * dentry,
const struct dentry_operations configfs_dentry_ops = {
.d_iput = configfs_d_iput,
- .d_delete = always_delete_dentry,
};
#ifdef CONFIG_LOCKDEP
@@ -619,7 +618,7 @@ static int populate_attrs(struct config_item *item)
break;
}
}
- if (t->ct_bin_attrs) {
+ if (!error && t->ct_bin_attrs) {
for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i))
continue;
@@ -970,7 +969,7 @@ static void configfs_dump_one(struct configfs_dirent *sd, int level)
{
pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd));
-#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type);
+#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type)
type_print(CONFIGFS_ROOT);
type_print(CONFIGFS_DIR);
type_print(CONFIGFS_ITEM_ATTR);
@@ -1280,8 +1279,8 @@ out_root_unlock:
}
EXPORT_SYMBOL(configfs_depend_item_unlocked);
-static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int ret = 0;
int module_got = 0;
@@ -1461,7 +1460,7 @@ out_put:
put_fragment(frag);
out:
- return ret;
+ return ERR_PTR(ret);
}
static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 254170a82aa3..c378b5cbf87d 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -66,7 +66,7 @@ int config_item_set_name(struct config_item *item, const char *fmt, ...)
name = kvasprintf(GFP_KERNEL, fmt, args);
va_end(args);
if (!name)
- return -EFAULT;
+ return -ENOMEM;
}
/* Free the old name, if necessary. */
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index c2d820063ec4..740f18b60c9d 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -92,7 +92,8 @@ static int configfs_fill_super(struct super_block *sb, struct fs_context *fc)
configfs_root_group.cg_item.ci_dentry = root;
root->d_fsdata = &configfs_root;
sb->s_root = root;
- sb->s_d_op = &configfs_dentry_ops; /* the rest get that */
+ set_default_d_op(sb, &configfs_dentry_ops); /* the rest get that */
+ sb->s_d_flags |= DCACHE_DONTCACHE;
return 0;
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 591700e1b2ce..5dce257c67fc 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -43,6 +43,15 @@
#include <linux/timekeeping.h>
#include <linux/sysctl.h>
#include <linux/elf.h>
+#include <linux/pidfs.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/af_unix.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <uapi/linux/pidfd.h>
+#include <uapi/linux/un.h>
+#include <uapi/linux/coredump.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -60,16 +69,35 @@ static void free_vma_snapshot(struct coredump_params *cprm);
#define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024)
/* Define a reasonable max cap */
#define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024)
+/*
+ * File descriptor number for the pidfd for the thread-group leader of
+ * the coredumping task installed into the usermode helper's file
+ * descriptor table.
+ */
+#define COREDUMP_PIDFD_NUMBER 3
static int core_uses_pid;
static unsigned int core_pipe_limit;
+static unsigned int core_sort_vma;
static char core_pattern[CORENAME_MAX_SIZE] = "core";
static int core_name_size = CORENAME_MAX_SIZE;
unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT;
+static atomic_t core_pipe_count = ATOMIC_INIT(0);
+
+enum coredump_type_t {
+ COREDUMP_FILE = 1,
+ COREDUMP_PIPE = 2,
+ COREDUMP_SOCK = 3,
+ COREDUMP_SOCK_REQ = 4,
+};
struct core_name {
char *corename;
int used, size;
+ unsigned int core_pipe_limit;
+ bool core_dumped;
+ enum coredump_type_t core_type;
+ u64 mask;
};
static int expand_corename(struct core_name *cn, int size)
@@ -200,35 +228,104 @@ put_exe_file:
return ret;
}
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+/*
+ * coredump_parse will inspect the pattern parameter, and output a name
+ * into corename, which must have space for at least CORENAME_MAX_SIZE
+ * bytes plus one byte for the zero terminator.
*/
-static int format_corename(struct core_name *cn, struct coredump_params *cprm,
+static bool coredump_parse(struct core_name *cn, struct coredump_params *cprm,
size_t **argv, int *argc)
{
const struct cred *cred = current_cred();
const char *pat_ptr = core_pattern;
- int ispipe = (*pat_ptr == '|');
bool was_space = false;
int pid_in_pattern = 0;
int err = 0;
+ cn->mask = COREDUMP_KERNEL;
+ if (core_pipe_limit)
+ cn->mask |= COREDUMP_WAIT;
cn->used = 0;
cn->corename = NULL;
+ cn->core_pipe_limit = 0;
+ cn->core_dumped = false;
+ if (*pat_ptr == '|')
+ cn->core_type = COREDUMP_PIPE;
+ else if (*pat_ptr == '@')
+ cn->core_type = COREDUMP_SOCK;
+ else
+ cn->core_type = COREDUMP_FILE;
if (expand_corename(cn, core_name_size))
- return -ENOMEM;
+ return false;
cn->corename[0] = '\0';
- if (ispipe) {
+ switch (cn->core_type) {
+ case COREDUMP_PIPE: {
int argvs = sizeof(core_pattern) / 2;
(*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL);
if (!(*argv))
- return -ENOMEM;
+ return false;
(*argv)[(*argc)++] = 0;
++pat_ptr;
if (!(*pat_ptr))
- return -ENOMEM;
+ return false;
+ break;
+ }
+ case COREDUMP_SOCK: {
+ /* skip the @ */
+ pat_ptr++;
+ if (!(*pat_ptr))
+ return false;
+ if (*pat_ptr == '@') {
+ pat_ptr++;
+ if (!(*pat_ptr))
+ return false;
+
+ cn->core_type = COREDUMP_SOCK_REQ;
+ }
+
+ err = cn_printf(cn, "%s", pat_ptr);
+ if (err)
+ return false;
+
+ /* Require absolute paths. */
+ if (cn->corename[0] != '/')
+ return false;
+
+ /*
+ * Ensure we can uses spaces to indicate additional
+ * parameters in the future.
+ */
+ if (strchr(cn->corename, ' ')) {
+ coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename);
+ return false;
+ }
+
+ /* Must not contain ".." in the path. */
+ if (name_contains_dotdot(cn->corename)) {
+ coredump_report_failure("Coredump socket may not %s contain '..' spaces", cn->corename);
+ return false;
+ }
+
+ if (strlen(cn->corename) >= UNIX_PATH_MAX) {
+ coredump_report_failure("Coredump socket path %s too long", cn->corename);
+ return false;
+ }
+
+ /*
+ * Currently no need to parse any other options.
+ * Relevant information can be retrieved from the peer
+ * pidfd retrievable via SO_PEERPIDFD by the receiver or
+ * via /proc/<pid>, using the SO_PEERPIDFD to guard
+ * against pid recycling when opening /proc/<pid>.
+ */
+ return true;
+ }
+ case COREDUMP_FILE:
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return false;
}
/* Repeat as long as we have more pattern to process and more output
@@ -238,7 +335,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
* Split on spaces before doing template expansion so that
* %e and %E don't get split if they have spaces in them
*/
- if (ispipe) {
+ if (cn->core_type == COREDUMP_PIPE) {
if (isspace(*pat_ptr)) {
if (cn->used != 0)
was_space = true;
@@ -248,7 +345,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
was_space = false;
err = cn_printf(cn, "%c", '\0');
if (err)
- return err;
+ return false;
(*argv)[(*argc)++] = cn->used;
}
}
@@ -338,6 +435,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
case 'C':
err = cn_printf(cn, "%d", cprm->cpu);
break;
+ /* pidfd number */
+ case 'F': {
+ /*
+ * Installing a pidfd only makes sense if
+ * we actually spawn a usermode helper.
+ */
+ if (cn->core_type != COREDUMP_PIPE)
+ break;
+
+ /*
+ * Note that we'll install a pidfd for the
+ * thread-group leader. We know that task
+ * linkage hasn't been removed yet and even if
+ * this @current isn't the actual thread-group
+ * leader we know that the thread-group leader
+ * cannot be reaped until @current has exited.
+ */
+ cprm->pid = task_tgid(current);
+ err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER);
+ break;
+ }
default:
break;
}
@@ -345,7 +463,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm,
}
if (err)
- return err;
+ return false;
}
out:
@@ -354,12 +472,10 @@ out:
* If core_pattern does not include a %p (as is the default)
* and core_uses_pid is set, then .%pid will be appended to
* the filename. Do not do this for piped commands. */
- if (!ispipe && !pid_in_pattern && core_uses_pid) {
- err = cn_printf(cn, ".%d", task_tgid_vnr(current));
- if (err)
- return err;
- }
- return ispipe;
+ if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid)
+ return cn_printf(cn, ".%d", task_tgid_vnr(current)) == 0;
+
+ return true;
}
static int zap_process(struct signal_struct *signal, int exit_code)
@@ -492,7 +608,7 @@ static void wait_for_dump_helpers(struct file *file)
}
/*
- * umh_pipe_setup
+ * umh_coredump_setup
* helper function to customize the process used
* to collect the core in userspace. Specifically
* it sets up a pipe and installs it as fd 0 (stdin)
@@ -502,11 +618,34 @@ static void wait_for_dump_helpers(struct file *file)
* is a special value that we use to trap recursive
* core dumps
*/
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+static int umh_coredump_setup(struct subprocess_info *info, struct cred *new)
{
struct file *files[2];
struct coredump_params *cp = (struct coredump_params *)info->data;
- int err = create_pipe_files(files, 0);
+ int err;
+
+ if (cp->pid) {
+ struct file *pidfs_file __free(fput) = NULL;
+
+ pidfs_file = pidfs_alloc_file(cp->pid, 0);
+ if (IS_ERR(pidfs_file))
+ return PTR_ERR(pidfs_file);
+
+ pidfs_coredump(cp);
+
+ /*
+ * Usermode helpers are childen of either
+ * system_unbound_wq or of kthreadd. So we know that
+ * we're starting off with a clean file descriptor
+ * table. So we should always be able to use
+ * COREDUMP_PIDFD_NUMBER as our file descriptor value.
+ */
+ err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0);
+ if (err < 0)
+ return err;
+ }
+
+ err = create_pipe_files(files, 0);
if (err)
return err;
@@ -514,28 +653,449 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
err = replace_fd(0, files[0], 0);
fput(files[0]);
+ if (err < 0)
+ return err;
+
/* and disallow core files too */
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
- return err;
+ return 0;
+}
+
+#ifdef CONFIG_UNIX
+static bool coredump_sock_connect(struct core_name *cn, struct coredump_params *cprm)
+{
+ struct file *file __free(fput) = NULL;
+ struct sockaddr_un addr = {
+ .sun_family = AF_UNIX,
+ };
+ ssize_t addr_len;
+ int retval;
+ struct socket *socket;
+
+ addr_len = strscpy(addr.sun_path, cn->corename);
+ if (addr_len < 0)
+ return false;
+ addr_len += offsetof(struct sockaddr_un, sun_path) + 1;
+
+ /*
+ * It is possible that the userspace process which is supposed
+ * to handle the coredump and is listening on the AF_UNIX socket
+ * coredumps. Userspace should just mark itself non dumpable.
+ */
+
+ retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket);
+ if (retval < 0)
+ return false;
+
+ file = sock_alloc_file(socket, 0, NULL);
+ if (IS_ERR(file))
+ return false;
+
+ /*
+ * Set the thread-group leader pid which is used for the peer
+ * credentials during connect() below. Then immediately register
+ * it in pidfs...
+ */
+ cprm->pid = task_tgid(current);
+ retval = pidfs_register_pid(cprm->pid);
+ if (retval)
+ return false;
+
+ /*
+ * ... and set the coredump information so userspace has it
+ * available after connect()...
+ */
+ pidfs_coredump(cprm);
+
+ retval = kernel_connect(socket, (struct sockaddr *)(&addr), addr_len,
+ O_NONBLOCK | SOCK_COREDUMP);
+
+ if (retval) {
+ if (retval == -EAGAIN)
+ coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path);
+ else
+ coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval);
+ return false;
+ }
+
+ /* ... and validate that @sk_peer_pid matches @cprm.pid. */
+ if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm->pid))
+ return false;
+
+ cprm->limit = RLIM_INFINITY;
+ cprm->file = no_free_ptr(file);
+
+ return true;
+}
+
+static inline bool coredump_sock_recv(struct file *file, struct coredump_ack *ack, size_t size, int flags)
+{
+ struct msghdr msg = {};
+ struct kvec iov = { .iov_base = ack, .iov_len = size };
+ ssize_t ret;
+
+ memset(ack, 0, size);
+ ret = kernel_recvmsg(sock_from_file(file), &msg, &iov, 1, size, flags);
+ return ret == size;
+}
+
+static inline bool coredump_sock_send(struct file *file, struct coredump_req *req)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ struct kvec iov = { .iov_base = req, .iov_len = sizeof(*req) };
+ ssize_t ret;
+
+ ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(*req));
+ return ret == sizeof(*req);
+}
+
+static_assert(sizeof(enum coredump_mark) == sizeof(__u32));
+
+static inline bool coredump_sock_mark(struct file *file, enum coredump_mark mark)
+{
+ struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
+ struct kvec iov = { .iov_base = &mark, .iov_len = sizeof(mark) };
+ ssize_t ret;
+
+ ret = kernel_sendmsg(sock_from_file(file), &msg, &iov, 1, sizeof(mark));
+ return ret == sizeof(mark);
+}
+
+static inline void coredump_sock_wait(struct file *file)
+{
+ ssize_t n;
+
+ /*
+ * We use a simple read to wait for the coredump processing to
+ * finish. Either the socket is closed or we get sent unexpected
+ * data. In both cases, we're done.
+ */
+ n = __kernel_read(file, &(char){ 0 }, 1, NULL);
+ if (n > 0)
+ coredump_report_failure("Coredump socket had unexpected data");
+ else if (n < 0)
+ coredump_report_failure("Coredump socket failed");
+}
+
+static inline void coredump_sock_shutdown(struct file *file)
+{
+ struct socket *socket;
+
+ socket = sock_from_file(file);
+ if (!socket)
+ return;
+
+ /* Let userspace know we're done processing the coredump. */
+ kernel_sock_shutdown(socket, SHUT_WR);
+}
+
+static bool coredump_sock_request(struct core_name *cn, struct coredump_params *cprm)
+{
+ struct coredump_req req = {
+ .size = sizeof(struct coredump_req),
+ .mask = COREDUMP_KERNEL | COREDUMP_USERSPACE |
+ COREDUMP_REJECT | COREDUMP_WAIT,
+ .size_ack = sizeof(struct coredump_ack),
+ };
+ struct coredump_ack ack = {};
+ ssize_t usize;
+
+ if (cn->core_type != COREDUMP_SOCK_REQ)
+ return true;
+
+ /* Let userspace know what we support. */
+ if (!coredump_sock_send(cprm->file, &req))
+ return false;
+
+ /* Peek the size of the coredump_ack. */
+ if (!coredump_sock_recv(cprm->file, &ack, sizeof(ack.size),
+ MSG_PEEK | MSG_WAITALL))
+ return false;
+
+ /* Refuse unknown coredump_ack sizes. */
+ usize = ack.size;
+ if (usize < COREDUMP_ACK_SIZE_VER0) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_MINSIZE);
+ return false;
+ }
+
+ if (usize > sizeof(ack)) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_MAXSIZE);
+ return false;
+ }
+
+ /* Now retrieve the coredump_ack. */
+ if (!coredump_sock_recv(cprm->file, &ack, usize, MSG_WAITALL))
+ return false;
+ if (ack.size != usize)
+ return false;
+
+ /* Refuse unknown coredump_ack flags. */
+ if (ack.mask & ~req.mask) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+ return false;
+ }
+
+ /* Refuse mutually exclusive options. */
+ if (hweight64(ack.mask & (COREDUMP_USERSPACE | COREDUMP_KERNEL |
+ COREDUMP_REJECT)) != 1) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_CONFLICTING);
+ return false;
+ }
+
+ if (ack.spare) {
+ coredump_sock_mark(cprm->file, COREDUMP_MARK_UNSUPPORTED);
+ return false;
+ }
+
+ cn->mask = ack.mask;
+ return coredump_sock_mark(cprm->file, COREDUMP_MARK_REQACK);
+}
+
+static bool coredump_socket(struct core_name *cn, struct coredump_params *cprm)
+{
+ if (!coredump_sock_connect(cn, cprm))
+ return false;
+
+ return coredump_sock_request(cn, cprm);
+}
+#else
+static inline void coredump_sock_wait(struct file *file) { }
+static inline void coredump_sock_shutdown(struct file *file) { }
+static inline bool coredump_socket(struct core_name *cn, struct coredump_params *cprm) { return false; }
+#endif
+
+/* cprm->mm_flags contains a stable snapshot of dumpability flags. */
+static inline bool coredump_force_suid_safe(const struct coredump_params *cprm)
+{
+ /* Require nonrelative corefile path and be extra careful. */
+ return __get_dumpable(cprm->mm_flags) == SUID_DUMP_ROOT;
+}
+
+static bool coredump_file(struct core_name *cn, struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+ struct mnt_idmap *idmap;
+ struct inode *inode;
+ struct file *file __free(fput) = NULL;
+ int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | O_LARGEFILE | O_EXCL;
+
+ if (cprm->limit < binfmt->min_coredump)
+ return false;
+
+ if (coredump_force_suid_safe(cprm) && cn->corename[0] != '/') {
+ coredump_report_failure("this process can only dump core to a fully qualified path, skipping core dump");
+ return false;
+ }
+
+ /*
+ * Unlink the file if it exists unless this is a SUID
+ * binary - in that case, we're running around with root
+ * privs and don't want to unlink another user's coredump.
+ */
+ if (!coredump_force_suid_safe(cprm)) {
+ /*
+ * If it doesn't exist, that's fine. If there's some
+ * other problem, we'll catch it at the filp_open().
+ */
+ do_unlinkat(AT_FDCWD, getname_kernel(cn->corename));
+ }
+
+ /*
+ * There is a race between unlinking and creating the
+ * file, but if that causes an EEXIST here, that's
+ * fine - another process raced with us while creating
+ * the corefile, and the other process won. To userspace,
+ * what matters is that at least one of the two processes
+ * writes its coredump successfully, not which one.
+ */
+ if (coredump_force_suid_safe(cprm)) {
+ /*
+ * Using user namespaces, normal user tasks can change
+ * their current->fs->root to point to arbitrary
+ * directories. Since the intention of the "only dump
+ * with a fully qualified path" rule is to control where
+ * coredumps may be placed using root privileges,
+ * current->fs->root must not be used. Instead, use the
+ * root directory of init_task.
+ */
+ struct path root;
+
+ task_lock(&init_task);
+ get_fs_root(init_task.fs, &root);
+ task_unlock(&init_task);
+ file = file_open_root(&root, cn->corename, open_flags, 0600);
+ path_put(&root);
+ } else {
+ file = filp_open(cn->corename, open_flags, 0600);
+ }
+ if (IS_ERR(file))
+ return false;
+
+ inode = file_inode(file);
+ if (inode->i_nlink > 1)
+ return false;
+ if (d_unhashed(file->f_path.dentry))
+ return false;
+ /*
+ * AK: actually i see no reason to not allow this for named
+ * pipes etc, but keep the previous behaviour for now.
+ */
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ /*
+ * Don't dump core if the filesystem changed owner or mode
+ * of the file during file creation. This is an issue when
+ * a process dumps core while its cwd is e.g. on a vfat
+ * filesystem.
+ */
+ idmap = file_mnt_idmap(file);
+ if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) {
+ coredump_report_failure("Core dump to %s aborted: cannot preserve file owner", cn->corename);
+ return false;
+ }
+ if ((inode->i_mode & 0677) != 0600) {
+ coredump_report_failure("Core dump to %s aborted: cannot preserve file permissions", cn->corename);
+ return false;
+ }
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return false;
+ if (do_truncate(idmap, file->f_path.dentry, 0, 0, file))
+ return false;
+
+ cprm->file = no_free_ptr(file);
+ return true;
+}
+
+static bool coredump_pipe(struct core_name *cn, struct coredump_params *cprm,
+ size_t *argv, int argc)
+{
+ int argi;
+ char **helper_argv __free(kfree) = NULL;
+ struct subprocess_info *sub_info;
+
+ if (cprm->limit == 1) {
+ /* See umh_coredump_setup() which sets RLIMIT_CORE = 1.
+ *
+ * Normally core limits are irrelevant to pipes, since
+ * we're not writing to the file system, but we use
+ * cprm.limit of 1 here as a special value, this is a
+ * consistent way to catch recursive crashes.
+ * We can still crash if the core_pattern binary sets
+ * RLIM_CORE = !1, but it runs as root, and can do
+ * lots of stupid things.
+ *
+ * Note that we use task_tgid_vnr here to grab the pid
+ * of the process group leader. That way we get the
+ * right pid if a thread in a multi-threaded
+ * core_pattern process dies.
+ */
+ coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
+ return false;
+ }
+ cprm->limit = RLIM_INFINITY;
+
+ cn->core_pipe_limit = atomic_inc_return(&core_pipe_count);
+ if (core_pipe_limit && (core_pipe_limit < cn->core_pipe_limit)) {
+ coredump_report_failure("over core_pipe_limit, skipping core dump");
+ return false;
+ }
+
+ helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), GFP_KERNEL);
+ if (!helper_argv) {
+ coredump_report_failure("%s failed to allocate memory", __func__);
+ return false;
+ }
+ for (argi = 0; argi < argc; argi++)
+ helper_argv[argi] = cn->corename + argv[argi];
+ helper_argv[argi] = NULL;
+
+ sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL,
+ GFP_KERNEL, umh_coredump_setup,
+ NULL, cprm);
+ if (!sub_info)
+ return false;
+
+ if (call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC)) {
+ coredump_report_failure("|%s pipe failed", cn->corename);
+ return false;
+ }
+
+ /*
+ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+ * have this set to NULL.
+ */
+ if (!cprm->file) {
+ coredump_report_failure("Core dump to |%s disabled", cn->corename);
+ return false;
+ }
+
+ return true;
+}
+
+static bool coredump_write(struct core_name *cn,
+ struct coredump_params *cprm,
+ struct linux_binfmt *binfmt)
+{
+
+ if (dump_interrupted())
+ return true;
+
+ if (!dump_vma_snapshot(cprm))
+ return false;
+
+ file_start_write(cprm->file);
+ cn->core_dumped = binfmt->core_dump(cprm);
+ /*
+ * Ensures that file size is big enough to contain the current
+ * file postion. This prevents gdb from complaining about
+ * a truncated file if the last "write" to the file was
+ * dump_skip.
+ */
+ if (cprm->to_skip) {
+ cprm->to_skip--;
+ dump_emit(cprm, "", 1);
+ }
+ file_end_write(cprm->file);
+ free_vma_snapshot(cprm);
+ return true;
}
-void do_coredump(const kernel_siginfo_t *siginfo)
+static void coredump_cleanup(struct core_name *cn, struct coredump_params *cprm)
{
+ if (cprm->file)
+ filp_close(cprm->file, NULL);
+ if (cn->core_pipe_limit) {
+ VFS_WARN_ON_ONCE(cn->core_type != COREDUMP_PIPE);
+ atomic_dec(&core_pipe_count);
+ }
+ kfree(cn->corename);
+ coredump_finish(cn->core_dumped);
+}
+
+static inline bool coredump_skip(const struct coredump_params *cprm,
+ const struct linux_binfmt *binfmt)
+{
+ if (!binfmt)
+ return true;
+ if (!binfmt->core_dump)
+ return true;
+ if (!__get_dumpable(cprm->mm_flags))
+ return true;
+ return false;
+}
+
+void vfs_coredump(const kernel_siginfo_t *siginfo)
+{
+ struct cred *cred __free(put_cred) = NULL;
+ size_t *argv __free(kfree) = NULL;
struct core_state core_state;
struct core_name cn;
struct mm_struct *mm = current->mm;
- struct linux_binfmt * binfmt;
+ struct linux_binfmt *binfmt = mm->binfmt;
const struct cred *old_cred;
- struct cred *cred;
- int retval = 0;
- int ispipe;
- size_t *argv = NULL;
int argc = 0;
- /* require nonrelative corefile path and be extra careful */
- bool need_suid_safe = false;
- bool core_dumped = false;
- static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
.limit = rlimit(RLIMIT_CORE),
@@ -551,240 +1111,92 @@ void do_coredump(const kernel_siginfo_t *siginfo)
audit_core_dumps(siginfo->si_signo);
- binfmt = mm->binfmt;
- if (!binfmt || !binfmt->core_dump)
- goto fail;
- if (!__get_dumpable(cprm.mm_flags))
- goto fail;
+ if (coredump_skip(&cprm, binfmt))
+ return;
cred = prepare_creds();
if (!cred)
- goto fail;
+ return;
/*
* We cannot trust fsuid as being the "true" uid of the process
* nor do we know its entire history. We only know it was tainted
* so we dump it as root in mode 2, and only into a controlled
* environment (pipe handler or fully qualified path).
*/
- if (__get_dumpable(cprm.mm_flags) == SUID_DUMP_ROOT) {
- /* Setuid core dump mode */
- cred->fsuid = GLOBAL_ROOT_UID; /* Dump root private */
- need_suid_safe = true;
- }
+ if (coredump_force_suid_safe(&cprm))
+ cred->fsuid = GLOBAL_ROOT_UID;
- retval = coredump_wait(siginfo->si_signo, &core_state);
- if (retval < 0)
- goto fail_creds;
+ if (coredump_wait(siginfo->si_signo, &core_state) < 0)
+ return;
old_cred = override_creds(cred);
- ispipe = format_corename(&cn, &cprm, &argv, &argc);
-
- if (ispipe) {
- int argi;
- int dump_count;
- char **helper_argv;
- struct subprocess_info *sub_info;
-
- if (ispipe < 0) {
- coredump_report_failure("format_corename failed, aborting core");
- goto fail_unlock;
- }
-
- if (cprm.limit == 1) {
- /* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
- *
- * Normally core limits are irrelevant to pipes, since
- * we're not writing to the file system, but we use
- * cprm.limit of 1 here as a special value, this is a
- * consistent way to catch recursive crashes.
- * We can still crash if the core_pattern binary sets
- * RLIM_CORE = !1, but it runs as root, and can do
- * lots of stupid things.
- *
- * Note that we use task_tgid_vnr here to grab the pid
- * of the process group leader. That way we get the
- * right pid if a thread in a multi-threaded
- * core_pattern process dies.
- */
- coredump_report_failure("RLIMIT_CORE is set to 1, aborting core");
- goto fail_unlock;
- }
- cprm.limit = RLIM_INFINITY;
-
- dump_count = atomic_inc_return(&core_dump_count);
- if (core_pipe_limit && (core_pipe_limit < dump_count)) {
- coredump_report_failure("over core_pipe_limit, skipping core dump");
- goto fail_dropcount;
- }
-
- helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv),
- GFP_KERNEL);
- if (!helper_argv) {
- coredump_report_failure("%s failed to allocate memory", __func__);
- goto fail_dropcount;
- }
- for (argi = 0; argi < argc; argi++)
- helper_argv[argi] = cn.corename + argv[argi];
- helper_argv[argi] = NULL;
-
- retval = -ENOMEM;
- sub_info = call_usermodehelper_setup(helper_argv[0],
- helper_argv, NULL, GFP_KERNEL,
- umh_pipe_setup, NULL, &cprm);
- if (sub_info)
- retval = call_usermodehelper_exec(sub_info,
- UMH_WAIT_EXEC);
-
- kfree(helper_argv);
- if (retval) {
- coredump_report_failure("|%s pipe failed", cn.corename);
- goto close_fail;
- }
- } else {
- struct mnt_idmap *idmap;
- struct inode *inode;
- int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW |
- O_LARGEFILE | O_EXCL;
-
- if (cprm.limit < binfmt->min_coredump)
- goto fail_unlock;
-
- if (need_suid_safe && cn.corename[0] != '/') {
- coredump_report_failure(
- "this process can only dump core to a fully qualified path, skipping core dump");
- goto fail_unlock;
- }
-
- /*
- * Unlink the file if it exists unless this is a SUID
- * binary - in that case, we're running around with root
- * privs and don't want to unlink another user's coredump.
- */
- if (!need_suid_safe) {
- /*
- * If it doesn't exist, that's fine. If there's some
- * other problem, we'll catch it at the filp_open().
- */
- do_unlinkat(AT_FDCWD, getname_kernel(cn.corename));
- }
-
- /*
- * There is a race between unlinking and creating the
- * file, but if that causes an EEXIST here, that's
- * fine - another process raced with us while creating
- * the corefile, and the other process won. To userspace,
- * what matters is that at least one of the two processes
- * writes its coredump successfully, not which one.
- */
- if (need_suid_safe) {
- /*
- * Using user namespaces, normal user tasks can change
- * their current->fs->root to point to arbitrary
- * directories. Since the intention of the "only dump
- * with a fully qualified path" rule is to control where
- * coredumps may be placed using root privileges,
- * current->fs->root must not be used. Instead, use the
- * root directory of init_task.
- */
- struct path root;
-
- task_lock(&init_task);
- get_fs_root(init_task.fs, &root);
- task_unlock(&init_task);
- cprm.file = file_open_root(&root, cn.corename,
- open_flags, 0600);
- path_put(&root);
- } else {
- cprm.file = filp_open(cn.corename, open_flags, 0600);
- }
- if (IS_ERR(cprm.file))
- goto fail_unlock;
+ if (!coredump_parse(&cn, &cprm, &argv, &argc)) {
+ coredump_report_failure("format_corename failed, aborting core");
+ goto close_fail;
+ }
- inode = file_inode(cprm.file);
- if (inode->i_nlink > 1)
+ switch (cn.core_type) {
+ case COREDUMP_FILE:
+ if (!coredump_file(&cn, &cprm, binfmt))
goto close_fail;
- if (d_unhashed(cprm.file->f_path.dentry))
+ break;
+ case COREDUMP_PIPE:
+ if (!coredump_pipe(&cn, &cprm, argv, argc))
goto close_fail;
- /*
- * AK: actually i see no reason to not allow this for named
- * pipes etc, but keep the previous behaviour for now.
- */
- if (!S_ISREG(inode->i_mode))
- goto close_fail;
- /*
- * Don't dump core if the filesystem changed owner or mode
- * of the file during file creation. This is an issue when
- * a process dumps core while its cwd is e.g. on a vfat
- * filesystem.
- */
- idmap = file_mnt_idmap(cprm.file);
- if (!vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode),
- current_fsuid())) {
- coredump_report_failure("Core dump to %s aborted: "
- "cannot preserve file owner", cn.corename);
- goto close_fail;
- }
- if ((inode->i_mode & 0677) != 0600) {
- coredump_report_failure("Core dump to %s aborted: "
- "cannot preserve file permissions", cn.corename);
- goto close_fail;
- }
- if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
- goto close_fail;
- if (do_truncate(idmap, cprm.file->f_path.dentry,
- 0, 0, cprm.file))
+ break;
+ case COREDUMP_SOCK_REQ:
+ fallthrough;
+ case COREDUMP_SOCK:
+ if (!coredump_socket(&cn, &cprm))
goto close_fail;
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ goto close_fail;
}
+ /* Don't even generate the coredump. */
+ if (cn.mask & COREDUMP_REJECT)
+ goto close_fail;
+
/* get us an unshared descriptor table; almost always a no-op */
/* The cell spufs coredump code reads the file descriptor tables */
- retval = unshare_files();
- if (retval)
+ if (unshare_files())
goto close_fail;
- if (!dump_interrupted()) {
- /*
- * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
- * have this set to NULL.
- */
- if (!cprm.file) {
- coredump_report_failure("Core dump to |%s disabled", cn.corename);
- goto close_fail;
- }
- if (!dump_vma_snapshot(&cprm))
- goto close_fail;
- file_start_write(cprm.file);
- core_dumped = binfmt->core_dump(&cprm);
- /*
- * Ensures that file size is big enough to contain the current
- * file postion. This prevents gdb from complaining about
- * a truncated file if the last "write" to the file was
- * dump_skip.
- */
- if (cprm.to_skip) {
- cprm.to_skip--;
- dump_emit(&cprm, "", 1);
+ if ((cn.mask & COREDUMP_KERNEL) && !coredump_write(&cn, &cprm, binfmt))
+ goto close_fail;
+
+ coredump_sock_shutdown(cprm.file);
+
+ /* Let the parent know that a coredump was generated. */
+ if (cn.mask & COREDUMP_USERSPACE)
+ cn.core_dumped = true;
+
+ /*
+ * When core_pipe_limit is set we wait for the coredump server
+ * or usermodehelper to finish before exiting so it can e.g.,
+ * inspect /proc/<pid>.
+ */
+ if (cn.mask & COREDUMP_WAIT) {
+ switch (cn.core_type) {
+ case COREDUMP_PIPE:
+ wait_for_dump_helpers(cprm.file);
+ break;
+ case COREDUMP_SOCK_REQ:
+ fallthrough;
+ case COREDUMP_SOCK:
+ coredump_sock_wait(cprm.file);
+ break;
+ default:
+ break;
}
- file_end_write(cprm.file);
- free_vma_snapshot(&cprm);
}
- if (ispipe && core_pipe_limit)
- wait_for_dump_helpers(cprm.file);
+
close_fail:
- if (cprm.file)
- filp_close(cprm.file, NULL);
-fail_dropcount:
- if (ispipe)
- atomic_dec(&core_dump_count);
-fail_unlock:
- kfree(argv);
- kfree(cn.corename);
- coredump_finish(core_dumped);
+ coredump_cleanup(&cn, &cprm);
revert_creds(old_cred);
-fail_creds:
- put_cred(cred);
-fail:
return;
}
@@ -798,10 +1210,9 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
struct file *file = cprm->file;
loff_t pos = file->f_pos;
ssize_t n;
+
if (cprm->written + nr > cprm->limit)
return 0;
-
-
if (dump_interrupted())
return 0;
n = __kernel_write(file, addr, nr, &pos);
@@ -818,20 +1229,21 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
{
static char zeroes[PAGE_SIZE];
struct file *file = cprm->file;
+
if (file->f_mode & FMODE_LSEEK) {
- if (dump_interrupted() ||
- vfs_llseek(file, nr, SEEK_CUR) < 0)
+ if (dump_interrupted() || vfs_llseek(file, nr, SEEK_CUR) < 0)
return 0;
cprm->pos += nr;
return 1;
- } else {
- while (nr > PAGE_SIZE) {
- if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
- return 0;
- nr -= PAGE_SIZE;
- }
- return __dump_emit(cprm, zeroes, nr);
}
+
+ while (nr > PAGE_SIZE) {
+ if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
+ return 0;
+ nr -= PAGE_SIZE;
+ }
+
+ return __dump_emit(cprm, zeroes, nr);
}
int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
@@ -925,14 +1337,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
{
unsigned long addr;
struct page *dump_page;
+ int locked, ret;
dump_page = dump_page_alloc();
if (!dump_page)
return 0;
+ ret = 0;
+ locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
+ if (!locked) {
+ if (mmap_read_lock_killable(current->mm))
+ goto out;
+ locked = 1;
+ }
+
/*
* To avoid having to allocate page tables for virtual address
* ranges that have never been used yet, and also to make it
@@ -940,21 +1361,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
* NULL when encountering an empty page table entry that would
* otherwise have been filled with the zero page.
*/
- page = get_dump_page(addr);
+ page = get_dump_page(addr, &locked);
if (page) {
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop) {
- dump_page_free(dump_page);
- return 0;
- }
+ if (stop)
+ goto out;
} else {
dump_skip(cprm, PAGE_SIZE);
}
+
+ if (dump_interrupted())
+ goto out;
+
+ if (!need_resched())
+ continue;
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
cond_resched();
}
+ ret = 1;
+out:
+ if (locked)
+ mmap_read_unlock(current->mm);
+
dump_page_free(dump_page);
- return 1;
+ return ret;
}
#endif
@@ -974,7 +1412,7 @@ EXPORT_SYMBOL(dump_align);
void validate_coredump_safety(void)
{
if (suid_dumpable == SUID_DUMP_ROOT &&
- core_pattern[0] != '/' && core_pattern[0] != '|') {
+ core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') {
coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: "
"pipe handler or fully qualified core dump path required. "
@@ -982,18 +1420,74 @@ void validate_coredump_safety(void)
}
}
+static inline bool check_coredump_socket(void)
+{
+ const char *p;
+
+ if (core_pattern[0] != '@')
+ return true;
+
+ /*
+ * Coredump socket must be located in the initial mount
+ * namespace. Don't give the impression that anything else is
+ * supported right now.
+ */
+ if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns)
+ return false;
+
+ /* Must be an absolute path... */
+ if (core_pattern[1] != '/') {
+ /* ... or the socket request protocol... */
+ if (core_pattern[1] != '@')
+ return false;
+ /* ... and if so must be an absolute path. */
+ if (core_pattern[2] != '/')
+ return false;
+ p = &core_pattern[2];
+ } else {
+ p = &core_pattern[1];
+ }
+
+ /* The path obviously cannot exceed UNIX_PATH_MAX. */
+ if (strlen(p) >= UNIX_PATH_MAX)
+ return false;
+
+ /* Must not contain ".." in the path. */
+ if (name_contains_dotdot(core_pattern))
+ return false;
+
+ return true;
+}
+
static int proc_dostring_coredump(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int error = proc_dostring(table, write, buffer, lenp, ppos);
+ int error;
+ ssize_t retval;
+ char old_core_pattern[CORENAME_MAX_SIZE];
+
+ retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE);
+
+ error = proc_dostring(table, write, buffer, lenp, ppos);
+ if (error)
+ return error;
+ if (!check_coredump_socket()) {
+ strscpy(core_pattern, old_core_pattern, retval + 1);
+ return -EINVAL;
+ }
- if (!error)
- validate_coredump_safety();
+ validate_coredump_safety();
return error;
}
static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT;
static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX;
+static char core_modes[] = {
+ "file\npipe"
+#ifdef CONFIG_UNIX
+ "\nsocket"
+#endif
+};
static const struct ctl_table coredump_sysctls[] = {
{
@@ -1015,7 +1509,9 @@ static const struct ctl_table coredump_sysctls[] = {
.data = &core_pipe_limit,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "core_file_note_size_limit",
@@ -1026,6 +1522,22 @@ static const struct ctl_table coredump_sysctls[] = {
.extra1 = (unsigned int *)&core_file_note_size_min,
.extra2 = (unsigned int *)&core_file_note_size_max,
},
+ {
+ .procname = "core_sort_vma",
+ .data = &core_sort_vma,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "core_modes",
+ .data = core_modes,
+ .maxlen = sizeof(core_modes) - 1,
+ .mode = 0444,
+ .proc_handler = proc_dostring,
+ },
};
static int __init init_fs_coredump_sysctls(void)
@@ -1256,8 +1768,9 @@ static bool dump_vma_snapshot(struct coredump_params *cprm)
cprm->vma_data_size += m->dump_size;
}
- sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta),
- cmp_vma_size, NULL);
+ if (core_sort_vma)
+ sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta),
+ cmp_vma_size, NULL);
return true;
}
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b84d1747a020..b002e9b734f9 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,7 +17,6 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
-#include <linux/pfn_t.h>
#include <linux/ramfs.h>
#include <linux/init.h>
#include <linux/string.h>
@@ -412,8 +411,8 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
for (i = 0; i < pages && !ret; i++) {
vm_fault_t vmf;
unsigned long off = i * PAGE_SIZE;
- pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV);
- vmf = vmf_insert_mixed(vma, vma->vm_start + off, pfn);
+ vmf = vmf_insert_mixed(vma, vma->vm_start + off,
+ address + off);
if (vmf & VM_FAULT_ERROR)
ret = vm_fault_to_errno(vmf, 0);
}
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 5aff5934baa1..b5dfb0aa405a 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -3,6 +3,7 @@ config FS_ENCRYPTION
bool "FS Encryption (Per-file encryption)"
select CRYPTO
select CRYPTO_HASH
+ select CRYPTO_HKDF
select CRYPTO_SKCIPHER
select CRYPTO_LIB_SHA256
select KEYS
@@ -24,20 +25,16 @@ config FS_ENCRYPTION
#
# Also note that this option only pulls in the generic implementations of the
# algorithms, not any per-architecture optimized implementations. It is
-# strongly recommended to enable optimized implementations too. It is safe to
-# disable these generic implementations if corresponding optimized
-# implementations will always be available too; for this reason, these are soft
-# dependencies ('imply' rather than 'select'). Only disable these generic
-# implementations if you're sure they will never be needed, though.
+# strongly recommended to enable optimized implementations too.
config FS_ENCRYPTION_ALGS
tristate
- imply CRYPTO_AES
- imply CRYPTO_CBC
- imply CRYPTO_CTS
- imply CRYPTO_ECB
- imply CRYPTO_HMAC
- imply CRYPTO_SHA512
- imply CRYPTO_XTS
+ select CRYPTO_AES
+ select CRYPTO_CBC
+ select CRYPTO_CTS
+ select CRYPTO_ECB
+ select CRYPTO_HMAC
+ select CRYPTO_SHA512
+ select CRYPTO_XTS
config FS_ENCRYPTION_INLINE_CRYPT
bool "Enable fscrypt to use inline crypto"
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
index 0ad8c30b8fa5..486fcb2ecf13 100644
--- a/fs/crypto/bio.c
+++ b/fs/crypto/bio.c
@@ -7,10 +7,12 @@
* Copyright (C) 2015, Motorola Mobility
*/
-#include <linux/pagemap.h>
-#include <linux/module.h>
#include <linux/bio.h>
+#include <linux/export.h>
+#include <linux/module.h>
#include <linux/namei.h>
+#include <linux/pagemap.h>
+
#include "fscrypt_private.h"
/**
@@ -165,8 +167,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
do {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index,
ZERO_PAGE(0), pages[i],
- du_size, offset,
- GFP_NOFS);
+ du_size, offset);
if (err)
goto out;
du_index++;
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index 328470d40dec..b6ccab524fde 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -20,12 +20,14 @@
* Special Publication 800-38E and IEEE P1619/D16.
*/
-#include <linux/pagemap.h>
+#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/module.h>
-#include <linux/scatterlist.h>
+#include <linux/pagemap.h>
#include <linux/ratelimit.h>
-#include <crypto/skcipher.h>
+#include <linux/scatterlist.h>
+
#include "fscrypt_private.h"
static unsigned int num_prealloc_crypto_pages = 32;
@@ -108,15 +110,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_direction_t rw, u64 index,
struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags)
+ unsigned int len, unsigned int offs)
{
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
struct scatterlist dst, src;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
- int res = 0;
+ int err;
if (WARN_ON_ONCE(len <= 0))
return -EINVAL;
@@ -125,36 +125,28 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_generate_iv(&iv, index, ci);
- req = skcipher_request_alloc(tfm, gfp_flags);
- if (!req)
- return -ENOMEM;
-
skcipher_request_set_callback(
req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
-
+ NULL, NULL);
sg_init_table(&dst, 1);
sg_set_page(&dst, dest_page, len, offs);
sg_init_table(&src, 1);
sg_set_page(&src, src_page, len, offs);
skcipher_request_set_crypt(req, &src, &dst, len, &iv);
if (rw == FS_DECRYPT)
- res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
+ err = crypto_skcipher_decrypt(req);
else
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
- skcipher_request_free(req);
- if (res) {
+ err = crypto_skcipher_encrypt(req);
+ if (err)
fscrypt_err(ci->ci_inode,
"%scryption failed for data unit %llu: %d",
- (rw == FS_DECRYPT ? "De" : "En"), index, res);
- return res;
- }
- return 0;
+ (rw == FS_DECRYPT ? "De" : "En"), index, err);
+ return err;
}
/**
- * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page
- * @page: the locked pagecache page containing the data to encrypt
+ * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio
+ * @folio: the locked pagecache folio containing the data to encrypt
* @len: size of the data to encrypt, in bytes
* @offs: offset within @page of the data to encrypt, in bytes
* @gfp_flags: memory allocation flags; see details below
@@ -177,23 +169,21 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
*
* Return: the new encrypted bounce page on success; an ERR_PTR() on failure
*/
-struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
- unsigned int len,
- unsigned int offs,
- gfp_t gfp_flags)
-
+struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
+ size_t len, size_t offs, gfp_t gfp_flags)
{
- const struct inode *inode = page->mapping->host;
+ const struct inode *inode = folio->mapping->host;
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
const unsigned int du_bits = ci->ci_data_unit_bits;
const unsigned int du_size = 1U << du_bits;
struct page *ciphertext_page;
- u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) +
+ u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) +
(offs >> du_bits);
unsigned int i;
int err;
- if (WARN_ON_ONCE(!PageLocked(page)))
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ if (WARN_ON_ONCE(!folio_test_locked(folio)))
return ERR_PTR(-EINVAL);
if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size)))
@@ -205,15 +195,15 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page,
for (i = offs; i < offs + len; i += du_size, index++) {
err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index,
- page, ciphertext_page,
- du_size, i, gfp_flags);
+ &folio->page, ciphertext_page,
+ du_size, i);
if (err) {
fscrypt_free_bounce_page(ciphertext_page);
return ERR_PTR(err);
}
}
SetPagePrivate(ciphertext_page);
- set_page_private(ciphertext_page, (unsigned long)page);
+ set_page_private(ciphertext_page, (unsigned long)folio);
return ciphertext_page;
}
EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
@@ -227,7 +217,6 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
* @offs: Byte offset within @page at which the block to encrypt begins
* @lblk_num: Filesystem logical block number of the block, i.e. the 0-based
* number of the block within the file
- * @gfp_flags: Memory allocation flags
*
* Encrypt a possibly-compressed filesystem block that is located in an
* arbitrary page, not necessarily in the original pagecache page. The @inode
@@ -239,13 +228,12 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
*/
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
unsigned int len, unsigned int offs,
- u64 lblk_num, gfp_t gfp_flags)
+ u64 lblk_num)
{
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT,
- lblk_num, page, page, len, offs,
- gfp_flags);
+ lblk_num, page, page, len, offs);
}
EXPORT_SYMBOL(fscrypt_encrypt_block_inplace);
@@ -285,8 +273,7 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
struct page *page = folio_page(folio, i >> PAGE_SHIFT);
err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page,
- page, du_size, i & ~PAGE_MASK,
- GFP_NOFS);
+ page, du_size, i & ~PAGE_MASK);
if (err)
return err;
}
@@ -319,8 +306,7 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units))
return -EOPNOTSUPP;
return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT,
- lblk_num, page, page, len, offs,
- GFP_NOFS);
+ lblk_num, page, page, len, offs);
}
EXPORT_SYMBOL(fscrypt_decrypt_block_inplace);
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 010f9c0a4c2f..f9f6713e144f 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -11,11 +11,13 @@
* This has not yet undergone a rigorous security audit.
*/
-#include <linux/namei.h>
-#include <linux/scatterlist.h>
#include <crypto/hash.h>
#include <crypto/sha2.h>
#include <crypto/skcipher.h>
+#include <linux/export.h>
+#include <linux/namei.h>
+#include <linux/scatterlist.h>
+
#include "fscrypt_private.h"
/*
@@ -92,13 +94,12 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
u8 *out, unsigned int olen)
{
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
struct scatterlist sg;
- int res;
+ int err;
/*
* Copy the filename to the output buffer for encrypting in-place and
@@ -109,28 +110,17 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
memcpy(out, iname->name, iname->len);
memset(out + iname->len, 0, olen - iname->len);
- /* Initialize the IV */
fscrypt_generate_iv(&iv, 0, ci);
- /* Set up the encryption request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req)
- return -ENOMEM;
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
sg_init_one(&sg, out, olen);
skcipher_request_set_crypt(req, &sg, &sg, olen, &iv);
-
- /* Do the encryption */
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
- skcipher_request_free(req);
- if (res < 0) {
- fscrypt_err(inode, "Filename encryption failed: %d", res);
- return res;
- }
-
- return 0;
+ err = crypto_skcipher_encrypt(req);
+ if (err)
+ fscrypt_err(inode, "Filename encryption failed: %d", err);
+ return err;
}
EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt);
@@ -148,34 +138,25 @@ static int fname_decrypt(const struct inode *inode,
const struct fscrypt_str *iname,
struct fscrypt_str *oname)
{
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
const struct fscrypt_inode_info *ci = inode->i_crypt_info;
- struct crypto_skcipher *tfm = ci->ci_enc_key.tfm;
+ struct crypto_sync_skcipher *tfm = ci->ci_enc_key.tfm;
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
union fscrypt_iv iv;
- int res;
-
- /* Allocate request */
- req = skcipher_request_alloc(tfm, GFP_NOFS);
- if (!req)
- return -ENOMEM;
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
+ struct scatterlist src_sg, dst_sg;
+ int err;
- /* Initialize IV */
fscrypt_generate_iv(&iv, 0, ci);
- /* Create decryption request */
+ skcipher_request_set_callback(
+ req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
sg_init_one(&src_sg, iname->name, iname->len);
sg_init_one(&dst_sg, oname->name, oname->len);
skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv);
- res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
- skcipher_request_free(req);
- if (res < 0) {
- fscrypt_err(inode, "Filename decryption failed: %d", res);
- return res;
+ err = crypto_skcipher_decrypt(req);
+ if (err) {
+ fscrypt_err(inode, "Filename decryption failed: %d", err);
+ return err;
}
oname->len = strnlen(oname->name, iname->len);
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 8371e4e1f596..d8b485b9881c 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -12,6 +12,7 @@
#define _FSCRYPT_PRIVATE_H
#include <linux/fscrypt.h>
+#include <linux/minmax.h>
#include <linux/siphash.h>
#include <crypto/hash.h>
#include <linux/blk-crypto.h>
@@ -27,6 +28,41 @@
*/
#define FSCRYPT_MIN_KEY_SIZE 16
+/* Maximum size of a raw fscrypt master key */
+#define FSCRYPT_MAX_RAW_KEY_SIZE 64
+
+/* Maximum size of a hardware-wrapped fscrypt master key */
+#define FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE
+
+/* Maximum size of an fscrypt master key across both key types */
+#define FSCRYPT_MAX_ANY_KEY_SIZE \
+ MAX(FSCRYPT_MAX_RAW_KEY_SIZE, FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE)
+
+/*
+ * FSCRYPT_MAX_KEY_SIZE is defined in the UAPI header, but the addition of
+ * hardware-wrapped keys has made it misleading as it's only for raw keys.
+ * Don't use it in kernel code; use one of the above constants instead.
+ */
+#undef FSCRYPT_MAX_KEY_SIZE
+
+/*
+ * This mask is passed as the third argument to the crypto_alloc_*() functions
+ * to prevent fscrypt from using the Crypto API drivers for non-inline crypto
+ * engines. Those drivers have been problematic for fscrypt. fscrypt users
+ * have reported hangs and even incorrect en/decryption with these drivers.
+ * Since going to the driver, off CPU, and back again is really slow, such
+ * drivers can be over 50 times slower than the CPU-based code for fscrypt's
+ * workload. Even on platforms that lack AES instructions on the CPU, using the
+ * offloads has been shown to be slower, even staying with AES. (Of course,
+ * Adiantum is faster still, and is the recommended option on such platforms...)
+ *
+ * Note that fscrypt also supports inline crypto engines. Those don't use the
+ * Crypto API and work much better than the old-style (non-inline) engines.
+ */
+#define FSCRYPT_CRYPTOAPI_MASK \
+ (CRYPTO_ALG_ASYNC | CRYPTO_ALG_ALLOCATES_MEMORY | \
+ CRYPTO_ALG_KERN_DRIVER_ONLY)
+
#define FSCRYPT_CONTEXT_V1 1
#define FSCRYPT_CONTEXT_V2 2
@@ -203,7 +239,7 @@ struct fscrypt_symlink_data {
* Normally only one of the fields will be non-NULL.
*/
struct fscrypt_prepared_key {
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
struct blk_crypto_key *blk_key;
#endif
@@ -301,8 +337,7 @@ int fscrypt_initialize(struct super_block *sb);
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
fscrypt_direction_t rw, u64 index,
struct page *src_page, struct page *dest_page,
- unsigned int len, unsigned int offs,
- gfp_t gfp_flags);
+ unsigned int len, unsigned int offs);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);
void __printf(3, 4) __cold
@@ -360,13 +395,15 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
* outputs are unique and cryptographically isolated, i.e. knowledge of one
* output doesn't reveal another.
*/
-#define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */
+#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY 1 /* info=<empty> */
#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */
#define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */
#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */
#define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */
#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */
#define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */
+#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \
+ 8 /* info=<empty> */
int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
const u8 *info, unsigned int infolen,
@@ -376,7 +413,8 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);
/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
-int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
+ bool is_hw_wrapped_key);
static inline bool
fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
@@ -385,12 +423,17 @@ fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key,
+ const u8 *key_bytes, size_t key_size,
+ bool is_hw_wrapped,
const struct fscrypt_inode_info *ci);
void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key);
+int fscrypt_derive_sw_secret(struct super_block *sb,
+ const u8 *wrapped_key, size_t wrapped_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]);
+
/*
* Check whether the crypto transform or blk-crypto key has been allocated in
* @prep_key, depending on which encryption implementation the file will use.
@@ -414,7 +457,8 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */
-static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
+static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
+ bool is_hw_wrapped_key)
{
return 0;
}
@@ -427,7 +471,8 @@ fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key,
+ const u8 *key_bytes, size_t key_size,
+ bool is_hw_wrapped,
const struct fscrypt_inode_info *ci)
{
WARN_ON_ONCE(1);
@@ -440,6 +485,15 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb,
{
}
+static inline int
+fscrypt_derive_sw_secret(struct super_block *sb,
+ const u8 *wrapped_key, size_t wrapped_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ fscrypt_warn(NULL, "kernel doesn't support hardware-wrapped keys");
+ return -EOPNOTSUPP;
+}
+
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
const struct fscrypt_inode_info *ci)
@@ -456,20 +510,38 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
struct fscrypt_master_key_secret {
/*
- * For v2 policy keys: HKDF context keyed by this master key.
- * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
+ * The KDF with which subkeys of this key can be derived.
+ *
+ * For v1 policy keys, this isn't applicable and won't be set.
+ * Otherwise, this KDF will be keyed by this master key if
+ * ->is_hw_wrapped=false, or by the "software secret" that hardware
+ * derived from this master key if ->is_hw_wrapped=true.
*/
struct fscrypt_hkdf hkdf;
/*
- * Size of the raw key in bytes. This remains set even if ->raw was
+ * True if this key is a hardware-wrapped key; false if this key is a
+ * raw key (i.e. a "software key"). For v1 policy keys this will always
+ * be false, as v1 policy support is a legacy feature which doesn't
+ * support newer functionality such as hardware-wrapped keys.
+ */
+ bool is_hw_wrapped;
+
+ /*
+ * Size of the key in bytes. This remains set even if ->bytes was
* zeroized due to no longer being needed. I.e. we still remember the
* size of the key even if we don't need to remember the key itself.
*/
u32 size;
- /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */
- u8 raw[FSCRYPT_MAX_KEY_SIZE];
+ /*
+ * The bytes of the key, when still needed. This can be either a raw
+ * key or a hardware-wrapped key, as indicated by ->is_hw_wrapped. In
+ * the case of a raw, v2 policy key, there is no need to remember the
+ * actual key separately from ->hkdf so this field will be zeroized as
+ * soon as ->hkdf is initialized.
+ */
+ u8 bytes[FSCRYPT_MAX_ANY_KEY_SIZE];
} __randomize_layout;
diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c
index 5a384dad2c72..b1ef506cd341 100644
--- a/fs/crypto/hkdf.c
+++ b/fs/crypto/hkdf.c
@@ -1,15 +1,14 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
- * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
- * "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
- *
- * This is used to derive keys from the fscrypt master keys.
+ * This is used to derive keys from the fscrypt master keys (or from the
+ * "software secrets" which hardware derives from the fscrypt master keys, in
+ * the case that the fscrypt master keys are hardware-wrapped keys).
*
* Copyright 2019 Google LLC
*/
#include <crypto/hash.h>
+#include <crypto/hkdf.h>
#include <crypto/sha2.h>
#include "fscrypt_private.h"
@@ -44,20 +43,6 @@
* there's no way to persist a random salt per master key from kernel mode.
*/
-/* HKDF-Extract (RFC 5869 section 2.2), unsalted */
-static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
- unsigned int ikmlen, u8 prk[HKDF_HASHLEN])
-{
- static const u8 default_salt[HKDF_HASHLEN];
- int err;
-
- err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN);
- if (err)
- return err;
-
- return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk);
-}
-
/*
* Compute HKDF-Extract using the given master key as the input keying material,
* and prepare an HMAC transform object keyed by the resulting pseudorandom key.
@@ -69,10 +54,11 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
unsigned int master_key_size)
{
struct crypto_shash *hmac_tfm;
+ static const u8 default_salt[HKDF_HASHLEN];
u8 prk[HKDF_HASHLEN];
int err;
- hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, 0);
+ hmac_tfm = crypto_alloc_shash(HKDF_HMAC_ALG, 0, FSCRYPT_CRYPTOAPI_MASK);
if (IS_ERR(hmac_tfm)) {
fscrypt_err(NULL, "Error allocating " HKDF_HMAC_ALG ": %ld",
PTR_ERR(hmac_tfm));
@@ -84,7 +70,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
goto err_free_tfm;
}
- err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk);
+ err = hkdf_extract(hmac_tfm, master_key, master_key_size,
+ default_salt, HKDF_HASHLEN, prk);
if (err)
goto err_free_tfm;
@@ -118,61 +105,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
u8 *okm, unsigned int okmlen)
{
SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm);
- u8 prefix[9];
- unsigned int i;
+ u8 *full_info;
int err;
- const u8 *prev = NULL;
- u8 counter = 1;
- u8 tmp[HKDF_HASHLEN];
-
- if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN))
- return -EINVAL;
+ full_info = kzalloc(infolen + 9, GFP_KERNEL);
+ if (!full_info)
+ return -ENOMEM;
desc->tfm = hkdf->hmac_tfm;
- memcpy(prefix, "fscrypt\0", 8);
- prefix[8] = context;
-
- for (i = 0; i < okmlen; i += HKDF_HASHLEN) {
-
- err = crypto_shash_init(desc);
- if (err)
- goto out;
-
- if (prev) {
- err = crypto_shash_update(desc, prev, HKDF_HASHLEN);
- if (err)
- goto out;
- }
-
- err = crypto_shash_update(desc, prefix, sizeof(prefix));
- if (err)
- goto out;
-
- err = crypto_shash_update(desc, info, infolen);
- if (err)
- goto out;
-
- BUILD_BUG_ON(sizeof(counter) != 1);
- if (okmlen - i < HKDF_HASHLEN) {
- err = crypto_shash_finup(desc, &counter, 1, tmp);
- if (err)
- goto out;
- memcpy(&okm[i], tmp, okmlen - i);
- memzero_explicit(tmp, sizeof(tmp));
- } else {
- err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
- if (err)
- goto out;
- }
- counter++;
- prev = &okm[i];
- }
- err = 0;
-out:
- if (unlikely(err))
- memzero_explicit(okm, okmlen); /* so caller doesn't need to */
- shash_desc_zero(desc);
+ memcpy(full_info, "fscrypt\0", 8);
+ full_info[8] = context;
+ memcpy(full_info + 9, info, infolen);
+
+ err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9,
+ okm, okmlen);
+ kfree_sensitive(full_info);
return err;
}
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index d8d5049b8fe1..e0b32ac841f7 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,6 +5,8 @@
* Encryption hooks for higher-level filesystem operations.
*/
+#include <linux/export.h>
+
#include "fscrypt_private.h"
/**
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 40de69860dcf..caaff809765b 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -15,6 +15,7 @@
#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
+#include <linux/export.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/uio.h>
@@ -89,7 +90,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
}
/* Enable inline encryption for this file if supported. */
-int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
+int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
+ bool is_hw_wrapped_key)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
@@ -130,6 +132,8 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
+ crypto_cfg.key_type = is_hw_wrapped_key ?
+ BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW;
devs = fscrypt_get_devices(sb, &num_devs);
if (IS_ERR(devs))
@@ -150,12 +154,15 @@ out_free_devs:
}
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
- const u8 *raw_key,
+ const u8 *key_bytes, size_t key_size,
+ bool is_hw_wrapped,
const struct fscrypt_inode_info *ci)
{
const struct inode *inode = ci->ci_inode;
struct super_block *sb = inode->i_sb;
enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode;
+ enum blk_crypto_key_type key_type = is_hw_wrapped ?
+ BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW;
struct blk_crypto_key *blk_key;
struct block_device **devs;
unsigned int num_devs;
@@ -166,8 +173,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
if (!blk_key)
return -ENOMEM;
- err = blk_crypto_init_key(blk_key, raw_key, crypto_mode,
- fscrypt_get_dun_bytes(ci),
+ err = blk_crypto_init_key(blk_key, key_bytes, key_size, key_type,
+ crypto_mode, fscrypt_get_dun_bytes(ci),
1U << ci->ci_data_unit_bits);
if (err) {
fscrypt_err(inode, "error %d initializing blk-crypto key", err);
@@ -226,6 +233,34 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
kfree_sensitive(blk_key);
}
+/*
+ * Ask the inline encryption hardware to derive the software secret from a
+ * hardware-wrapped key. Returns -EOPNOTSUPP if hardware-wrapped keys aren't
+ * supported on this filesystem or hardware.
+ */
+int fscrypt_derive_sw_secret(struct super_block *sb,
+ const u8 *wrapped_key, size_t wrapped_key_size,
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE])
+{
+ int err;
+
+ /* The filesystem must be mounted with -o inlinecrypt. */
+ if (!(sb->s_flags & SB_INLINECRYPT)) {
+ fscrypt_warn(NULL,
+ "%s: filesystem not mounted with inlinecrypt\n",
+ sb->s_id);
+ return -EOPNOTSUPP;
+ }
+
+ err = blk_crypto_derive_sw_secret(sb->s_bdev, wrapped_key,
+ wrapped_key_size, sw_secret);
+ if (err == -EOPNOTSUPP)
+ fscrypt_warn(NULL,
+ "%s: block device doesn't support hardware-wrapped keys\n",
+ sb->s_id);
+ return err;
+}
+
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
return inode->i_crypt_info->ci_inlinecrypt;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 787e9c8938ba..7557f6a88b8f 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,12 +18,13 @@
* information about these ioctls.
*/
-#include <linux/unaligned.h>
#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/key-type.h>
-#include <linux/random.h>
#include <linux/once.h>
+#include <linux/random.h>
#include <linux/seq_file.h>
+#include <linux/unaligned.h>
#include "fscrypt_private.h"
@@ -149,11 +150,11 @@ static int fscrypt_user_key_instantiate(struct key *key,
struct key_preparsed_payload *prep)
{
/*
- * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for
- * each key, regardless of the exact key size. The amount of memory
+ * We just charge FSCRYPT_MAX_RAW_KEY_SIZE bytes to the user's key quota
+ * for each key, regardless of the exact key size. The amount of memory
* actually used is greater than the size of the raw key anyway.
*/
- return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE);
+ return key_payload_reserve(key, FSCRYPT_MAX_RAW_KEY_SIZE);
}
static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m)
@@ -558,20 +559,45 @@ static int add_master_key(struct super_block *sb,
int err;
if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
- err = fscrypt_init_hkdf(&secret->hkdf, secret->raw,
- secret->size);
- if (err)
- return err;
+ u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE];
+ u8 *kdf_key = secret->bytes;
+ unsigned int kdf_key_size = secret->size;
+ u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY;
/*
- * Now that the HKDF context is initialized, the raw key is no
- * longer needed.
+ * For raw keys, the fscrypt master key is used directly as the
+ * fscrypt KDF key. For hardware-wrapped keys, we have to pass
+ * the master key to the hardware to derive the KDF key, which
+ * is then only used to derive non-file-contents subkeys.
+ */
+ if (secret->is_hw_wrapped) {
+ err = fscrypt_derive_sw_secret(sb, secret->bytes,
+ secret->size, sw_secret);
+ if (err)
+ return err;
+ kdf_key = sw_secret;
+ kdf_key_size = sizeof(sw_secret);
+ /*
+ * To avoid weird behavior if someone manages to
+ * determine sw_secret and add it as a raw key, ensure
+ * that hardware-wrapped keys and raw keys will have
+ * different key identifiers by deriving their key
+ * identifiers using different KDF contexts.
+ */
+ keyid_kdf_ctx =
+ HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY;
+ }
+ err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size);
+ /*
+ * Now that the KDF context is initialized, the raw KDF key is
+ * no longer needed.
*/
- memzero_explicit(secret->raw, secret->size);
+ memzero_explicit(kdf_key, kdf_key_size);
+ if (err)
+ return err;
/* Calculate the key identifier */
- err = fscrypt_hkdf_expand(&secret->hkdf,
- HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0,
+ err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0,
key_spec->u.identifier,
FSCRYPT_KEY_IDENTIFIER_SIZE);
if (err)
@@ -580,19 +606,36 @@ static int add_master_key(struct super_block *sb,
return do_add_master_key(sb, secret, key_spec);
}
+/*
+ * Validate the size of an fscrypt master key being added. Note that this is
+ * just an initial check, as we don't know which ciphers will be used yet.
+ * There is a stricter size check later when the key is actually used by a file.
+ */
+static inline bool fscrypt_valid_key_size(size_t size, u32 add_key_flags)
+{
+ u32 max_size = (add_key_flags & FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) ?
+ FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE :
+ FSCRYPT_MAX_RAW_KEY_SIZE;
+
+ return size >= FSCRYPT_MIN_KEY_SIZE && size <= max_size;
+}
+
static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep)
{
const struct fscrypt_provisioning_key_payload *payload = prep->data;
- if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE ||
- prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE)
+ if (prep->datalen < sizeof(*payload))
+ return -EINVAL;
+
+ if (!fscrypt_valid_key_size(prep->datalen - sizeof(*payload),
+ payload->flags))
return -EINVAL;
if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
return -EINVAL;
- if (payload->__reserved)
+ if (payload->flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED)
return -EINVAL;
prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL);
@@ -636,21 +679,21 @@ static struct key_type key_type_fscrypt_provisioning = {
};
/*
- * Retrieve the raw key from the Linux keyring key specified by 'key_id', and
- * store it into 'secret'.
+ * Retrieve the key from the Linux keyring key specified by 'key_id', and store
+ * it into 'secret'.
*
- * The key must be of type "fscrypt-provisioning" and must have the field
- * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's
- * only usable with fscrypt with the particular KDF version identified by
- * 'type'. We don't use the "logon" key type because there's no way to
- * completely restrict the use of such keys; they can be used by any kernel API
- * that accepts "logon" keys and doesn't require a specific service prefix.
+ * The key must be of type "fscrypt-provisioning" and must have the 'type' and
+ * 'flags' field of the payload set to the given values, indicating that the key
+ * is intended for use for the specified purpose. We don't use the "logon" key
+ * type because there's no way to completely restrict the use of such keys; they
+ * can be used by any kernel API that accepts "logon" keys and doesn't require a
+ * specific service prefix.
*
* The ability to specify the key via Linux keyring key is intended for cases
* where userspace needs to re-add keys after the filesystem is unmounted and
- * re-mounted. Most users should just provide the raw key directly instead.
+ * re-mounted. Most users should just provide the key directly instead.
*/
-static int get_keyring_key(u32 key_id, u32 type,
+static int get_keyring_key(u32 key_id, u32 type, u32 flags,
struct fscrypt_master_key_secret *secret)
{
key_ref_t ref;
@@ -667,12 +710,16 @@ static int get_keyring_key(u32 key_id, u32 type,
goto bad_key;
payload = key->payload.data[0];
- /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */
- if (payload->type != type)
+ /*
+ * Don't allow fscrypt v1 keys to be used as v2 keys and vice versa.
+ * Similarly, don't allow hardware-wrapped keys to be used as
+ * non-hardware-wrapped keys and vice versa.
+ */
+ if (payload->type != type || payload->flags != flags)
goto bad_key;
secret->size = key->datalen - sizeof(*payload);
- memcpy(secret->raw, payload->raw, secret->size);
+ memcpy(secret->bytes, payload->raw, secret->size);
err = 0;
goto out_put;
@@ -734,19 +781,28 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg)
return -EACCES;
memset(&secret, 0, sizeof(secret));
+
+ if (arg.flags) {
+ if (arg.flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED)
+ return -EINVAL;
+ if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
+ return -EINVAL;
+ secret.is_hw_wrapped = true;
+ }
+
if (arg.key_id) {
if (arg.raw_size != 0)
return -EINVAL;
- err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret);
+ err = get_keyring_key(arg.key_id, arg.key_spec.type, arg.flags,
+ &secret);
if (err)
goto out_wipe_secret;
} else {
- if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE ||
- arg.raw_size > FSCRYPT_MAX_KEY_SIZE)
+ if (!fscrypt_valid_key_size(arg.raw_size, arg.flags))
return -EINVAL;
secret.size = arg.raw_size;
err = -EFAULT;
- if (copy_from_user(secret.raw, uarg->raw, secret.size))
+ if (copy_from_user(secret.bytes, uarg->raw, secret.size))
goto out_wipe_secret;
}
@@ -770,13 +826,13 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key);
static void
fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret)
{
- static u8 test_key[FSCRYPT_MAX_KEY_SIZE];
+ static u8 test_key[FSCRYPT_MAX_RAW_KEY_SIZE];
- get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE);
+ get_random_once(test_key, sizeof(test_key));
memset(secret, 0, sizeof(*secret));
- secret->size = FSCRYPT_MAX_KEY_SIZE;
- memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE);
+ secret->size = sizeof(test_key);
+ memcpy(secret->bytes, test_key, sizeof(test_key));
}
int fscrypt_get_test_dummy_key_identifier(
@@ -787,10 +843,11 @@ int fscrypt_get_test_dummy_key_identifier(
fscrypt_get_test_dummy_secret(&secret);
- err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size);
+ err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size);
if (err)
goto out;
- err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER,
+ err = fscrypt_hkdf_expand(&secret.hkdf,
+ HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY,
NULL, 0, key_identifier,
FSCRYPT_KEY_IDENTIFIER_SIZE);
out:
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index b4fe01ea4bd4..4f3b9ecbfe4e 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,6 +9,7 @@
*/
#include <crypto/skcipher.h>
+#include <linux/export.h>
#include <linux/random.h>
#include "fscrypt_private.h"
@@ -96,14 +97,15 @@ select_encryption_mode(const union fscrypt_policy *policy,
}
/* Create a symmetric cipher object for the given encryption mode and key */
-static struct crypto_skcipher *
+static struct crypto_sync_skcipher *
fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
const struct inode *inode)
{
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
int err;
- tfm = crypto_alloc_skcipher(mode->cipher_str, 0, 0);
+ tfm = crypto_alloc_sync_skcipher(mode->cipher_str, 0,
+ FSCRYPT_CRYPTOAPI_MASK);
if (IS_ERR(tfm)) {
if (PTR_ERR(tfm) == -ENOENT) {
fscrypt_warn(inode,
@@ -123,21 +125,22 @@ fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key,
* first time a mode is used.
*/
pr_info("fscrypt: %s using implementation \"%s\"\n",
- mode->friendly_name, crypto_skcipher_driver_name(tfm));
+ mode->friendly_name,
+ crypto_skcipher_driver_name(&tfm->base));
}
- if (WARN_ON_ONCE(crypto_skcipher_ivsize(tfm) != mode->ivsize)) {
+ if (WARN_ON_ONCE(crypto_sync_skcipher_ivsize(tfm) != mode->ivsize)) {
err = -EINVAL;
goto err_free_tfm;
}
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize);
+ crypto_sync_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ err = crypto_sync_skcipher_setkey(tfm, raw_key, mode->keysize);
if (err)
goto err_free_tfm;
return tfm;
err_free_tfm:
- crypto_free_skcipher(tfm);
+ crypto_free_sync_skcipher(tfm);
return ERR_PTR(err);
}
@@ -150,10 +153,12 @@ err_free_tfm:
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
const u8 *raw_key, const struct fscrypt_inode_info *ci)
{
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
if (fscrypt_using_inline_encryption(ci))
- return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci);
+ return fscrypt_prepare_inline_crypt_key(prep_key, raw_key,
+ ci->ci_mode->keysize,
+ false, ci);
tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode);
if (IS_ERR(tfm))
@@ -172,7 +177,7 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
void fscrypt_destroy_prepared_key(struct super_block *sb,
struct fscrypt_prepared_key *prep_key)
{
- crypto_free_skcipher(prep_key->tfm);
+ crypto_free_sync_skcipher(prep_key->tfm);
fscrypt_destroy_inline_crypt_key(sb, prep_key);
memzero_explicit(prep_key, sizeof(*prep_key));
}
@@ -195,14 +200,29 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
struct fscrypt_mode *mode = ci->ci_mode;
const u8 mode_num = mode - fscrypt_modes;
struct fscrypt_prepared_key *prep_key;
- u8 mode_key[FSCRYPT_MAX_KEY_SIZE];
+ u8 mode_key[FSCRYPT_MAX_RAW_KEY_SIZE];
u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)];
unsigned int hkdf_infolen = 0;
+ bool use_hw_wrapped_key = false;
int err;
if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX))
return -EINVAL;
+ if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) {
+ /* Using a hardware-wrapped key for file contents encryption */
+ if (!fscrypt_using_inline_encryption(ci)) {
+ if (sb->s_flags & SB_INLINECRYPT)
+ fscrypt_warn(ci->ci_inode,
+ "Hardware-wrapped key required, but no suitable inline encryption capabilities are available");
+ else
+ fscrypt_warn(ci->ci_inode,
+ "Hardware-wrapped keys require inline encryption (-o inlinecrypt)");
+ return -EINVAL;
+ }
+ use_hw_wrapped_key = true;
+ }
+
prep_key = &keys[mode_num];
if (fscrypt_is_key_prepared(prep_key, ci)) {
ci->ci_enc_key = *prep_key;
@@ -214,6 +234,16 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci,
if (fscrypt_is_key_prepared(prep_key, ci))
goto done_unlock;
+ if (use_hw_wrapped_key) {
+ err = fscrypt_prepare_inline_crypt_key(prep_key,
+ mk->mk_secret.bytes,
+ mk->mk_secret.size, true,
+ ci);
+ if (err)
+ goto out_unlock;
+ goto done_unlock;
+ }
+
BUILD_BUG_ON(sizeof(mode_num) != 1);
BUILD_BUG_ON(sizeof(sb->s_uuid) != 16);
BUILD_BUG_ON(sizeof(hkdf_info) != 17);
@@ -336,6 +366,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
{
int err;
+ if (mk->mk_secret.is_hw_wrapped &&
+ !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 |
+ FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) {
+ fscrypt_warn(ci->ci_inode,
+ "Hardware-wrapped keys are only supported with IV_INO_LBLK policies");
+ return -EINVAL;
+ }
+
if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) {
/*
* DIRECT_KEY: instead of deriving per-file encryption keys, the
@@ -362,7 +400,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci,
FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) {
err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk);
} else {
- u8 derived_key[FSCRYPT_MAX_KEY_SIZE];
+ u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE];
err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf,
HKDF_CONTEXT_PER_FILE_ENC_KEY,
@@ -445,10 +483,6 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
struct fscrypt_master_key *mk;
int err;
- err = fscrypt_select_encryption_impl(ci);
- if (err)
- return err;
-
err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec);
if (err)
return err;
@@ -476,6 +510,10 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
return -ENOKEY;
+ err = fscrypt_select_encryption_impl(ci, false);
+ if (err)
+ return err;
+
/*
* As a legacy fallback for v1 policies, search for the key in
* the current task's subscribed keyrings too. Don't move this
@@ -497,9 +535,21 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci,
goto out_release_key;
}
+ err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped);
+ if (err)
+ goto out_release_key;
+
switch (ci->ci_policy.version) {
case FSCRYPT_POLICY_V1:
- err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw);
+ if (WARN_ON_ONCE(mk->mk_secret.is_hw_wrapped)) {
+ /*
+ * This should never happen, as adding a v1 policy key
+ * that is hardware-wrapped isn't allowed.
+ */
+ err = -EINVAL;
+ goto out_release_key;
+ }
+ err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.bytes);
break;
case FSCRYPT_POLICY_V2:
err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index cf3b58ec32cc..c4d05168522b 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -48,39 +48,30 @@ static int derive_key_aes(const u8 *master_key,
const u8 nonce[FSCRYPT_FILE_NONCE_SIZE],
u8 *derived_key, unsigned int derived_keysize)
{
- int res = 0;
- struct skcipher_request *req = NULL;
- DECLARE_CRYPTO_WAIT(wait);
- struct scatterlist src_sg, dst_sg;
- struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
-
- if (IS_ERR(tfm)) {
- res = PTR_ERR(tfm);
- tfm = NULL;
- goto out;
- }
- crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
- req = skcipher_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- res = -ENOMEM;
- goto out;
- }
- skcipher_request_set_callback(req,
- CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- crypto_req_done, &wait);
- res = crypto_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
- if (res < 0)
- goto out;
+ struct crypto_sync_skcipher *tfm;
+ int err;
- sg_init_one(&src_sg, master_key, derived_keysize);
- sg_init_one(&dst_sg, derived_key, derived_keysize);
- skcipher_request_set_crypt(req, &src_sg, &dst_sg, derived_keysize,
- NULL);
- res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
-out:
- skcipher_request_free(req);
- crypto_free_skcipher(tfm);
- return res;
+ tfm = crypto_alloc_sync_skcipher("ecb(aes)", 0, FSCRYPT_CRYPTOAPI_MASK);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ err = crypto_sync_skcipher_setkey(tfm, nonce, FSCRYPT_FILE_NONCE_SIZE);
+ if (err == 0) {
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+ struct scatterlist src_sg, dst_sg;
+
+ skcipher_request_set_callback(req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG |
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ NULL, NULL);
+ sg_init_one(&src_sg, master_key, derived_keysize);
+ sg_init_one(&dst_sg, derived_key, derived_keysize);
+ skcipher_request_set_crypt(req, &src_sg, &dst_sg,
+ derived_keysize, NULL);
+ err = crypto_skcipher_encrypt(req);
+ }
+ crypto_free_sync_skcipher(tfm);
+ return err;
}
/*
@@ -118,7 +109,7 @@ find_and_lock_process_key(const char *prefix,
payload = (const struct fscrypt_key *)ukp->data;
if (ukp->datalen != sizeof(struct fscrypt_key) ||
- payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
+ payload->size < 1 || payload->size > sizeof(payload->raw)) {
fscrypt_warn(NULL,
"key with description '%s' has invalid payload",
key->description);
@@ -149,7 +140,7 @@ struct fscrypt_direct_key {
const struct fscrypt_mode *dk_mode;
struct fscrypt_prepared_key dk_key;
u8 dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
- u8 dk_raw[FSCRYPT_MAX_KEY_SIZE];
+ u8 dk_raw[FSCRYPT_MAX_RAW_KEY_SIZE];
};
static void free_direct_key(struct fscrypt_direct_key *dk)
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 701259991277..6ad30ae07c06 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -10,11 +10,13 @@
* Modified by Eric Biggers, 2019 for v2 policy support.
*/
+#include <linux/export.h>
#include <linux/fs_context.h>
+#include <linux/mount.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
-#include <linux/mount.h>
+
#include "fscrypt_private.h"
/**
diff --git a/fs/d_path.c b/fs/d_path.c
index 5f4da5c8d5db..bb365511066b 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -241,9 +241,9 @@ static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/**
@@ -385,10 +385,10 @@ static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
*root = fs->root;
*pwd = fs->pwd;
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
}
/*
diff --git a/fs/dax.c b/fs/dax.c
index 21b47402b3dc..20ecf652c129 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -20,7 +20,6 @@
#include <linux/sched/signal.h>
#include <linux/uio.h>
#include <linux/vmstat.h>
-#include <linux/pfn_t.h>
#include <linux/sizes.h>
#include <linux/mmu_notifier.h>
#include <linux/iomap.h>
@@ -71,9 +70,14 @@ static unsigned long dax_to_pfn(void *entry)
return xa_to_value(entry) >> DAX_SHIFT;
}
-static void *dax_make_entry(pfn_t pfn, unsigned long flags)
+static struct folio *dax_to_folio(void *entry)
{
- return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
+ return page_folio(pfn_to_page(dax_to_pfn(entry)));
+}
+
+static void *dax_make_entry(unsigned long pfn, unsigned long flags)
+{
+ return xa_mk_value(flags | (pfn << DAX_SHIFT));
}
static bool dax_is_locked(void *entry)
@@ -206,7 +210,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry,
*
* Must be called with the i_pages lock held.
*/
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
{
void *entry;
struct wait_exceptional_entry_queue ewait;
@@ -236,6 +240,37 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
}
/*
+ * Wait for the given entry to become unlocked. Caller must hold the i_pages
+ * lock and call either put_unlocked_entry() if it did not lock the entry or
+ * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
+ */
+static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
+{
+ struct wait_exceptional_entry_queue ewait;
+ wait_queue_head_t *wq;
+
+ init_wait(&ewait.wait);
+ ewait.wait.func = wake_exceptional_entry_func;
+
+ while (unlikely(dax_is_locked(entry))) {
+ wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+ prepare_to_wait_exclusive(wq, &ewait.wait,
+ TASK_UNINTERRUPTIBLE);
+ xas_reset(xas);
+ xas_unlock_irq(xas);
+ schedule();
+ finish_wait(wq, &ewait.wait);
+ xas_lock_irq(xas);
+ entry = xas_load(xas);
+ }
+
+ if (xa_is_internal(entry))
+ return NULL;
+
+ return entry;
+}
+
+/*
* The only thing keeping the address space around is the i_pages lock
* (it's cycled in clear_inode() after removing the entries from i_pages)
* After we call xas_unlock_irq(), we cannot touch xas->xa.
@@ -250,7 +285,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
wq = dax_entry_waitqueue(xas, entry, &ewait.key);
/*
- * Unlike get_unlocked_entry() there is no guarantee that this
+ * Unlike get_next_unlocked_entry() there is no guarantee that this
* path ever successfully retrieves an unlocked entry before an
* inode dies. Perform a non-exclusive wait in case this path
* never successfully performs its own wake up.
@@ -307,109 +342,151 @@ static unsigned long dax_entry_size(void *entry)
return PAGE_SIZE;
}
-static unsigned long dax_end_pfn(void *entry)
+/*
+ * A DAX folio is considered shared if it has no mapping set and ->share (which
+ * shares the ->index field) is non-zero. Note this may return false even if the
+ * page is shared between multiple files but has not yet actually been mapped
+ * into multiple address spaces.
+ */
+static inline bool dax_folio_is_shared(struct folio *folio)
{
- return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+ return !folio->mapping && folio->share;
}
/*
- * Iterate through all mapped pfns represented by an entry, i.e. skip
- * 'empty' and 'zero' entries.
+ * When it is called by dax_insert_entry(), the shared flag will indicate
+ * whether this entry is shared by multiple files. If the page has not
+ * previously been associated with any mappings the ->mapping and ->index
+ * fields will be set. If it has already been associated with a mapping
+ * the mapping will be cleared and the share count set. It's then up to
+ * reverse map users like memory_failure() to call back into the filesystem to
+ * recover ->mapping and ->index information. For example by implementing
+ * dax_holder_operations.
*/
-#define for_each_mapped_pfn(entry, pfn) \
- for (pfn = dax_to_pfn(entry); \
- pfn < dax_end_pfn(entry); pfn++)
-
-static inline bool dax_page_is_shared(struct page *page)
+static void dax_folio_make_shared(struct folio *folio)
{
- return page->mapping == PAGE_MAPPING_DAX_SHARED;
+ /*
+ * folio is not currently shared so mark it as shared by clearing
+ * folio->mapping.
+ */
+ folio->mapping = NULL;
+
+ /*
+ * folio has previously been mapped into one address space so set the
+ * share count.
+ */
+ folio->share = 1;
}
-/*
- * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
- * refcount.
- */
-static inline void dax_page_share_get(struct page *page)
+static inline unsigned long dax_folio_put(struct folio *folio)
{
- if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+ unsigned long ref;
+ int order, i;
+
+ if (!dax_folio_is_shared(folio))
+ ref = 0;
+ else
+ ref = --folio->share;
+
+ if (ref)
+ return ref;
+
+ folio->mapping = NULL;
+ order = folio_order(folio);
+ if (!order)
+ return 0;
+ folio_reset_order(folio);
+
+ for (i = 0; i < (1UL << order); i++) {
+ struct dev_pagemap *pgmap = page_pgmap(&folio->page);
+ struct page *page = folio_page(folio, i);
+ struct folio *new_folio = (struct folio *)page;
+
+ ClearPageHead(page);
+ clear_compound_head(page);
+
+ new_folio->mapping = NULL;
/*
- * Reset the index if the page was already mapped
- * regularly before.
+ * Reset pgmap which was over-written by
+ * prep_compound_page().
*/
- if (page->mapping)
- page->share = 1;
- page->mapping = PAGE_MAPPING_DAX_SHARED;
+ new_folio->pgmap = pgmap;
+ new_folio->share = 0;
+ WARN_ON_ONCE(folio_ref_count(new_folio));
}
- page->share++;
+
+ return ref;
}
-static inline unsigned long dax_page_share_put(struct page *page)
+static void dax_folio_init(void *entry)
{
- return --page->share;
+ struct folio *folio = dax_to_folio(entry);
+ int order = dax_entry_order(entry);
+
+ /*
+ * Folio should have been split back to order-0 pages in
+ * dax_folio_put() when they were removed from their
+ * final mapping.
+ */
+ WARN_ON_ONCE(folio_order(folio));
+
+ if (order > 0) {
+ prep_compound_page(&folio->page, order);
+ if (order > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
+ WARN_ON_ONCE(folio_ref_count(folio));
+ }
}
-/*
- * When it is called in dax_insert_entry(), the shared flag will indicate that
- * whether this entry is shared by multiple files. If so, set the page->mapping
- * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
- */
static void dax_associate_entry(void *entry, struct address_space *mapping,
- struct vm_area_struct *vma, unsigned long address, bool shared)
+ struct vm_area_struct *vma,
+ unsigned long address, bool shared)
{
- unsigned long size = dax_entry_size(entry), pfn, index;
- int i = 0;
+ unsigned long size = dax_entry_size(entry), index;
+ struct folio *folio = dax_to_folio(entry);
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return;
index = linear_page_index(vma, address & ~(size - 1));
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
+ if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
+ if (folio->mapping)
+ dax_folio_make_shared(folio);
- if (shared) {
- dax_page_share_get(page);
- } else {
- WARN_ON_ONCE(page->mapping);
- page->mapping = mapping;
- page->index = index + i++;
- }
+ WARN_ON_ONCE(!folio->share);
+ WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
+ folio->share++;
+ } else {
+ WARN_ON_ONCE(folio->mapping);
+ dax_folio_init(entry);
+ folio = dax_to_folio(entry);
+ folio->mapping = mapping;
+ folio->index = index;
}
}
static void dax_disassociate_entry(void *entry, struct address_space *mapping,
- bool trunc)
+ bool trunc)
{
- unsigned long pfn;
+ struct folio *folio = dax_to_folio(entry);
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
return;
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
-
- WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
- if (dax_page_is_shared(page)) {
- /* keep the shared flag if this page is still shared */
- if (dax_page_share_put(page) > 0)
- continue;
- } else
- WARN_ON_ONCE(page->mapping && page->mapping != mapping);
- page->mapping = NULL;
- page->index = 0;
- }
+ dax_folio_put(folio);
}
static struct page *dax_busy_page(void *entry)
{
- unsigned long pfn;
+ struct folio *folio = dax_to_folio(entry);
- for_each_mapped_pfn(entry, pfn) {
- struct page *page = pfn_to_page(pfn);
+ if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+ return NULL;
- if (page_ref_count(page) > 1)
- return page;
- }
- return NULL;
+ if (folio_ref_count(folio) - folio_mapcount(folio))
+ return &folio->page;
+ else
+ return NULL;
}
/**
@@ -580,7 +657,7 @@ static void *grab_mapping_entry(struct xa_state *xas,
retry:
pmd_downgrade = false;
xas_lock_irq(xas);
- entry = get_unlocked_entry(xas, order);
+ entry = get_next_unlocked_entry(xas, order);
if (entry) {
if (dax_is_conflict(entry))
@@ -635,7 +712,7 @@ retry:
if (order > 0)
flags |= DAX_PMD;
- entry = dax_make_entry(pfn_to_pfn_t(0), flags);
+ entry = dax_make_entry(0, flags);
dax_lock_entry(xas, entry);
if (xas_error(xas))
goto out_unlock;
@@ -684,13 +761,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
pgoff_t end_idx;
XA_STATE(xas, &mapping->i_pages, start_idx);
- /*
- * In the 'limited' case get_user_pages() for dax is disabled.
- */
- if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
- return NULL;
-
- if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+ if (!dax_mapping(mapping))
return NULL;
/* If end == LLONG_MAX, all pages from start to till end of file */
@@ -716,8 +787,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
- if (unlikely(dax_is_locked(entry)))
- entry = get_unlocked_entry(&xas, 0);
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
if (entry)
page = dax_busy_page(entry);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -743,14 +813,14 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_entry(struct address_space *mapping,
- pgoff_t index, bool trunc)
+ pgoff_t index, bool trunc)
{
XA_STATE(xas, &mapping->i_pages, index);
int ret = 0;
void *entry;
xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, 0);
+ entry = get_next_unlocked_entry(&xas, 0);
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
goto out;
if (!trunc &&
@@ -776,7 +846,9 @@ static int __dax_clear_dirty_range(struct address_space *mapping,
xas_lock_irq(&xas);
xas_for_each(&xas, entry, end) {
- entry = get_unlocked_entry(&xas, 0);
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
+ if (!entry)
+ continue;
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
put_unlocked_entry(&xas, entry, WAKE_NEXT);
@@ -813,6 +885,107 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
return ret;
}
+void dax_delete_mapping_range(struct address_space *mapping,
+ loff_t start, loff_t end)
+{
+ void *entry;
+ pgoff_t start_idx = start >> PAGE_SHIFT;
+ pgoff_t end_idx;
+ XA_STATE(xas, &mapping->i_pages, start_idx);
+
+ /* If end == LLONG_MAX, all pages from start to till end of file */
+ if (end == LLONG_MAX)
+ end_idx = ULONG_MAX;
+ else
+ end_idx = end >> PAGE_SHIFT;
+
+ xas_lock_irq(&xas);
+ xas_for_each(&xas, entry, end_idx) {
+ if (!xa_is_value(entry))
+ continue;
+ entry = wait_entry_unlocked_exclusive(&xas, entry);
+ if (!entry)
+ continue;
+ dax_disassociate_entry(entry, mapping, true);
+ xas_store(&xas, NULL);
+ mapping->nrpages -= 1UL << dax_entry_order(entry);
+ put_unlocked_entry(&xas, entry, WAKE_ALL);
+ }
+ xas_unlock_irq(&xas);
+}
+EXPORT_SYMBOL_GPL(dax_delete_mapping_range);
+
+static int wait_page_idle(struct page *page,
+ void (cb)(struct inode *),
+ struct inode *inode)
+{
+ return ___wait_var_event(page, dax_page_is_idle(page),
+ TASK_INTERRUPTIBLE, 0, 0, cb(inode));
+}
+
+static void wait_page_idle_uninterruptible(struct page *page,
+ struct inode *inode)
+{
+ ___wait_var_event(page, dax_page_is_idle(page),
+ TASK_UNINTERRUPTIBLE, 0, 0, schedule());
+}
+
+/*
+ * Unmaps the inode and waits for any DMA to complete prior to deleting the
+ * DAX mapping entries for the range.
+ *
+ * For NOWAIT behavior, pass @cb as NULL to early-exit on first found
+ * busy page
+ */
+int dax_break_layout(struct inode *inode, loff_t start, loff_t end,
+ void (cb)(struct inode *))
+{
+ struct page *page;
+ int error = 0;
+
+ if (!dax_mapping(inode->i_mapping))
+ return 0;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, start, end);
+ if (!page)
+ break;
+ if (!cb) {
+ error = -ERESTARTSYS;
+ break;
+ }
+
+ error = wait_page_idle(page, cb, inode);
+ } while (error == 0);
+
+ if (!page)
+ dax_delete_mapping_range(inode->i_mapping, start, end);
+
+ return error;
+}
+EXPORT_SYMBOL_GPL(dax_break_layout);
+
+void dax_break_layout_final(struct inode *inode)
+{
+ struct page *page;
+
+ if (!dax_mapping(inode->i_mapping))
+ return;
+
+ do {
+ page = dax_layout_busy_page_range(inode->i_mapping, 0,
+ LLONG_MAX);
+ if (!page)
+ break;
+
+ wait_page_idle_uninterruptible(page, inode);
+ } while (true);
+
+ if (!page)
+ dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
+}
+EXPORT_SYMBOL_GPL(dax_break_layout_final);
+
/*
* Invalidate DAX entry if it is clean.
*/
@@ -867,7 +1040,7 @@ static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
* appropriate.
*/
static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
- const struct iomap_iter *iter, void *entry, pfn_t pfn,
+ const struct iomap_iter *iter, void *entry, unsigned long pfn,
unsigned long flags)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -895,8 +1068,9 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
void *old;
dax_disassociate_entry(entry, mapping, false);
- dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
- shared);
+ dax_associate_entry(new_entry, mapping, vmf->vma,
+ vmf->address, shared);
+
/*
* Only swap our new entry into the page cache if the current
* entry is a zero page or an empty entry. If a normal PTE or
@@ -940,7 +1114,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
if (unlikely(dax_is_locked(entry))) {
void *old_entry = entry;
- entry = get_unlocked_entry(xas, 0);
+ entry = get_next_unlocked_entry(xas, 0);
/* Entry got punched out / reallocated? */
if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
@@ -1064,7 +1238,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
- size_t size, void **kaddr, pfn_t *pfnp)
+ size_t size, void **kaddr, unsigned long *pfnp)
{
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
int id, rc = 0;
@@ -1082,11 +1256,9 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
- if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
- goto out;
- /* For larger pages we need devmap */
- if (length > 1 && !pfn_t_devmap(*pfnp))
+ if (*pfnp & (PHYS_PFN(size)-1))
goto out;
+
rc = 0;
out_check_addr:
@@ -1188,12 +1360,12 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
{
struct inode *inode = iter->inode;
unsigned long vaddr = vmf->address;
- pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+ unsigned long pfn = my_zero_pfn(vaddr);
vm_fault_t ret;
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
- ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+ ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), false);
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
@@ -1210,14 +1382,14 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
struct folio *zero_folio;
spinlock_t *ptl;
pmd_t pmd_entry;
- pfn_t pfn;
+ unsigned long pfn;
zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
if (unlikely(!zero_folio))
goto fallback;
- pfn = page_to_pfn_t(&zero_folio->page);
+ pfn = page_to_pfn(&zero_folio->page);
*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
DAX_PMD | DAX_ZERO_PAGE);
@@ -1237,8 +1409,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
mm_inc_nr_ptes(vma->vm_mm);
}
- pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot);
- pmd_entry = pmd_mkhuge(pmd_entry);
+ pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
@@ -1258,7 +1429,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
}
#endif /* CONFIG_FS_DAX_PMD */
-static s64 dax_unshare_iter(struct iomap_iter *iter)
+static int dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
@@ -1266,11 +1437,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
u64 copy_len = iomap_length(iter);
u32 mod;
int id = 0;
- s64 ret = 0;
+ s64 ret;
void *daddr = NULL, *saddr = NULL;
if (!iomap_want_unshare_iter(iter))
- return iomap_length(iter);
+ return iomap_iter_advance_full(iter);
/*
* Extend the file range to be aligned to fsblock/pagesize, because
@@ -1300,14 +1471,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
if (ret < 0)
goto out_unlock;
- if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
- ret = iomap_length(iter);
- else
+ if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0)
ret = -EIO;
out_unlock:
dax_read_unlock(id);
- return dax_mem2blk_err(ret);
+ if (ret < 0)
+ return dax_mem2blk_err(ret);
+ return iomap_iter_advance_full(iter);
}
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1326,7 +1497,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_unshare_iter(&iter);
+ iter.status = dax_unshare_iter(&iter);
return ret;
}
EXPORT_SYMBOL_GPL(dax_file_unshare);
@@ -1354,17 +1525,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
return ret;
}
-static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
{
const struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
u64 length = iomap_length(iter);
- s64 written = 0;
+ int ret;
/* already zeroed? we're done. */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
- return length;
+ return iomap_iter_advance(iter, &length);
/*
* invalidate the pages whose sharing state is to be changed
@@ -1372,33 +1542,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
*/
if (iomap->flags & IOMAP_F_SHARED)
invalidate_inode_pages2_range(iter->inode->i_mapping,
- pos >> PAGE_SHIFT,
- (pos + length - 1) >> PAGE_SHIFT);
+ iter->pos >> PAGE_SHIFT,
+ (iter->pos + length - 1) >> PAGE_SHIFT);
do {
+ loff_t pos = iter->pos;
unsigned offset = offset_in_page(pos);
- unsigned size = min_t(u64, PAGE_SIZE - offset, length);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
- long rc;
int id;
+ length = min_t(u64, PAGE_SIZE - offset, length);
+
id = dax_read_lock();
- if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
- rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
+ if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE)
+ ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
else
- rc = dax_memzero(iter, pos, size);
+ ret = dax_memzero(iter, pos, length);
dax_read_unlock(id);
- if (rc < 0)
- return rc;
- pos += size;
- length -= size;
- written += size;
+ if (ret < 0)
+ return ret;
+
+ ret = iomap_iter_advance(iter, &length);
+ if (ret)
+ return ret;
} while (length > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return ret;
}
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
@@ -1413,7 +1585,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
int ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = dax_zero_iter(&iter, did_zero);
+ iter.status = dax_zero_iter(&iter, did_zero);
return ret;
}
EXPORT_SYMBOL_GPL(dax_zero_range);
@@ -1431,8 +1603,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
}
EXPORT_SYMBOL_GPL(dax_truncate_page);
-static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
- struct iov_iter *iter)
+static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter)
{
const struct iomap *iomap = &iomi->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iomi);
@@ -1451,8 +1622,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
if (pos >= end)
return 0;
- if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
- return iov_iter_zero(min(length, end - pos), iter);
+ if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) {
+ done = iov_iter_zero(min(length, end - pos), iter);
+ return iomap_iter_advance(iomi, &done);
+ }
}
/*
@@ -1485,7 +1658,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
}
id = dax_read_lock();
- while (pos < end) {
+ while ((pos = iomi->pos) < end) {
unsigned offset = pos & (PAGE_SIZE - 1);
const size_t size = ALIGN(length + offset, PAGE_SIZE);
pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
@@ -1535,18 +1708,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
- pos += xfer;
- length -= xfer;
- done += xfer;
-
- if (xfer == 0)
+ length = xfer;
+ ret = iomap_iter_advance(iomi, &length);
+ if (!ret && xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
}
dax_read_unlock(id);
- return done ? done : ret;
+ return ret;
}
/**
@@ -1572,6 +1743,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t done = 0;
int ret;
+ if (WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC))
+ return -EIO;
+
if (!iomi.len)
return 0;
@@ -1586,7 +1760,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iomi, ops)) > 0)
- iomi.processed = dax_iomap_iter(&iomi, iter);
+ iomi.status = dax_iomap_iter(&iomi, iter);
done = iomi.pos - iocb->ki_pos;
iocb->ki_pos = iomi.pos;
@@ -1607,7 +1781,8 @@ static vm_fault_t dax_fault_return(int error)
* insertion for now and return the pfn so that caller can insert it after the
* fsync is done.
*/
-static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+static vm_fault_t dax_fault_synchronous_pfnp(unsigned long *pfnp,
+ unsigned long pfn)
{
if (WARN_ON_ONCE(!pfnp))
return VM_FAULT_SIGBUS;
@@ -1655,7 +1830,7 @@ static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
* @pmd: distinguish whether it is a pmd fault
*/
static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
- const struct iomap_iter *iter, pfn_t *pfnp,
+ const struct iomap_iter *iter, unsigned long *pfnp,
struct xa_state *xas, void **entry, bool pmd)
{
const struct iomap *iomap = &iter->iomap;
@@ -1664,8 +1839,9 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
bool write = iter->flags & IOMAP_WRITE;
unsigned long entry_flags = pmd ? DAX_PMD : 0;
- int err = 0;
- pfn_t pfn;
+ struct folio *folio;
+ int ret, err = 0;
+ unsigned long pfn;
void *kaddr;
if (!pmd && vmf->cow_page)
@@ -1696,20 +1872,21 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
return dax_fault_return(err);
}
+ folio = dax_to_folio(*entry);
if (dax_fault_is_synchronous(iter, vmf->vma))
return dax_fault_synchronous_pfnp(pfnp, pfn);
- /* insert PMD pfn */
+ folio_ref_inc(folio);
if (pmd)
- return vmf_insert_pfn_pmd(vmf, pfn, write);
+ ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn), write);
+ else
+ ret = vmf_insert_page_mkwrite(vmf, pfn_to_page(pfn), write);
+ folio_put(folio);
- /* insert PTE pfn */
- if (write)
- return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
- return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+ return ret;
}
-static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, unsigned long *pfnp,
int *iomap_errp, const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1750,14 +1927,14 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the PTE we need to set up. If so just return and the fault will be
* retried.
*/
- if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
+ if (pmd_trans_huge(*vmf->pmd)) {
ret = VM_FAULT_NOPAGE;
goto unlock_entry;
}
while ((error = iomap_iter(&iter, ops)) > 0) {
if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
- iter.processed = -EIO; /* fs corruption? */
+ iter.status = -EIO; /* fs corruption? */
continue;
}
@@ -1769,8 +1946,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
ret |= VM_FAULT_MAJOR;
}
- if (!(ret & VM_FAULT_ERROR))
- iter.processed = PAGE_SIZE;
+ if (!(ret & VM_FAULT_ERROR)) {
+ u64 length = PAGE_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
if (iomap_errp)
@@ -1819,7 +1998,7 @@ static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
return false;
}
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
const struct iomap_ops *ops)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
@@ -1871,8 +2050,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the PMD we need to set up. If so just return and the fault will be
* retried.
*/
- if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
- !pmd_devmap(*vmf->pmd)) {
+ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd)) {
ret = 0;
goto unlock_entry;
}
@@ -1883,8 +2061,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
continue; /* actually breaks out of the loop */
ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
- if (ret != VM_FAULT_FALLBACK)
- iter.processed = PMD_SIZE;
+ if (ret != VM_FAULT_FALLBACK) {
+ u64 length = PMD_SIZE;
+ iter.status = iomap_iter_advance(&iter, &length);
+ }
}
unlock_entry:
@@ -1899,7 +2079,7 @@ out:
return ret;
}
#else
-static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
+static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, unsigned long *pfnp,
const struct iomap_ops *ops)
{
return VM_FAULT_FALLBACK;
@@ -1920,7 +2100,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* successfully.
*/
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
- pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
+ unsigned long *pfnp, int *iomap_errp,
+ const struct iomap_ops *ops)
{
if (order == 0)
return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
@@ -1940,16 +2121,17 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
* This function inserts a writeable PTE or PMD entry into the page tables
* for an mmaped DAX file. It also marks the page cache entry as dirty.
*/
-static vm_fault_t
-dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
+static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf,
+ unsigned long pfn, unsigned int order)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+ struct folio *folio;
void *entry;
vm_fault_t ret;
xas_lock_irq(&xas);
- entry = get_unlocked_entry(&xas, order);
+ entry = get_next_unlocked_entry(&xas, order);
/* Did we race with someone splitting entry or so? */
if (!entry || dax_is_conflict(entry) ||
(order == 0 && !dax_is_pte_entry(entry))) {
@@ -1962,14 +2144,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
dax_lock_entry(&xas, entry);
xas_unlock_irq(&xas);
+ folio = pfn_folio(pfn);
+ folio_ref_inc(folio);
if (order == 0)
- ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+ ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
#ifdef CONFIG_FS_DAX_PMD
else if (order == PMD_ORDER)
- ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+ ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
#endif
else
ret = VM_FAULT_FALLBACK;
+ folio_put(folio);
dax_unlock_entry(&xas, entry);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
return ret;
@@ -1986,7 +2171,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
* table entry.
*/
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
- pfn_t pfn)
+ unsigned long pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
@@ -1999,12 +2184,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
-static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
+static int dax_range_compare_iter(struct iomap_iter *it_src,
struct iomap_iter *it_dest, u64 len, bool *same)
{
const struct iomap *smap = &it_src->iomap;
const struct iomap *dmap = &it_dest->iomap;
loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
+ u64 dest_len;
void *saddr, *daddr;
int id, ret;
@@ -2012,7 +2198,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
*same = true;
- return len;
+ goto advance;
}
if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
@@ -2035,7 +2221,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
if (!*same)
len = 0;
dax_read_unlock(id);
- return len;
+
+advance:
+ dest_len = len;
+ ret = iomap_iter_advance(it_src, &len);
+ if (!ret)
+ ret = iomap_iter_advance(it_dest, &dest_len);
+ return ret;
out_unlock:
dax_read_unlock(id);
@@ -2058,15 +2250,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
.len = len,
.flags = IOMAP_DAX,
};
- int ret, compared = 0;
+ int ret, status;
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
(ret = iomap_iter(&dst_iter, ops)) > 0) {
- compared = dax_range_compare_iter(&src_iter, &dst_iter,
+ status = dax_range_compare_iter(&src_iter, &dst_iter,
min(src_iter.len, dst_iter.len), same);
- if (compared < 0)
+ if (status < 0)
return ret;
- src_iter.processed = dst_iter.processed = compared;
+ src_iter.status = dst_iter.status = status;
}
return ret;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index 9cc0d47da321..60046ae23d51 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -73,8 +73,14 @@
* If no ancestor relationship:
* arbitrary, since it's serialized on rename_lock
*/
-int sysctl_vfs_cache_pressure __read_mostly = 100;
-EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
+static int sysctl_vfs_cache_pressure __read_mostly = 100;
+static int sysctl_vfs_cache_pressure_denom __read_mostly = 100;
+
+unsigned long vfs_pressure_ratio(unsigned long val)
+{
+ return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom);
+}
+EXPORT_SYMBOL_GPL(vfs_pressure_ratio);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
@@ -211,8 +217,28 @@ static const struct ctl_table fs_dcache_sysctls[] = {
},
};
+static const struct ctl_table vm_dcache_sysctls[] = {
+ {
+ .procname = "vfs_cache_pressure",
+ .data = &sysctl_vfs_cache_pressure,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "vfs_cache_pressure_denom",
+ .data = &sysctl_vfs_cache_pressure_denom,
+ .maxlen = sizeof(sysctl_vfs_cache_pressure_denom),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE_HUNDRED,
+ },
+};
+
static int __init init_fs_dcache_sysctls(void)
{
+ register_sysctl_init("vm", vm_dcache_sysctls);
register_sysctl_init("fs", fs_dcache_sysctls);
return 0;
}
@@ -1410,7 +1436,7 @@ int d_set_mounted(struct dentry *dentry)
{
struct dentry *p;
int ret = -ENOENT;
- write_seqlock(&rename_lock);
+ read_seqlock_excl(&rename_lock);
for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
/* Need exclusion wrt. d_invalidate() */
spin_lock(&p->d_lock);
@@ -1430,7 +1456,7 @@ int d_set_mounted(struct dentry *dentry)
}
spin_unlock(&dentry->d_lock);
out:
- write_sequnlock(&rename_lock);
+ read_sequnlock_excl(&rename_lock);
return ret;
}
@@ -1700,19 +1726,19 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
smp_store_release(&dentry->d_name.name, dname); /* ^^^ */
dentry->d_flags = 0;
- lockref_init(&dentry->d_lockref, 1);
+ lockref_init(&dentry->d_lockref);
seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
dentry->d_inode = NULL;
dentry->d_parent = dentry;
dentry->d_sb = sb;
- dentry->d_op = NULL;
+ dentry->d_op = sb->__s_d_op;
+ dentry->d_flags = sb->s_d_flags;
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_HLIST_HEAD(&dentry->d_children);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
INIT_HLIST_NODE(&dentry->d_sib);
- d_set_d_op(dentry, dentry->d_sb->s_d_op);
if (dentry->d_op && dentry->d_op->d_init) {
err = dentry->d_op->d_init(dentry);
@@ -1795,8 +1821,9 @@ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
struct dentry *dentry = __d_alloc(sb, name);
if (likely(dentry)) {
dentry->d_flags |= DCACHE_NORCU;
- if (!sb->s_d_op)
- d_set_d_op(dentry, &anon_ops);
+ /* d_op_flags(&anon_ops) is 0 */
+ if (!dentry->d_op)
+ dentry->d_op = &anon_ops;
}
return dentry;
}
@@ -1811,35 +1838,50 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
}
EXPORT_SYMBOL(d_alloc_name);
-void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
+#define DCACHE_OP_FLAGS \
+ (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | \
+ DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_PRUNE | \
+ DCACHE_OP_REAL)
+
+static unsigned int d_op_flags(const struct dentry_operations *op)
+{
+ unsigned int flags = 0;
+ if (op) {
+ if (op->d_hash)
+ flags |= DCACHE_OP_HASH;
+ if (op->d_compare)
+ flags |= DCACHE_OP_COMPARE;
+ if (op->d_revalidate)
+ flags |= DCACHE_OP_REVALIDATE;
+ if (op->d_weak_revalidate)
+ flags |= DCACHE_OP_WEAK_REVALIDATE;
+ if (op->d_delete)
+ flags |= DCACHE_OP_DELETE;
+ if (op->d_prune)
+ flags |= DCACHE_OP_PRUNE;
+ if (op->d_real)
+ flags |= DCACHE_OP_REAL;
+ }
+ return flags;
+}
+
+static void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
+ unsigned int flags = d_op_flags(op);
WARN_ON_ONCE(dentry->d_op);
- WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH |
- DCACHE_OP_COMPARE |
- DCACHE_OP_REVALIDATE |
- DCACHE_OP_WEAK_REVALIDATE |
- DCACHE_OP_DELETE |
- DCACHE_OP_REAL));
+ WARN_ON_ONCE(dentry->d_flags & DCACHE_OP_FLAGS);
dentry->d_op = op;
- if (!op)
- return;
- if (op->d_hash)
- dentry->d_flags |= DCACHE_OP_HASH;
- if (op->d_compare)
- dentry->d_flags |= DCACHE_OP_COMPARE;
- if (op->d_revalidate)
- dentry->d_flags |= DCACHE_OP_REVALIDATE;
- if (op->d_weak_revalidate)
- dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE;
- if (op->d_delete)
- dentry->d_flags |= DCACHE_OP_DELETE;
- if (op->d_prune)
- dentry->d_flags |= DCACHE_OP_PRUNE;
- if (op->d_real)
- dentry->d_flags |= DCACHE_OP_REAL;
-
-}
-EXPORT_SYMBOL(d_set_d_op);
+ if (flags)
+ dentry->d_flags |= flags;
+}
+
+void set_default_d_op(struct super_block *s, const struct dentry_operations *ops)
+{
+ unsigned int flags = d_op_flags(ops);
+ s->__s_d_op = ops;
+ s->s_d_flags = (s->s_d_flags & ~DCACHE_OP_FLAGS) | flags;
+}
+EXPORT_SYMBOL(set_default_d_op);
static unsigned d_flags_for_inode(struct inode *inode)
{
@@ -2395,7 +2437,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
}
return d_lookup(dir, name);
}
-EXPORT_SYMBOL(d_hash_and_lookup);
/*
* When a file is deleted, we have two options:
@@ -2480,7 +2521,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
{
smp_store_release(&dir->i_dir_seq, n + 2);
preempt_enable_nested();
- wake_up_all(d_wait);
+ if (wq_has_sleeper(d_wait))
+ wake_up_all(d_wait);
}
static void d_wait_lookup(struct dentry *dentry)
@@ -2504,13 +2546,19 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
unsigned int hash = name->hash;
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
struct hlist_bl_node *node;
- struct dentry *new = d_alloc(parent, name);
+ struct dentry *new = __d_alloc(parent->d_sb, name);
struct dentry *dentry;
unsigned seq, r_seq, d_seq;
if (unlikely(!new))
return ERR_PTR(-ENOMEM);
+ new->d_flags |= DCACHE_PAR_LOOKUP;
+ spin_lock(&parent->d_lock);
+ new->d_parent = dget_dlock(parent);
+ hlist_add_head(&new->d_sib, &parent->d_children);
+ spin_unlock(&parent->d_lock);
+
retry:
rcu_read_lock();
seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
@@ -2594,8 +2642,6 @@ retry:
return dentry;
}
rcu_read_unlock();
- /* we can't take ->d_lock here; it's OK, though. */
- new->d_flags |= DCACHE_PAR_LOOKUP;
new->d_wait = wq;
hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
@@ -2641,7 +2687,8 @@ EXPORT_SYMBOL(__d_lookup_unhash_wake);
/* inode->i_lock held if inode is non-NULL */
-static inline void __d_add(struct dentry *dentry, struct inode *inode)
+static inline void __d_add(struct dentry *dentry, struct inode *inode,
+ const struct dentry_operations *ops)
{
wait_queue_head_t *d_wait;
struct inode *dir = NULL;
@@ -2652,6 +2699,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
n = start_dir_add(dir);
d_wait = __d_lookup_unhash(dentry);
}
+ if (unlikely(ops))
+ d_set_d_op(dentry, ops);
if (inode) {
unsigned add_flags = d_flags_for_inode(inode);
hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2683,56 +2732,10 @@ void d_add(struct dentry *entry, struct inode *inode)
security_d_instantiate(entry, inode);
spin_lock(&inode->i_lock);
}
- __d_add(entry, inode);
+ __d_add(entry, inode, NULL);
}
EXPORT_SYMBOL(d_add);
-/**
- * d_exact_alias - find and hash an exact unhashed alias
- * @entry: dentry to add
- * @inode: The inode to go with this dentry
- *
- * If an unhashed dentry with the same name/parent and desired
- * inode already exists, hash and return it. Otherwise, return
- * NULL.
- *
- * Parent directory should be locked.
- */
-struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
-{
- struct dentry *alias;
- unsigned int hash = entry->d_name.hash;
-
- spin_lock(&inode->i_lock);
- hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
- /*
- * Don't need alias->d_lock here, because aliases with
- * d_parent == entry->d_parent are not subject to name or
- * parent changes, because the parent inode i_mutex is held.
- */
- if (alias->d_name.hash != hash)
- continue;
- if (alias->d_parent != entry->d_parent)
- continue;
- if (!d_same_name(alias, entry->d_parent, &entry->d_name))
- continue;
- spin_lock(&alias->d_lock);
- if (!d_unhashed(alias)) {
- spin_unlock(&alias->d_lock);
- alias = NULL;
- } else {
- dget_dlock(alias);
- __d_rehash(alias);
- spin_unlock(&alias->d_lock);
- }
- spin_unlock(&inode->i_lock);
- return alias;
- }
- spin_unlock(&inode->i_lock);
- return NULL;
-}
-EXPORT_SYMBOL(d_exact_alias);
-
static void swap_names(struct dentry *dentry, struct dentry *target)
{
if (unlikely(dname_external(target))) {
@@ -2794,10 +2797,10 @@ static void copy_name(struct dentry *dentry, struct dentry *target)
* @target: new dentry
* @exchange: exchange the two dentries
*
- * Update the dcache to reflect the move of a file name. Negative
- * dcache entries should not be moved in this way. Caller must hold
- * rename_lock, the i_mutex of the source and target directories,
- * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
+ * Update the dcache to reflect the move of a file name. Negative dcache
+ * entries should not be moved in this way. Caller must hold rename_lock, the
+ * i_rwsem of the source and target directories (exclusively), and the sb->
+ * s_vfs_rename_mutex if they differ. See lock_rename().
*/
static void __d_move(struct dentry *dentry, struct dentry *target,
bool exchange)
@@ -2943,7 +2946,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* This helper attempts to cope with remotely renamed directories
*
* It assumes that the caller is already holding
- * dentry->d_parent->d_inode->i_mutex, and rename_lock
+ * dentry->d_parent->d_inode->i_rwsem, and rename_lock
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
@@ -2966,11 +2969,11 @@ static int __d_unalias(struct dentry *dentry, struct dentry *alias)
goto out_err;
m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
- if (alias->d_op->d_unalias_trylock &&
+ if (alias->d_op && alias->d_op->d_unalias_trylock &&
!alias->d_op->d_unalias_trylock(alias))
goto out_err;
__d_move(alias, dentry, false);
- if (alias->d_op->d_unalias_unlock)
+ if (alias->d_op && alias->d_op->d_unalias_unlock)
alias->d_op->d_unalias_unlock(alias);
ret = 0;
out_err:
@@ -2981,30 +2984,8 @@ out_err:
return ret;
}
-/**
- * d_splice_alias - splice a disconnected dentry into the tree if one exists
- * @inode: the inode which may have a disconnected dentry
- * @dentry: a negative dentry which we want to point to the inode.
- *
- * If inode is a directory and has an IS_ROOT alias, then d_move that in
- * place of the given dentry and return it, else simply d_add the inode
- * to the dentry and return NULL.
- *
- * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
- * we should error out: directories can't have multiple aliases.
- *
- * This is needed in the lookup routine of any filesystem that is exportable
- * (via knfsd) so that we can build dcache paths to directories effectively.
- *
- * If a dentry was found and moved, then it is returned. Otherwise NULL
- * is returned. This matches the expected return value of ->lookup.
- *
- * Cluster filesystems may call this function with a negative, hashed dentry.
- * In that case, we know that the inode will be a regular file, and also this
- * will only occur during atomic_open. So we need to check for the dentry
- * being already hashed only in the final case.
- */
-struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry,
+ const struct dentry_operations *ops)
{
if (IS_ERR(inode))
return ERR_CAST(inode);
@@ -3050,9 +3031,37 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
}
}
out:
- __d_add(dentry, inode);
+ __d_add(dentry, inode, ops);
return NULL;
}
+
+/**
+ * d_splice_alias - splice a disconnected dentry into the tree if one exists
+ * @inode: the inode which may have a disconnected dentry
+ * @dentry: a negative dentry which we want to point to the inode.
+ *
+ * If inode is a directory and has an IS_ROOT alias, then d_move that in
+ * place of the given dentry and return it, else simply d_add the inode
+ * to the dentry and return NULL.
+ *
+ * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
+ * we should error out: directories can't have multiple aliases.
+ *
+ * This is needed in the lookup routine of any filesystem that is exportable
+ * (via knfsd) so that we can build dcache paths to directories effectively.
+ *
+ * If a dentry was found and moved, then it is returned. Otherwise NULL
+ * is returned. This matches the expected return value of ->lookup.
+ *
+ * Cluster filesystems may call this function with a negative, hashed dentry.
+ * In that case, we know that the inode will be a regular file, and also this
+ * will only occur during atomic_open. So we need to check for the dentry
+ * being already hashed only in the final case.
+ */
+struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
+{
+ return d_splice_alias_ops(inode, dentry, NULL);
+}
EXPORT_SYMBOL(d_splice_alias);
/*
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index e33cc77699cd..3ec3324c2060 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -47,29 +47,12 @@ const struct file_operations debugfs_noop_file_operations = {
#define F_DENTRY(filp) ((filp)->f_path.dentry)
-const void *debugfs_get_aux(const struct file *file)
+void *debugfs_get_aux(const struct file *file)
{
return DEBUGFS_I(file_inode(file))->aux;
}
EXPORT_SYMBOL_GPL(debugfs_get_aux);
-const struct file_operations *debugfs_real_fops(const struct file *filp)
-{
- struct debugfs_fsdata *fsd = F_DENTRY(filp)->d_fsdata;
-
- if (!fsd) {
- /*
- * Urgh, we've been called w/o a protecting
- * debugfs_file_get().
- */
- WARN_ON(1);
- return NULL;
- }
-
- return fsd->real_fops;
-}
-EXPORT_SYMBOL_GPL(debugfs_real_fops);
-
enum dbgfs_get_mode {
DBGFS_GET_ALREADY,
DBGFS_GET_REGULAR,
@@ -94,6 +77,7 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode)
fsd = d_fsd;
} else {
struct inode *inode = dentry->d_inode;
+ unsigned int methods = 0;
if (WARN_ON(mode == DBGFS_GET_ALREADY))
return -EINVAL;
@@ -106,25 +90,28 @@ static int __debugfs_file_get(struct dentry *dentry, enum dbgfs_get_mode mode)
const struct debugfs_short_fops *ops;
ops = fsd->short_fops = DEBUGFS_I(inode)->short_fops;
if (ops->llseek)
- fsd->methods |= HAS_LSEEK;
+ methods |= HAS_LSEEK;
if (ops->read)
- fsd->methods |= HAS_READ;
+ methods |= HAS_READ;
if (ops->write)
- fsd->methods |= HAS_WRITE;
+ methods |= HAS_WRITE;
+ fsd->real_fops = NULL;
} else {
const struct file_operations *ops;
ops = fsd->real_fops = DEBUGFS_I(inode)->real_fops;
if (ops->llseek)
- fsd->methods |= HAS_LSEEK;
+ methods |= HAS_LSEEK;
if (ops->read)
- fsd->methods |= HAS_READ;
+ methods |= HAS_READ;
if (ops->write)
- fsd->methods |= HAS_WRITE;
+ methods |= HAS_WRITE;
if (ops->unlocked_ioctl)
- fsd->methods |= HAS_IOCTL;
+ methods |= HAS_IOCTL;
if (ops->poll)
- fsd->methods |= HAS_POLL;
+ methods |= HAS_POLL;
+ fsd->short_fops = NULL;
}
+ fsd->methods = methods;
refcount_set(&fsd->active_users, 1);
init_completion(&fsd->active_users_drained);
INIT_LIST_HEAD(&fsd->cancellations);
@@ -298,15 +285,13 @@ static int debugfs_locked_down(struct inode *inode,
static int open_proxy_open(struct inode *inode, struct file *filp)
{
struct dentry *dentry = F_DENTRY(filp);
- const struct file_operations *real_fops = NULL;
+ const struct file_operations *real_fops = DEBUGFS_I(inode)->real_fops;
int r;
r = __debugfs_file_get(dentry, DBGFS_GET_REGULAR);
if (r)
return r == -EIO ? -ENOENT : r;
- real_fops = debugfs_real_fops(filp);
-
r = debugfs_locked_down(inode, filp, real_fops);
if (r)
goto out;
@@ -348,7 +333,6 @@ static ret_type full_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
struct debugfs_fsdata *fsd = dentry->d_fsdata; \
- const struct file_operations *real_fops; \
ret_type r; \
\
if (!(fsd->methods & bit)) \
@@ -356,14 +340,13 @@ static ret_type full_proxy_ ## name(proto) \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- real_fops = debugfs_real_fops(filp); \
- r = real_fops->name(args); \
+ r = fsd->real_fops->name(args); \
debugfs_file_put(dentry); \
return r; \
}
-#define FULL_PROXY_FUNC_BOTH(name, ret_type, filp, proto, args, bit, ret) \
-static ret_type full_proxy_ ## name(proto) \
+#define SHORT_PROXY_FUNC(name, ret_type, filp, proto, args, bit, ret) \
+static ret_type short_proxy_ ## name(proto) \
{ \
struct dentry *dentry = F_DENTRY(filp); \
struct debugfs_fsdata *fsd = dentry->d_fsdata; \
@@ -374,27 +357,38 @@ static ret_type full_proxy_ ## name(proto) \
r = debugfs_file_get(dentry); \
if (unlikely(r)) \
return r; \
- if (fsd->real_fops) \
- r = fsd->real_fops->name(args); \
- else \
- r = fsd->short_fops->name(args); \
+ r = fsd->short_fops->name(args); \
debugfs_file_put(dentry); \
return r; \
}
-FULL_PROXY_FUNC_BOTH(llseek, loff_t, filp,
- PROTO(struct file *filp, loff_t offset, int whence),
- ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
+SHORT_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
-FULL_PROXY_FUNC_BOTH(read, ssize_t, filp,
- PROTO(struct file *filp, char __user *buf, size_t size,
- loff_t *ppos),
- ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+FULL_PROXY_FUNC(llseek, loff_t, filp,
+ PROTO(struct file *filp, loff_t offset, int whence),
+ ARGS(filp, offset, whence), HAS_LSEEK, -ESPIPE);
-FULL_PROXY_FUNC_BOTH(write, ssize_t, filp,
- PROTO(struct file *filp, const char __user *buf,
- size_t size, loff_t *ppos),
- ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+SHORT_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+FULL_PROXY_FUNC(read, ssize_t, filp,
+ PROTO(struct file *filp, char __user *buf, size_t size,
+ loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_READ, -EINVAL);
+
+SHORT_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
+
+FULL_PROXY_FUNC(write, ssize_t, filp,
+ PROTO(struct file *filp, const char __user *buf,
+ size_t size, loff_t *ppos),
+ ARGS(filp, buf, size, ppos), HAS_WRITE, -EINVAL);
FULL_PROXY_FUNC(unlocked_ioctl, long, filp,
PROTO(struct file *filp, unsigned int cmd, unsigned long arg),
@@ -406,22 +400,21 @@ static __poll_t full_proxy_poll(struct file *filp,
struct dentry *dentry = F_DENTRY(filp);
struct debugfs_fsdata *fsd = dentry->d_fsdata;
__poll_t r = 0;
- const struct file_operations *real_fops;
if (!(fsd->methods & HAS_POLL))
return DEFAULT_POLLMASK;
if (debugfs_file_get(dentry))
return EPOLLHUP;
- real_fops = debugfs_real_fops(filp);
- r = real_fops->poll(filp, wait);
+ r = fsd->real_fops->poll(filp, wait);
debugfs_file_put(dentry);
return r;
}
-static int full_proxy_release(struct inode *inode, struct file *filp)
+static int full_proxy_release(struct inode *inode, struct file *file)
{
- const struct file_operations *real_fops = debugfs_real_fops(filp);
+ struct debugfs_fsdata *fsd = F_DENTRY(file)->d_fsdata;
+ const struct file_operations *real_fops = fsd->real_fops;
int r = 0;
/*
@@ -431,7 +424,7 @@ static int full_proxy_release(struct inode *inode, struct file *filp)
* ->i_private is still being meaningful here.
*/
if (real_fops->release)
- r = real_fops->release(inode, filp);
+ r = real_fops->release(inode, file);
fops_put(real_fops);
return r;
@@ -513,9 +506,9 @@ static int full_proxy_open_short(struct inode *inode, struct file *filp)
const struct file_operations debugfs_full_short_proxy_file_operations = {
.open = full_proxy_open_short,
- .llseek = full_proxy_llseek,
- .read = full_proxy_read,
- .write = full_proxy_write,
+ .llseek = short_proxy_llseek,
+ .read = short_proxy_read,
+ .write = short_proxy_write,
};
ssize_t debugfs_attr_read(struct file *file, char __user *buf,
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 75715d8877ee..c12d649df6a5 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -183,6 +183,9 @@ static int debugfs_reconfigure(struct fs_context *fc)
struct debugfs_fs_info *sb_opts = sb->s_fs_info;
struct debugfs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
@@ -258,7 +261,6 @@ static struct vfsmount *debugfs_automount(struct path *path)
}
static const struct dentry_operations debugfs_dops = {
- .d_delete = always_delete_dentry,
.d_release = debugfs_release_dentry,
.d_automount = debugfs_automount,
};
@@ -273,7 +275,8 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
return err;
sb->s_op = &debugfs_super_operations;
- sb->s_d_op = &debugfs_dops;
+ set_default_d_op(sb, &debugfs_dops);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
debugfs_apply_options(sb);
@@ -282,10 +285,16 @@ static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
static int debugfs_get_tree(struct fs_context *fc)
{
+ int err;
+
if (!(debugfs_allow & DEBUGFS_ALLOW_API))
return -EPERM;
- return get_tree_single(fc, debugfs_fill_super);
+ err = get_tree_single(fc, debugfs_fill_super);
+ if (err)
+ return err;
+
+ return debugfs_reconfigure(fc);
}
static void debugfs_free_fc(struct fs_context *fc)
@@ -346,7 +355,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- dentry = lookup_positive_unlocked(name, parent, strlen(name));
+ dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent);
if (IS_ERR(dentry))
return NULL;
return dentry;
@@ -384,27 +393,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- inode_lock(d_inode(parent));
- if (unlikely(IS_DEADDIR(d_inode(parent))))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
- if (d_is_dir(dentry))
- pr_err("Directory '%s' with parent '%s' already present!\n",
- name, parent->d_name.name);
- else
- pr_err("File '%s' in directory '%s' already present!\n",
- name, parent->d_name.name);
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
- }
-
+ dentry = simple_start_creating(parent, name);
if (IS_ERR(dentry)) {
- inode_unlock(d_inode(parent));
+ if (dentry == ERR_PTR(-EEXIST))
+ pr_err("'%s' already exists in '%pd'\n", name, parent);
simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}
-
return dentry;
}
@@ -459,7 +453,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
proxy_fops = &debugfs_noop_file_operations;
inode->i_fop = proxy_fops;
DEBUGFS_I(inode)->raw = real_fops;
- DEBUGFS_I(inode)->aux = aux;
+ DEBUGFS_I(inode)->aux = (void *)aux;
d_instantiate(dentry, inode);
fsnotify_create(d_inode(dentry->d_parent), dentry);
@@ -872,7 +866,7 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, .
}
if (strcmp(old_name.name.name, new_name) == 0)
goto out;
- target = lookup_one_len(new_name, parent, strlen(new_name));
+ target = lookup_noperm(&QSTR(new_name), parent);
if (IS_ERR(target)) {
error = PTR_ERR(target);
goto out;
diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h
index 93483fe84425..427987f81571 100644
--- a/fs/debugfs/internal.h
+++ b/fs/debugfs/internal.h
@@ -19,7 +19,7 @@ struct debugfs_inode_info {
const struct debugfs_short_fops *short_fops;
debugfs_automount_t automount;
};
- const void *aux;
+ void *aux;
};
static inline struct debugfs_inode_info *DEBUGFS_I(struct inode *inode)
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1096ff8562fa..fdf22264a8e9 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -12,6 +12,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@@ -21,7 +23,6 @@
#include <linux/magic.h>
#include <linux/idr.h>
#include <linux/devpts_fs.h>
-#include <linux/parser.h>
#include <linux/fsnotify.h>
#include <linux/seq_file.h>
@@ -87,14 +88,14 @@ enum {
Opt_err
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_ptmxmode, "ptmxmode=%o"},
- {Opt_newinstance, "newinstance"},
- {Opt_max, "max=%d"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec devpts_param_specs[] = {
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_s32 ("max", Opt_max),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_flag ("newinstance", Opt_newinstance),
+ fsparam_u32oct ("ptmxmode", Opt_ptmxmode),
+ fsparam_uid ("uid", Opt_uid),
+ {}
};
struct pts_fs_info {
@@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi)
deactivate_super(fsi->sb);
}
-#define PARSE_MOUNT 0
-#define PARSE_REMOUNT 1
-
/*
- * parse_mount_options():
- * Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value.
- *
- * Note: @data may be NULL (in which case all options are set to default).
+ * devpts_parse_param - Parse mount parameters
*/
-static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
+static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- kuid_t uid;
- kgid_t gid;
-
- opts->setuid = 0;
- opts->setgid = 0;
- opts->uid = GLOBAL_ROOT_UID;
- opts->gid = GLOBAL_ROOT_GID;
- opts->mode = DEVPTS_DEFAULT_MODE;
- opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
- opts->max = NR_UNIX98_PTY_MAX;
-
- /* Only allow instances mounted from the initial mount
- * namespace to tap the reserve pool of ptys.
- */
- if (op == PARSE_MOUNT)
- opts->reserve =
- (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns);
-
- while ((p = strsep(&data, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- int option;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid))
- return -EINVAL;
- opts->uid = uid;
- opts->setuid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid))
- return -EINVAL;
- opts->gid = gid;
- opts->setgid = 1;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->mode = option & S_IALLUGO;
- break;
- case Opt_ptmxmode:
- if (match_octal(&args[0], &option))
- return -EINVAL;
- opts->ptmxmode = option & S_IALLUGO;
- break;
- case Opt_newinstance:
- break;
- case Opt_max:
- if (match_int(&args[0], &option) ||
- option < 0 || option > NR_UNIX98_PTY_MAX)
- return -EINVAL;
- opts->max = option;
- break;
- default:
- pr_err("called with bogus options\n");
- return -EINVAL;
- }
+ struct pts_fs_info *fsi = fc->s_fs_info;
+ struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, devpts_param_specs, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->uid = result.uid;
+ opts->setuid = 1;
+ break;
+ case Opt_gid:
+ opts->gid = result.gid;
+ opts->setgid = 1;
+ break;
+ case Opt_mode:
+ opts->mode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_ptmxmode:
+ opts->ptmxmode = result.uint_32 & S_IALLUGO;
+ break;
+ case Opt_newinstance:
+ break;
+ case Opt_max:
+ if (result.uint_32 > NR_UNIX98_PTY_MAX)
+ return invalf(fc, "max out of range");
+ opts->max = result.uint_32;
+ break;
}
return 0;
}
-static int mknod_ptmx(struct super_block *sb)
+static int mknod_ptmx(struct super_block *sb, struct fs_context *fc)
{
int mode;
int rc = -ENOMEM;
@@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
}
}
-static int devpts_remount(struct super_block *sb, int *flags, char *data)
+static int devpts_reconfigure(struct fs_context *fc)
{
- int err;
- struct pts_fs_info *fsi = DEVPTS_SB(sb);
- struct pts_mount_opts *opts = &fsi->mount_opts;
+ struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb);
+ struct pts_fs_info *new = fc->s_fs_info;
- err = parse_mount_options(data, PARSE_REMOUNT, opts);
+ /* Apply the revised options. We don't want to change ->reserve.
+ * Ideally, we'd update each option conditionally on it having been
+ * explicitly changed, but the default is to reset everything so that
+ * would break UAPI...
+ */
+ fsi->mount_opts.setuid = new->mount_opts.setuid;
+ fsi->mount_opts.setgid = new->mount_opts.setgid;
+ fsi->mount_opts.uid = new->mount_opts.uid;
+ fsi->mount_opts.gid = new->mount_opts.gid;
+ fsi->mount_opts.mode = new->mount_opts.mode;
+ fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode;
+ fsi->mount_opts.max = new->mount_opts.max;
/*
* parse_mount_options() restores options to default values
@@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
*/
update_ptmx_mode(fsi);
- return err;
+ return 0;
}
static int devpts_show_options(struct seq_file *seq, struct dentry *root)
@@ -402,53 +368,26 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations devpts_sops = {
.statfs = simple_statfs,
- .remount_fs = devpts_remount,
.show_options = devpts_show_options,
};
-static void *new_pts_fs_info(struct super_block *sb)
-{
- struct pts_fs_info *fsi;
-
- fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
- if (!fsi)
- return NULL;
-
- ida_init(&fsi->allocated_ptys);
- fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
- fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
- fsi->sb = sb;
-
- return fsi;
-}
-
-static int
-devpts_fill_super(struct super_block *s, void *data, int silent)
+static int devpts_fill_super(struct super_block *s, struct fs_context *fc)
{
+ struct pts_fs_info *fsi = DEVPTS_SB(s);
struct inode *inode;
- int error;
s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
s->s_op = &devpts_sops;
- s->s_d_op = &simple_dentry_operations;
+ s->s_d_flags = DCACHE_DONTCACHE;
s->s_time_gran = 1;
+ fsi->sb = s;
- error = -ENOMEM;
- s->s_fs_info = new_pts_fs_info(s);
- if (!s->s_fs_info)
- goto fail;
-
- error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts);
- if (error)
- goto fail;
-
- error = -ENOMEM;
inode = new_inode(s);
if (!inode)
- goto fail;
+ return -ENOMEM;
inode->i_ino = 1;
simple_inode_init_ts(inode);
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_root = d_make_root(inode);
if (!s->s_root) {
pr_err("get root dentry failed\n");
- goto fail;
+ return -ENOMEM;
}
- error = mknod_ptmx(s);
- if (error)
- goto fail_dput;
-
- return 0;
-fail_dput:
- dput(s->s_root);
- s->s_root = NULL;
-fail:
- return error;
+ return mknod_ptmx(s, fc);
}
/*
- * devpts_mount()
+ * devpts_get_tree()
*
* Mount a new (private) instance of devpts. PTYs created in this
* instance are independent of the PTYs in other devpts instances.
*/
-static struct dentry *devpts_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int devpts_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, devpts_fill_super);
+}
+
+static void devpts_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations devpts_context_ops = {
+ .free = devpts_free_fc,
+ .parse_param = devpts_parse_param,
+ .get_tree = devpts_get_tree,
+ .reconfigure = devpts_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int devpts_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, devpts_fill_super);
+ struct pts_fs_info *fsi;
+
+ fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL);
+ if (!fsi)
+ return -ENOMEM;
+
+ ida_init(&fsi->allocated_ptys);
+ fsi->mount_opts.uid = GLOBAL_ROOT_UID;
+ fsi->mount_opts.gid = GLOBAL_ROOT_GID;
+ fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+ fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+ fsi->mount_opts.max = NR_UNIX98_PTY_MAX;
+
+ if (fc->purpose == FS_CONTEXT_FOR_MOUNT &&
+ current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns)
+ fsi->mount_opts.reserve = true;
+
+ fc->s_fs_info = fsi;
+ fc->ops = &devpts_context_ops;
+ return 0;
}
static void devpts_kill_sb(struct super_block *sb)
@@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb)
static struct file_system_type devpts_fs_type = {
.name = "devpts",
- .mount = devpts_mount,
+ .init_fs_context = devpts_init_fs_context,
+ .parameters = devpts_param_specs,
.kill_sb = devpts_kill_sb,
.fs_flags = FS_USERNS_MOUNT,
};
diff --git a/fs/direct-io.c b/fs/direct-io.c
index bbd05f1a2145..2267f5ae7f77 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -996,7 +996,7 @@ do_holes:
dio_unpin_page(dio, page);
goto out;
}
- zero_user(page, from, 1 << blkbits);
+ memzero_page(page, from, 1 << blkbits);
sdio->block_in_file++;
from += 1 << blkbits;
dio->result += 1 << blkbits;
@@ -1083,8 +1083,8 @@ static inline int drop_refcount(struct dio *dio)
* The locking rules are governed by the flags parameter:
* - if the flags value contains DIO_LOCKING we use a fancy locking
* scheme for dumb filesystems.
- * For writes this function is called under i_mutex and returns with
- * i_mutex held, for reads, i_mutex is not held on entry, but it is
+ * For writes this function is called under i_rwsem and returns with
+ * i_rwsem held, for reads, i_rwsem is not held on entry, but it is
* taken and dropped again before returning.
* - if the flags value does NOT contain DIO_LOCKING we don't use any
* internal locking but rather rely on the filesystem to synchronize
@@ -1094,7 +1094,7 @@ static inline int drop_refcount(struct dio *dio)
* counter before starting direct I/O, and decrement it once we are done.
* Truncate can wait for it to reach zero to provide exclusion. It is
* expected that filesystem provide exclusion between new direct I/O
- * and truncates. For DIO_LOCKING filesystems this is done by i_mutex,
+ * and truncates. For DIO_LOCKING filesystems this is done by i_rwsem,
* but other filesystems need to take care of this on their own.
*
* NOTE: if you pass "sdio" to anything by pointer make sure that function
@@ -1279,7 +1279,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
/*
* All block lookups have been performed. For READ requests
- * we can let i_mutex go now that its achieved its purpose
+ * we can let i_rwsem go now that its achieved its purpose
* of protecting us from looking up uninitialized blocks.
*/
if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..b46165df5a91 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,7 +3,6 @@ menuconfig DLM
tristate "Distributed Lock Manager (DLM)"
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
- select IP_SCTP
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index cf9ba6fd7a28..a23fd524a6ee 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -197,6 +197,9 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
break;
case 1:
/* SCTP */
+ if (!IS_ENABLED(CONFIG_IP_SCTP))
+ return -EOPNOTSUPP;
+
break;
default:
return -EINVAL;
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index e48c4f9686d3..13a3d0b26194 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -23,7 +23,7 @@ struct dlm_config_node {
extern const struct rhashtable_params dlm_rhash_rsb_params;
-#define DLM_MAX_ADDR_COUNT 3
+#define DLM_MAX_ADDR_COUNT 8
#define DLM_PROTO_TCP 0
#define DLM_PROTO_SCTP 1
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index c8ff88f1cdcf..6dd3a524cd35 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -509,7 +509,7 @@ static void add_scan(struct dlm_ls *ls, struct dlm_rsb *r)
void dlm_rsb_scan(struct timer_list *timer)
{
- struct dlm_ls *ls = from_timer(ls, timer, ls_scan_timer);
+ struct dlm_ls *ls = timer_container_of(ls, timer, ls_scan_timer);
int our_nodeid = dlm_our_nodeid();
struct dlm_rsb *r;
int rv;
@@ -741,6 +741,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
read_lock_bh(&ls->ls_rsbtbl_lock);
if (!rsb_flag(r, RSB_HASHED)) {
read_unlock_bh(&ls->ls_rsbtbl_lock);
+ error = -EBADR;
goto do_new;
}
@@ -784,6 +785,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len,
}
} else {
write_unlock_bh(&ls->ls_rsbtbl_lock);
+ error = -EBADR;
goto do_new;
}
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 8afac6e2dff0..1929327ffbe1 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -576,7 +576,7 @@ static int new_lockspace(const char *name, const char *cluster,
lockspace to start running (via sysfs) in dlm_ls_start(). */
error = do_uevent(ls, 1);
- if (error)
+ if (error < 0)
goto out_recoverd;
/* wait until recovery is successful or failed */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d28141829c05..e4373bce1bc2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -160,6 +160,7 @@ struct dlm_proto_ops {
bool try_new_addr;
const char *name;
int proto;
+ int how;
void (*sockopts)(struct socket *sock);
int (*bind)(struct socket *sock);
@@ -533,7 +534,7 @@ static void lowcomms_state_change(struct sock *sk)
/* SCTP layer is not calling sk_data_ready when the connection
* is done, so we catch the signal through here.
*/
- if (sk->sk_shutdown == RCV_SHUTDOWN)
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
lowcomms_data_ready(sk);
}
@@ -810,7 +811,7 @@ static void shutdown_connection(struct connection *con, bool and_other)
return;
}
- ret = kernel_sock_shutdown(con->sock, SHUT_WR);
+ ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how);
up_read(&con->sock_lock);
if (ret) {
log_print("Connection %p failed to shutdown: %d will force close",
@@ -1826,8 +1827,8 @@ static int dlm_tcp_listen_validate(void)
{
/* We don't support multi-homed hosts */
if (dlm_local_count > 1) {
- log_print("TCP protocol can't handle multi-homed hosts, try SCTP");
- return -EINVAL;
+ log_print("Detect multi-homed hosts but use only the first IP address.");
+ log_print("Try SCTP, if you want to enable multi-link.");
}
return 0;
@@ -1858,6 +1859,7 @@ static int dlm_tcp_listen_bind(struct socket *sock)
static const struct dlm_proto_ops dlm_tcp_ops = {
.name = "TCP",
.proto = IPPROTO_TCP,
+ .how = SHUT_WR,
.sockopts = dlm_tcp_sockopts,
.bind = dlm_tcp_bind,
.listen_validate = dlm_tcp_listen_validate,
@@ -1896,6 +1898,7 @@ static void dlm_sctp_sockopts(struct socket *sock)
static const struct dlm_proto_ops dlm_sctp_ops = {
.name = "SCTP",
.proto = IPPROTO_SCTP,
+ .how = SHUT_RDWR,
.try_new_addr = true,
.sockopts = dlm_sctp_sockopts,
.bind = dlm_sctp_bind,
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d45ef541d848..019a8b4eaaf9 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -14,7 +14,7 @@
#include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */
-int sysctl_drop_caches;
+static int sysctl_drop_caches;
static void drop_pagecache_sb(struct super_block *sb, void *unused)
{
@@ -48,7 +48,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
iput(toput_inode);
}
-int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
+static int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
void *buffer, size_t *length, loff_t *ppos)
{
int ret;
@@ -77,3 +77,22 @@ int drop_caches_sysctl_handler(const struct ctl_table *table, int write,
}
return 0;
}
+
+static const struct ctl_table drop_caches_table[] = {
+ {
+ .procname = "drop_caches",
+ .data = &sysctl_drop_caches,
+ .maxlen = sizeof(int),
+ .mode = 0200,
+ .proc_handler = drop_caches_sysctl_handler,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_FOUR,
+ },
+};
+
+static int __init init_vm_drop_caches_sysctls(void)
+{
+ register_sysctl_init("vm", drop_caches_table);
+ return 0;
+}
+fs_initcall(init_vm_drop_caches_sysctls);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index ce0a3c5ed0ca..5f8f96da09fe 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -193,7 +193,7 @@ static int ecryptfs_mmap(struct file *file, struct vm_area_struct *vma)
* natively. If FILESYSTEM_MAX_STACK_DEPTH > 2 or ecryptfs
* allows recursive mounting, this will need to be extended.
*/
- if (!lower_file->f_op->mmap)
+ if (!can_mmap_file(lower_file))
return -ENODEV;
return generic_file_mmap(file, vma);
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index a9819ddb1ab8..72fbe1316ab8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -394,8 +394,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
char *encrypted_and_encoded_name = NULL;
struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
struct dentry *lower_dir_dentry, *lower_dentry;
- const char *name = ecryptfs_dentry->d_name.name;
- size_t len = ecryptfs_dentry->d_name.len;
+ struct qstr qname = QSTR_INIT(ecryptfs_dentry->d_name.name,
+ ecryptfs_dentry->d_name.len);
struct dentry *res;
int rc = 0;
@@ -404,23 +404,25 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
mount_crypt_stat = &ecryptfs_superblock_to_private(
ecryptfs_dentry->d_sb)->mount_crypt_stat;
if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
+ size_t len = qname.len;
rc = ecryptfs_encrypt_and_encode_filename(
&encrypted_and_encoded_name, &len,
- mount_crypt_stat, name, len);
+ mount_crypt_stat, qname.name, len);
if (rc) {
printk(KERN_ERR "%s: Error attempting to encrypt and encode "
"filename; rc = [%d]\n", __func__, rc);
return ERR_PTR(rc);
}
- name = encrypted_and_encoded_name;
+ qname.name = encrypted_and_encoded_name;
+ qname.len = len;
}
- lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len);
+ lower_dentry = lookup_noperm_unlocked(&qname, lower_dir_dentry);
if (IS_ERR(lower_dentry)) {
- ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
+ ecryptfs_printk(KERN_DEBUG, "%s: lookup_noperm() returned "
"[%ld] on lower_dentry = [%s]\n", __func__,
PTR_ERR(lower_dentry),
- name);
+ qname.name);
res = ERR_CAST(lower_dentry);
} else {
res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry);
@@ -503,18 +505,24 @@ out_lock:
return rc;
}
-static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int rc;
struct dentry *lower_dentry;
struct inode *lower_dir;
rc = lock_parent(dentry, &lower_dentry, &lower_dir);
- if (!rc)
- rc = vfs_mkdir(&nop_mnt_idmap, lower_dir,
- lower_dentry, mode);
- if (rc || d_really_is_negative(lower_dentry))
+ if (rc)
+ goto out;
+
+ lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir,
+ lower_dentry, mode);
+ rc = PTR_ERR(lower_dentry);
+ if (IS_ERR(lower_dentry))
+ goto out;
+ rc = 0;
+ if (d_unhashed(lower_dentry))
goto out;
rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
if (rc)
@@ -526,7 +534,7 @@ out:
inode_unlock(lower_dir);
if (d_really_is_negative(dentry))
d_drop(dentry);
- return rc;
+ return ERR_PTR(rc);
}
static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -627,10 +635,10 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
rd.old_mnt_idmap = &nop_mnt_idmap;
- rd.old_dir = d_inode(lower_old_dir_dentry);
+ rd.old_parent = lower_old_dir_dentry;
rd.old_dentry = lower_old_dentry;
rd.new_mnt_idmap = &nop_mnt_idmap;
- rd.new_dir = d_inode(lower_new_dir_dentry);
+ rd.new_parent = lower_new_dir_dentry;
rd.new_dentry = lower_new_dentry;
rc = vfs_rename(&rd);
if (rc)
@@ -1116,13 +1124,13 @@ out:
return rc;
}
-static int ecryptfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int ecryptfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
return vfs_fileattr_get(ecryptfs_dentry_to_lower(dentry), fa);
}
static int ecryptfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
int rc;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 8dd1d7189c3b..eab1beb846d3 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -20,6 +20,7 @@
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/fs_stack.h>
+#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include "ecryptfs_kernel.h"
@@ -471,7 +472,7 @@ static int ecryptfs_get_tree(struct fs_context *fc)
sbi = NULL;
s->s_op = &ecryptfs_sops;
s->s_xattr = ecryptfs_xattr_handlers;
- s->s_d_op = &ecryptfs_dops;
+ set_default_d_op(s, &ecryptfs_dops);
err = "Reading sb failed";
rc = kern_path(fc->source, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
@@ -764,7 +765,7 @@ static struct kobject *ecryptfs_kobj;
static ssize_t version_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buff)
{
- return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK);
+ return sysfs_emit(buff, "%d\n", ECRYPTFS_VERSIONING_MASK);
}
static struct kobj_attribute version_attr = __ATTR_RO(version);
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 60f0ac8744b5..2c2b12fedeae 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -228,7 +228,7 @@ out:
/**
* ecryptfs_write_begin
- * @file: The eCryptfs file
+ * @iocb: I/O control block for the eCryptfs file
* @mapping: The eCryptfs object
* @pos: The file offset at which to start writing
* @len: Length of the write
@@ -239,7 +239,7 @@ out:
*
* Returns zero on success; non-zero otherwise
*/
-static int ecryptfs_write_begin(struct file *file,
+static int ecryptfs_write_begin(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -322,7 +322,7 @@ static int ecryptfs_write_begin(struct file *file,
* Note, this will increase i_size. */
if (index != 0) {
if (prev_page_end_size > i_size_read(mapping->host)) {
- rc = ecryptfs_truncate(file->f_path.dentry,
+ rc = ecryptfs_truncate(iocb->ki_filp->f_path.dentry,
prev_page_end_size);
if (rc) {
printk(KERN_ERR "%s: Error on attempt to "
@@ -429,7 +429,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
/**
* ecryptfs_write_end
- * @file: The eCryptfs file object
+ * @iocb: I/O control block for the eCryptfs file
* @mapping: The eCryptfs object
* @pos: The file position
* @len: The length of the data (unused)
@@ -437,7 +437,7 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
* @folio: The eCryptfs folio
* @fsdata: The fsdata (unused)
*/
-static int ecryptfs_write_end(struct file *file,
+static int ecryptfs_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0b1c878317ab..e7b7f426fecf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = {
.destroy_inode = ecryptfs_destroy_inode,
.free_inode = ecryptfs_free_inode,
.statfs = ecryptfs_statfs,
- .remount_fs = NULL,
.evict_inode = ecryptfs_evict_inode,
.show_options = ecryptfs_show_options
};
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 98a7299a9ee9..2891614abf8d 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -138,7 +138,7 @@ const struct inode_operations efivarfs_dir_inode_operations = {
};
static int
-efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+efivarfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
unsigned int i_flags;
unsigned int flags = 0;
@@ -154,7 +154,7 @@ efivarfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
static int
efivarfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
unsigned int i_flags = 0;
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index ac6a1dd0a6a5..f913b6824289 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -17,7 +17,6 @@ struct efivarfs_fs_info {
struct efivarfs_mount_opts mount_opts;
struct super_block *sb;
struct notifier_block nb;
- struct notifier_block pm_nb;
};
struct efi_variable {
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 09fcf731e65d..c4a139911356 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -18,8 +18,10 @@
#include <linux/statfs.h>
#include <linux/notifier.h>
#include <linux/printk.h>
+#include <linux/namei.h>
#include "internal.h"
+#include "../internal.h"
static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event,
void *data)
@@ -119,12 +121,18 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+
+static int efivarfs_freeze_fs(struct super_block *sb);
+static int efivarfs_unfreeze_fs(struct super_block *sb);
+
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
.drop_inode = generic_delete_inode,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
+ .freeze_fs = efivarfs_freeze_fs,
+ .unfreeze_fs = efivarfs_unfreeze_fs,
};
/*
@@ -175,7 +183,6 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
static const struct dentry_operations efivarfs_d_ops = {
.d_compare = efivarfs_d_compare,
.d_hash = efivarfs_d_hash,
- .d_delete = always_delete_dentry,
};
static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
@@ -204,7 +211,6 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name,
char *name = efivar_get_utf8name(variable_name, vendor);
struct super_block *sb = data;
struct dentry *dentry;
- struct qstr qstr;
if (!name)
/*
@@ -217,9 +223,7 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name,
*/
return true;
- qstr.name = name;
- qstr.len = strlen(name);
- dentry = d_hash_and_lookup(sb->s_root, &qstr);
+ dentry = try_lookup_noperm(&QSTR(name), sb->s_root);
kfree(name);
if (!IS_ERR_OR_NULL(dentry))
dput(dentry);
@@ -345,7 +349,8 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = EFIVARFS_MAGIC;
sb->s_op = &efivarfs_ops;
- sb->s_d_op = &efivarfs_d_ops;
+ set_default_d_op(sb, &efivarfs_d_ops);
+ sb->s_d_flags |= DCACHE_DONTCACHE;
sb->s_time_gran = 1;
if (!efivar_supports_writes())
@@ -385,61 +390,24 @@ static int efivarfs_reconfigure(struct fs_context *fc)
return 0;
}
+static void efivarfs_free(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
.parse_param = efivarfs_parse_param,
.reconfigure = efivarfs_reconfigure,
+ .free = efivarfs_free,
};
-struct efivarfs_ctx {
- struct dir_context ctx;
- struct super_block *sb;
- struct dentry *dentry;
-};
-
-static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len,
- loff_t offset, u64 ino, unsigned mode)
-{
- unsigned long size;
- struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx);
- struct qstr qstr = { .name = name, .len = len };
- struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr);
- struct inode *inode;
- struct efivar_entry *entry;
- int err;
-
- if (IS_ERR_OR_NULL(dentry))
- return true;
-
- inode = d_inode(dentry);
- entry = efivar_entry(inode);
-
- err = efivar_entry_size(entry, &size);
- size += sizeof(__u32); /* attributes */
- if (err)
- size = 0;
-
- inode_lock(inode);
- i_size_write(inode, size);
- inode_unlock(inode);
-
- if (!size) {
- ectx->dentry = dentry;
- return false;
- }
-
- dput(dentry);
-
- return true;
-}
-
static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
unsigned long name_size, void *data)
{
char *name;
struct super_block *sb = data;
struct dentry *dentry;
- struct qstr qstr;
int err;
if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID))
@@ -449,9 +417,7 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
if (!name)
return -ENOMEM;
- qstr.name = name;
- qstr.len = strlen(name);
- dentry = d_hash_and_lookup(sb->s_root, &qstr);
+ dentry = try_lookup_noperm(&QSTR(name), sb->s_root);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out;
@@ -472,65 +438,59 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor,
return err;
}
-static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action,
- void *ptr)
-{
- struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info,
- pm_nb);
- struct path path = { .mnt = NULL, .dentry = sfi->sb->s_root, };
- struct efivarfs_ctx ectx = {
- .ctx = {
- .actor = efivarfs_actor,
- },
- .sb = sfi->sb,
- };
- struct file *file;
- static bool rescan_done = true;
-
- if (action == PM_HIBERNATION_PREPARE) {
- rescan_done = false;
- return NOTIFY_OK;
- } else if (action != PM_POST_HIBERNATION) {
- return NOTIFY_DONE;
- }
-
- if (rescan_done)
- return NOTIFY_DONE;
-
- pr_info("efivarfs: resyncing variable state\n");
+static struct file_system_type efivarfs_type;
- /* O_NOATIME is required to prevent oops on NULL mnt */
- file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME,
- current_cred());
- if (IS_ERR(file))
- return NOTIFY_DONE;
+static int efivarfs_freeze_fs(struct super_block *sb)
+{
+ /* Nothing for us to do. */
+ return 0;
+}
- rescan_done = true;
+static int efivarfs_unfreeze_fs(struct super_block *sb)
+{
+ struct dentry *child = NULL;
/*
- * First loop over the directory and verify each entry exists,
- * removing it if it doesn't
+ * Unconditionally resync the variable state on a thaw request.
+ * Given the size of efivarfs it really doesn't matter to simply
+ * iterate through all of the entries and resync. Freeze/thaw
+ * requests are rare enough for that to not matter and the
+ * number of entries is pretty low too. So we really don't care.
*/
- file->f_pos = 2; /* skip . and .. */
- do {
- ectx.dentry = NULL;
- iterate_dir(file, &ectx.ctx);
- if (ectx.dentry) {
- pr_info("efivarfs: removing variable %pd\n",
- ectx.dentry);
- simple_recursive_removal(ectx.dentry, NULL);
- dput(ectx.dentry);
+ pr_info("efivarfs: resyncing variable state\n");
+ for (;;) {
+ int err;
+ unsigned long size = 0;
+ struct inode *inode;
+ struct efivar_entry *entry;
+
+ child = find_next_child(sb->s_root, child);
+ if (!child)
+ break;
+
+ inode = d_inode(child);
+ entry = efivar_entry(inode);
+
+ err = efivar_entry_size(entry, &size);
+ if (err)
+ size = 0;
+ else
+ size += sizeof(__u32);
+
+ inode_lock(inode);
+ i_size_write(inode, size);
+ inode_unlock(inode);
+
+ /* The variable doesn't exist anymore, delete it. */
+ if (!size) {
+ pr_info("efivarfs: removing variable %pd\n", child);
+ simple_recursive_removal(child, NULL);
}
- } while (ectx.dentry);
- fput(file);
-
- /*
- * then loop over variables, creating them if there's no matching
- * dentry
- */
- efivar_init(efivarfs_check_missing, sfi->sb, false);
+ }
- return NOTIFY_OK;
+ efivar_init(efivarfs_check_missing, sb, false);
+ pr_info("efivarfs: finished resyncing variable state\n");
+ return 0;
}
static int efivarfs_init_fs_context(struct fs_context *fc)
@@ -550,10 +510,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc)
fc->s_fs_info = sfi;
fc->ops = &efivarfs_context_ops;
- sfi->pm_nb.notifier_call = efivarfs_pm_notify;
- sfi->pm_nb.priority = 0;
- register_pm_notifier(&sfi->pm_nb);
-
return 0;
}
@@ -563,7 +519,6 @@ static void efivarfs_kill_sb(struct super_block *sb)
blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
kill_litter_super(sb);
- unregister_pm_notifier(&sfi->pm_nb);
kfree(sfi);
}
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 6ea60661fa55..d81f3318417d 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,8 +3,18 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select CACHEFILES if EROFS_FS_ONDEMAND
+ select CRC32
+ select CRYPTO if EROFS_FS_ZIP_ACCEL
+ select CRYPTO_DEFLATE if EROFS_FS_ZIP_ACCEL
select FS_IOMAP
- select LIBCRC32C
+ select LZ4_DECOMPRESS if EROFS_FS_ZIP
+ select NETFS_SUPPORT if EROFS_FS_ONDEMAND
+ select XXHASH if EROFS_FS_XATTR
+ select XZ_DEC if EROFS_FS_ZIP_LZMA
+ select XZ_DEC_MICROLZMA if EROFS_FS_ZIP_LZMA
+ select ZLIB_INFLATE if EROFS_FS_ZIP_DEFLATE
+ select ZSTD_DECOMPRESS if EROFS_FS_ZIP_ZSTD
help
EROFS (Enhanced Read-Only File System) is a lightweight read-only
file system with modern designs (e.g. no buffer heads, inline
@@ -13,12 +23,12 @@ config EROFS_FS
smartphones with Android OS, LiveCDs and high-density hosts with
numerous containers;
- It also provides fixed-sized output compression support in order to
- improve storage density as well as keep relatively higher compression
- ratios and implements in-place decompression to reuse the file page
- for compressed data temporarily with proper strategies, which is
- quite useful to ensure guaranteed end-to-end runtime decompression
- performance under extremely memory pressure without extra cost.
+ It also provides transparent compression and deduplication support to
+ improve storage density and maintain relatively high compression
+ ratios, and it implements in-place decompression to temporarily reuse
+ page cache for compressed data using proper strategies, which is
+ quite useful for ensuring guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
See the documentation at <file:Documentation/filesystems/erofs.rst>
and the web pages at <https://erofs.docs.kernel.org> for more details.
@@ -38,7 +48,6 @@ config EROFS_FS_DEBUG
config EROFS_FS_XATTR
bool "EROFS extended attributes"
depends on EROFS_FS
- select XXHASH
default y
help
Extended attributes are name:value pairs associated with inodes by
@@ -94,18 +103,15 @@ config EROFS_FS_BACKED_BY_FILE
config EROFS_FS_ZIP
bool "EROFS Data Compression Support"
depends on EROFS_FS
- select LZ4_DECOMPRESS
default y
help
- Enable fixed-sized output compression for EROFS.
+ Enable transparent compression support for EROFS file systems.
If you don't want to enable compression feature, say N.
config EROFS_FS_ZIP_LZMA
bool "EROFS LZMA compressed data support"
depends on EROFS_FS_ZIP
- select XZ_DEC
- select XZ_DEC_MICROLZMA
help
Saying Y here includes support for reading EROFS file systems
containing LZMA compressed data, specifically called microLZMA. It
@@ -117,7 +123,6 @@ config EROFS_FS_ZIP_LZMA
config EROFS_FS_ZIP_DEFLATE
bool "EROFS DEFLATE compressed data support"
depends on EROFS_FS_ZIP
- select ZLIB_INFLATE
help
Saying Y here includes support for reading EROFS file systems
containing DEFLATE compressed data. It gives better compression
@@ -132,7 +137,6 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ZIP_ZSTD
bool "EROFS Zstandard compressed data support"
depends on EROFS_FS_ZIP
- select ZSTD_DECOMPRESS
help
Saying Y here includes support for reading EROFS file systems
containing Zstandard compressed data. It gives better compression
@@ -144,12 +148,24 @@ config EROFS_FS_ZIP_ZSTD
If unsure, say N.
+config EROFS_FS_ZIP_ACCEL
+ bool "EROFS hardware decompression support"
+ depends on EROFS_FS_ZIP
+ help
+ Saying Y here includes hardware accelerator support for reading
+ EROFS file systems containing compressed data. It gives better
+ decompression speed than the software-implemented decompression, and
+ it costs lower CPU overhead.
+
+ Hardware accelerator support is an experimental feature for now and
+ file systems are still readable without selecting this option.
+
+ If unsure, say N.
+
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support (deprecated)"
depends on EROFS_FS
- select NETFS_SUPPORT
select FSCACHE
- select CACHEFILES
select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 4331d53c7109..549abc424763 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -7,5 +7,6 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o
erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o
erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o
+erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 65ff39401020..510e922c5193 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,6 +11,7 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
+ unsigned int inpages, outpages;
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
@@ -59,7 +60,6 @@ extern const struct z_erofs_decompressor *z_erofs_decomp[];
struct z_erofs_stream_dctx {
struct z_erofs_decompress_req *rq;
- unsigned int inpages, outpages; /* # of {en,de}coded pages */
int no, ni; /* the current {en,de}coded page # */
unsigned int avail_out; /* remaining bytes in the decoded buffer */
@@ -76,4 +76,14 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
unsigned int padbufsize);
int __init z_erofs_init_decompressor(void);
void z_erofs_exit_decompressor(void);
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl);
+int z_erofs_crypto_enable_engine(const char *name, int len);
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+void z_erofs_crypto_disable_all_engines(void);
+int z_erofs_crypto_show_engines(char *buf, int size, char sep);
+#else
+static inline void z_erofs_crypto_disable_all_engines(void) {}
+static inline int z_erofs_crypto_show_engines(char *buf, int size, char sep) { return 0; }
+#endif
#endif
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index 0cd6b5c4df98..3b1ba571c728 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -25,10 +25,9 @@ void erofs_put_metabuf(struct erofs_buf *buf)
buf->page = NULL;
}
-void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
- enum erofs_kmap_type type)
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap)
{
- pgoff_t index = offset >> PAGE_SHIFT;
+ pgoff_t index = (buf->off + offset) >> PAGE_SHIFT;
struct folio *folio = NULL;
if (buf->page) {
@@ -43,18 +42,26 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
return folio;
}
buf->page = folio_file_page(folio, index);
- if (!buf->base && type == EROFS_KMAP)
- buf->base = kmap_local_page(buf->page);
- if (type == EROFS_NO_KMAP)
+ if (!need_kmap)
return NULL;
+ if (!buf->base)
+ buf->base = kmap_local_page(buf->page);
return buf->base + (offset & ~PAGE_MASK);
}
-void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
buf->file = NULL;
+ if (in_metabox) {
+ if (unlikely(!sbi->metabox_inode))
+ return -EFSCORRUPTED;
+ buf->mapping = sbi->metabox_inode->i_mapping;
+ return 0;
+ }
+ buf->off = sbi->dif0.fsoff;
if (erofs_is_fileio_mode(sbi)) {
buf->file = sbi->dif0.file; /* some fs like FUSE needs it */
buf->mapping = buf->file->f_mapping;
@@ -62,67 +69,55 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb)
buf->mapping = sbi->dif0.fscache->inode->i_mapping;
else
buf->mapping = sb->s_bdev->bd_mapping;
+ return 0;
}
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, enum erofs_kmap_type type)
-{
- erofs_init_metabuf(buf, sb);
- return erofs_bread(buf, offset, type);
-}
-
-static int erofs_map_blocks_flatmode(struct inode *inode,
- struct erofs_map_blocks *map)
+ erofs_off_t offset, bool in_metabox)
{
- struct erofs_inode *vi = EROFS_I(inode);
- struct super_block *sb = inode->i_sb;
- bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
- erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking;
+ int err;
- map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */
- if (map->m_la < erofs_pos(sb, lastblk)) {
- map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la;
- map->m_plen = erofs_pos(sb, lastblk) - map->m_la;
- } else {
- DBG_BUGON(!tailendpacking);
- map->m_pa = erofs_iloc(inode) + vi->inode_isize +
- vi->xattr_isize + erofs_blkoff(sb, map->m_la);
- map->m_plen = inode->i_size - map->m_la;
-
- /* inline data should be located in the same meta block */
- if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
- erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- map->m_flags |= EROFS_MAP_META;
- }
- return 0;
+ err = erofs_init_metabuf(buf, sb, in_metabox);
+ if (err)
+ return ERR_PTR(err);
+ return erofs_bread(buf, offset, true);
}
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
{
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = inode->i_sb;
+ unsigned int unit, blksz = sb->s_blocksize;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_inode_chunk_index *idx;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- u64 chunknr;
- unsigned int unit;
+ erofs_blk_t startblk, addrmask;
+ bool tailpacking;
erofs_off_t pos;
- void *kaddr;
+ u64 chunknr;
int err = 0;
trace_erofs_map_blocks_enter(inode, map, 0);
map->m_deviceid = 0;
- if (map->m_la >= inode->i_size) {
- /* leave out-of-bound access unmapped */
- map->m_flags = 0;
- map->m_plen = map->m_llen;
+ map->m_flags = 0;
+ if (map->m_la >= inode->i_size)
goto out;
- }
if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
- err = erofs_map_blocks_flatmode(inode, map);
+ tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
+ if (!tailpacking && vi->startblk == EROFS_NULL_ADDR)
+ goto out;
+ pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking);
+
+ map->m_flags = EROFS_MAP_MAPPED;
+ if (map->m_la < pos) {
+ map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la;
+ map->m_llen = pos - map->m_la;
+ } else {
+ map->m_pa = erofs_iloc(inode) + vi->inode_isize +
+ vi->xattr_isize + erofs_blkoff(sb, map->m_la);
+ map->m_llen = inode->i_size - map->m_la;
+ map->m_flags |= EROFS_MAP_META;
+ }
goto out;
}
@@ -135,45 +130,44 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map)
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize +
vi->xattr_isize, unit) + unit * chunknr;
- kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- err = PTR_ERR(kaddr);
+ idx = erofs_read_metabuf(&buf, sb, pos, erofs_inode_in_metabox(inode));
+ if (IS_ERR(idx)) {
+ err = PTR_ERR(idx);
goto out;
}
map->m_la = chunknr << vi->chunkbits;
- map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
- round_up(inode->i_size - map->m_la, sb->s_blocksize));
-
- /* handle block map */
- if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
- __le32 *blkaddr = kaddr;
-
- if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
- map->m_flags = 0;
- } else {
- map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr));
+ map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ round_up(inode->i_size - map->m_la, blksz));
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) {
+ addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ?
+ BIT_ULL(48) - 1 : BIT_ULL(32) - 1;
+ startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) |
+ le32_to_cpu(idx->startblk_lo)) & addrmask;
+ if ((startblk ^ EROFS_NULL_ADDR) & addrmask) {
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = erofs_pos(sb, startblk);
+ map->m_flags = EROFS_MAP_MAPPED;
+ }
+ } else {
+ startblk = le32_to_cpu(*(__le32 *)idx);
+ if (startblk != (u32)EROFS_NULL_ADDR) {
+ map->m_pa = erofs_pos(sb, startblk);
map->m_flags = EROFS_MAP_MAPPED;
}
- goto out_unlock;
- }
- /* parse chunk indexes */
- idx = kaddr;
- switch (le32_to_cpu(idx->blkaddr)) {
- case EROFS_NULL_ADDR:
- map->m_flags = 0;
- break;
- default:
- map->m_deviceid = le16_to_cpu(idx->device_id) &
- EROFS_SB(sb)->device_id_mask;
- map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr));
- map->m_flags = EROFS_MAP_MAPPED;
- break;
}
-out_unlock:
erofs_put_metabuf(&buf);
out:
- if (!err)
- map->m_llen = map->m_plen;
+ if (!err) {
+ map->m_plen = map->m_llen;
+ /* inline data should be located in the same meta block */
+ if ((map->m_flags & EROFS_MAP_META) &&
+ erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) {
+ erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ }
trace_erofs_map_blocks_exit(inode, map, 0, err);
return err;
}
@@ -192,7 +186,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
{
struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
struct erofs_device_info *dif;
- erofs_off_t startoff, length;
+ erofs_off_t startoff;
int id;
erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0);
@@ -205,7 +199,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
return -ENODEV;
}
if (devs->flatdev) {
- map->m_pa += erofs_pos(sb, dif->mapped_blkaddr);
+ map->m_pa += erofs_pos(sb, dif->uniaddr);
up_read(&devs->rwsem);
return 0;
}
@@ -214,13 +208,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
} else if (devs->extra_devices && !devs->flatdev) {
down_read(&devs->rwsem);
idr_for_each_entry(&devs->tree, dif, id) {
- if (!dif->mapped_blkaddr)
+ if (!dif->uniaddr)
continue;
- startoff = erofs_pos(sb, dif->mapped_blkaddr);
- length = erofs_pos(sb, dif->blocks);
+ startoff = erofs_pos(sb, dif->uniaddr);
if (map->m_pa >= startoff &&
- map->m_pa < startoff + length) {
+ map->m_pa < startoff + erofs_pos(sb, dif->blocks)) {
map->m_pa -= startoff;
erofs_fill_from_devinfo(map, sb, dif);
break;
@@ -233,9 +226,11 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
/*
* bit 30: I/O error occurred on this folio
+ * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
* bit 0 - 29: remaining parts to complete this folio
*/
-#define EROFS_ONLINEFOLIO_EIO (1 << 30)
+#define EROFS_ONLINEFOLIO_EIO 30
+#define EROFS_ONLINEFOLIO_DIRTY 29
void erofs_onlinefolio_init(struct folio *folio)
{
@@ -252,19 +247,23 @@ void erofs_onlinefolio_split(struct folio *folio)
atomic_inc((atomic_t *)&folio->private);
}
-void erofs_onlinefolio_end(struct folio *folio, int err)
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
{
int orig, v;
do {
orig = atomic_read((atomic_t *)&folio->private);
- v = (orig - 1) | (err ? EROFS_ONLINEFOLIO_EIO : 0);
+ DBG_BUGON(orig <= 0);
+ v = dirty << EROFS_ONLINEFOLIO_DIRTY;
+ v |= (orig - 1) | (!!err << EROFS_ONLINEFOLIO_EIO);
} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
- if (v & ~EROFS_ONLINEFOLIO_EIO)
+ if (v & (BIT(EROFS_ONLINEFOLIO_DIRTY) - 1))
return;
folio->private = 0;
- folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO));
+ if (v & BIT(EROFS_ONLINEFOLIO_DIRTY))
+ flush_dcache_folio(folio);
+ folio_end_read(folio, !(v & BIT(EROFS_ONLINEFOLIO_EIO)));
}
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
@@ -277,51 +276,51 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_la = offset;
map.m_llen = length;
-
ret = erofs_map_blocks(inode, &map);
if (ret < 0)
return ret;
- mdev = (struct erofs_map_dev) {
- .m_deviceid = map.m_deviceid,
- .m_pa = map.m_pa,
- };
- ret = erofs_map_dev(sb, &mdev);
- if (ret)
- return ret;
-
iomap->offset = map.m_la;
- if (flags & IOMAP_DAX)
- iomap->dax_dev = mdev.m_dif->dax_dev;
- else
- iomap->bdev = mdev.m_bdev;
iomap->length = map.m_llen;
iomap->flags = 0;
iomap->private = NULL;
-
+ iomap->addr = IOMAP_NULL_ADDR;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
iomap->type = IOMAP_HOLE;
- iomap->addr = IOMAP_NULL_ADDR;
- if (!iomap->length)
- iomap->length = length;
return 0;
}
+ if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) {
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_dif->dax_dev;
+ else
+ iomap->bdev = mdev.m_bdev;
+ iomap->addr = mdev.m_dif->fsoff + mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dif->dax_part_off;
+ }
+
if (map.m_flags & EROFS_MAP_META) {
void *ptr;
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
iomap->type = IOMAP_INLINE;
- ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP);
+ ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(ptr))
return PTR_ERR(ptr);
iomap->inline_data = ptr;
iomap->private = buf.base;
} else {
iomap->type = IOMAP_MAPPED;
- iomap->addr = mdev.m_pa;
- if (flags & IOMAP_DAX)
- iomap->addr += mdev.m_dif->dax_part_off;
}
return 0;
}
@@ -370,11 +369,16 @@ int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
static int erofs_read_folio(struct file *file, struct folio *folio)
{
+ trace_erofs_read_folio(folio, true);
+
return iomap_read_folio(folio, &erofs_iomap_ops);
}
static void erofs_readahead(struct readahead_control *rac)
{
+ trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
+ readahead_count(rac), true);
+
return iomap_readahead(rac, &erofs_iomap_ops);
}
@@ -428,20 +432,20 @@ static const struct vm_operations_struct erofs_dax_vm_ops = {
.huge_fault = erofs_dax_huge_fault,
};
-static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int erofs_file_mmap_prepare(struct vm_area_desc *desc)
{
- if (!IS_DAX(file_inode(file)))
- return generic_file_readonly_mmap(file, vma);
+ if (!IS_DAX(file_inode(desc->file)))
+ return generic_file_readonly_mmap_prepare(desc);
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ if ((desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
return -EINVAL;
- vma->vm_ops = &erofs_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_ops = &erofs_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
#else
-#define erofs_file_mmap generic_file_readonly_mmap
+#define erofs_file_mmap_prepare generic_file_readonly_mmap_prepare
#endif
static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
@@ -471,7 +475,7 @@ static loff_t erofs_file_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations erofs_file_fops = {
.llseek = erofs_file_llseek,
.read_iter = erofs_file_read_iter,
- .mmap = erofs_file_mmap,
+ .mmap_prepare = erofs_file_mmap_prepare,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 2b123b070a42..354762c9723f 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -9,14 +9,6 @@
#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
-struct z_erofs_lz4_decompress_ctx {
- struct z_erofs_decompress_req *rq;
- /* # of encoded, decoded pages */
- unsigned int inpages, outpages;
- /* decoded block total length (used for in-place decompression) */
- unsigned int oend;
-};
-
static int z_erofs_load_lz4_config(struct super_block *sb,
struct erofs_super_block *dsb, void *data, int size)
{
@@ -55,10 +47,9 @@ static int z_erofs_load_lz4_config(struct super_block *sb,
* Fill all gaps with bounce pages if it's a sparse page list. Also check if
* all physical pages are consecutive, which can be seen for moderate CR.
*/
-static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
@@ -68,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < ctx->outpages; ++i, ++j) {
+ for (i = j = 0; i < rq->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
@@ -114,36 +105,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
return kaddr ? 1 : 0;
}
-static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq,
void *inpage, void *out, unsigned int *inputmargin,
int *maptype, bool may_inplace)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
- unsigned int omargin, total, i;
+ unsigned int oend, omargin, total, i;
struct page **in;
void *src, *tmp;
if (rq->inplace_io) {
- omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ oend = rq->pageofs_out + rq->outputsize;
+ omargin = PAGE_ALIGN(oend) - oend;
if (rq->partial_decoding || !may_inplace ||
omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
goto docopy;
- for (i = 0; i < ctx->inpages; ++i)
- if (rq->out[ctx->outpages - ctx->inpages + i] !=
+ for (i = 0; i < rq->inpages; ++i)
+ if (rq->out[rq->outpages - rq->inpages + i] !=
rq->in[i])
goto docopy;
kunmap_local(inpage);
*maptype = 3;
- return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT);
+ return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT);
}
- if (ctx->inpages <= 1) {
+ if (rq->inpages <= 1) {
*maptype = 0;
return inpage;
}
kunmap_local(inpage);
- src = erofs_vm_map_ram(rq->in, ctx->inpages);
+ src = erofs_vm_map_ram(rq->in, rq->inpages);
if (!src)
return ERR_PTR(-ENOMEM);
*maptype = 1;
@@ -152,7 +143,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
docopy:
/* Or copy compressed data which can be overlapped to per-CPU buffer */
in = rq->in;
- src = z_erofs_get_gbuf(ctx->inpages);
+ src = z_erofs_get_gbuf(rq->inpages);
if (!src) {
DBG_BUGON(1);
kunmap_local(inpage);
@@ -197,10 +188,8 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
return 0;
}
-static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
- u8 *dst)
+static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst)
{
- struct z_erofs_decompress_req *rq = ctx->rq;
bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
u8 *out, *headpage, *src;
@@ -224,7 +213,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
}
inputmargin = rq->pageofs_in;
- src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin,
+ src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin,
&maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
@@ -251,7 +240,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
if (maptype == 0) {
kunmap_local(headpage);
} else if (maptype == 1) {
- vm_unmap_ram(src, ctx->inpages);
+ vm_unmap_ram(src, rq->inpages);
} else if (maptype == 2) {
z_erofs_put_gbuf(src);
} else if (maptype != 3) {
@@ -264,54 +253,42 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
int ret;
- ctx.rq = rq;
- ctx.oend = rq->pageofs_out + rq->outputsize;
- ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
- ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
-
/* one optimized fast path only for non bigpcluster cases yet */
- if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
+ if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
dst = kmap_local_page(*rq->out);
dst_maptype = 0;
- goto dstmap_out;
- }
-
- /* general decoding path which can be used for all cases */
- ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
- if (ret < 0) {
- return ret;
- } else if (ret > 0) {
- dst = page_address(*rq->out);
- dst_maptype = 1;
} else {
- dst = erofs_vm_map_ram(rq->out, ctx.outpages);
- if (!dst)
- return -ENOMEM;
- dst_maptype = 2;
+ /* general decoding path which can be used for all cases */
+ ret = z_erofs_lz4_prepare_dstpages(rq, pagepool);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ dst = page_address(*rq->out);
+ dst_maptype = 1;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, rq->outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
+ }
}
-
-dstmap_out:
- ret = z_erofs_lz4_decompress_mem(&ctx, dst);
+ ret = z_erofs_lz4_decompress_mem(rq, dst);
if (!dst_maptype)
kunmap_local(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, ctx.outpages);
+ vm_unmap_ram(dst, rq->outpages);
return ret;
}
static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- const unsigned int nrpages_in =
- PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT;
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages;
const unsigned int bs = rq->sb->s_blocksize;
unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
u8 *kin;
@@ -324,19 +301,17 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
cur = min(cur, rq->outputsize);
if (cur && rq->out[0]) {
kin = kmap_local_page(rq->in[nrpages_in - 1]);
- if (rq->out[0] == rq->in[nrpages_in - 1]) {
+ if (rq->out[0] == rq->in[nrpages_in - 1])
memmove(kin + rq->pageofs_out, kin + pi, cur);
- flush_dcache_page(rq->out[0]);
- } else {
+ else
memcpy_to_page(rq->out[0], rq->pageofs_out,
kin + pi, cur);
- }
kunmap_local(kin);
}
rq->outputsize -= cur;
}
- for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) {
+ for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) {
insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize);
rq->outputsize -= insz;
if (!rq->in[ni])
@@ -348,14 +323,12 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
DBG_BUGON(no >= nrpages_out);
cnt = min(insz - pi, PAGE_SIZE - po);
- if (rq->out[no] == rq->in[ni]) {
+ if (rq->out[no] == rq->in[ni])
memmove(kin + po,
kin + rq->pageofs_in + pi, cnt);
- flush_dcache_page(rq->out[no]);
- } else if (rq->out[no]) {
+ else if (rq->out[no])
memcpy_to_page(rq->out[no], po,
kin + rq->pageofs_in + pi, cnt);
- }
pi += cnt;
} while (pi < insz);
kunmap_local(kin);
@@ -373,7 +346,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
unsigned int j;
if (!dctx->avail_out) {
- if (++dctx->no >= dctx->outpages || !rq->outputsize) {
+ if (++dctx->no >= rq->outpages || !rq->outputsize) {
erofs_err(sb, "insufficient space for decompressed data");
return -EFSCORRUPTED;
}
@@ -401,7 +374,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
}
if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) {
- if (++dctx->ni >= dctx->inpages) {
+ if (++dctx->ni >= rq->inpages) {
erofs_err(sb, "invalid compressed data");
return -EFSCORRUPTED;
}
@@ -434,7 +407,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst,
dctx->bounced = true;
}
- for (j = dctx->ni + 1; j < dctx->inpages; ++j) {
+ for (j = dctx->ni + 1; j < rq->inpages; ++j) {
if (rq->out[dctx->no] != rq->in[j])
continue;
tmppage = erofs_allocpage(pgpl, rq->gfp);
@@ -494,7 +467,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
return -EOPNOTSUPP;
}
- erofs_init_metabuf(&buf, sb);
+ (void)erofs_init_metabuf(&buf, sb, false);
offset = EROFS_SUPER_OFFSET + sbi->sb_size;
alg = 0;
for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c
new file mode 100644
index 000000000000..97b77ab64432
--- /dev/null
+++ b/fs/erofs/decompressor_crypto.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/scatterlist.h>
+#include <crypto/acompress.h>
+#include "compress.h"
+
+static int __z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct crypto_acomp *tfm)
+{
+ struct sg_table st_src, st_dst;
+ struct acomp_req *req;
+ struct crypto_wait wait;
+ u8 *headpage;
+ int ret;
+
+ headpage = kmap_local_page(*rq->in);
+ ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ rq->sb->s_blocksize - rq->pageofs_in));
+ kunmap_local(headpage);
+ if (ret)
+ return ret;
+
+ req = acomp_request_alloc(tfm);
+ if (!req)
+ return -ENOMEM;
+
+ ret = sg_alloc_table_from_pages_segment(&st_src, rq->in, rq->inpages,
+ rq->pageofs_in, rq->inputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_src_alloc;
+
+ ret = sg_alloc_table_from_pages_segment(&st_dst, rq->out, rq->outpages,
+ rq->pageofs_out, rq->outputsize, UINT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto failed_dst_alloc;
+
+ acomp_request_set_params(req, st_src.sgl,
+ st_dst.sgl, rq->inputsize, rq->outputsize);
+
+ crypto_init_wait(&wait);
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ ret = crypto_wait_req(crypto_acomp_decompress(req), &wait);
+ if (ret) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, rq->inputsize, rq->pageofs_in, rq->outputsize);
+ ret = -EIO;
+ }
+
+ sg_free_table(&st_dst);
+failed_dst_alloc:
+ sg_free_table(&st_src);
+failed_src_alloc:
+ acomp_request_free(req);
+ return ret;
+}
+
+struct z_erofs_crypto_engine {
+ char *crypto_name;
+ struct crypto_acomp *tfm;
+};
+
+struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = {
+ [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_LZMA] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+ [Z_EROFS_COMPRESSION_DEFLATE] = (struct z_erofs_crypto_engine[]) {
+ { .crypto_name = "qat_deflate", },
+ {},
+ },
+ [Z_EROFS_COMPRESSION_ZSTD] = (struct z_erofs_crypto_engine[]) {
+ {},
+ },
+};
+static DECLARE_RWSEM(z_erofs_crypto_rwsem);
+
+static struct crypto_acomp *z_erofs_crypto_get_engine(int alg)
+{
+ struct z_erofs_crypto_engine *e;
+
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e)
+ if (e->tfm)
+ return e->tfm;
+ return NULL;
+}
+
+int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+ struct crypto_acomp *tfm;
+ int i, err;
+
+ down_read(&z_erofs_crypto_rwsem);
+ tfm = z_erofs_crypto_get_engine(rq->alg);
+ if (!tfm) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ for (i = 0; i < rq->outpages; i++) {
+ struct page *const page = rq->out[i];
+ struct page *victim;
+
+ if (!page) {
+ victim = __erofs_allocpage(pgpl, rq->gfp, true);
+ if (!victim) {
+ err = -ENOMEM;
+ goto out;
+ }
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
+ rq->out[i] = victim;
+ }
+ }
+ err = __z_erofs_crypto_decompress(rq, tfm);
+out:
+ up_read(&z_erofs_crypto_rwsem);
+ return err;
+}
+
+int z_erofs_crypto_enable_engine(const char *name, int len)
+{
+ struct z_erofs_crypto_engine *e;
+ struct crypto_acomp *tfm;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!strncmp(name, e->crypto_name, len)) {
+ if (e->tfm)
+ break;
+ tfm = crypto_alloc_acomp(e->crypto_name, 0, 0);
+ if (IS_ERR(tfm)) {
+ up_write(&z_erofs_crypto_rwsem);
+ return -EOPNOTSUPP;
+ }
+ e->tfm = tfm;
+ break;
+ }
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+ return 0;
+}
+
+void z_erofs_crypto_disable_all_engines(void)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg;
+
+ down_write(&z_erofs_crypto_rwsem);
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ crypto_free_acomp(e->tfm);
+ e->tfm = NULL;
+ }
+ }
+ up_write(&z_erofs_crypto_rwsem);
+}
+
+int z_erofs_crypto_show_engines(char *buf, int size, char sep)
+{
+ struct z_erofs_crypto_engine *e;
+ int alg, len = 0;
+
+ for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) {
+ for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) {
+ if (!e->tfm)
+ continue;
+ len += scnprintf(buf + len, size - len, "%s%c",
+ e->crypto_name, sep);
+ }
+ }
+ return len;
+}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index 5070d2fcc737..6909b2d529c7 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -97,17 +97,11 @@ failed:
return -ENOMEM;
}
-static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pgpl)
+static int __z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct z_erofs_deflate *strm;
int zerr, err;
@@ -184,6 +178,22 @@ failed_zinit:
return err;
}
+static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pgpl)
+{
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ int err;
+
+ if (!rq->partial_decoding) {
+ err = z_erofs_crypto_decompress(rq, pgpl);
+ if (err != -EOPNOTSUPP)
+ return err;
+
+ }
+#endif
+ return __z_erofs_deflate_decompress(rq, pgpl);
+}
+
const struct z_erofs_decompressor z_erofs_deflate_decomp = {
.config = z_erofs_load_deflate_config,
.decompress = z_erofs_deflate_decompress,
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 40666815046f..832cffb83a66 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -150,13 +150,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
struct xz_buf buf = {};
struct z_erofs_lzma *strm;
enum xz_ret xz_err;
diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c
index 7e177304967e..b4bfe14229f9 100644
--- a/fs/erofs/decompressor_zstd.c
+++ b/fs/erofs/decompressor_zstd.c
@@ -139,13 +139,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq,
struct page **pgpl)
{
struct super_block *sb = rq->sb;
- struct z_erofs_stream_dctx dctx = {
- .rq = rq,
- .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT,
- .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize)
- >> PAGE_SHIFT,
- .no = -1, .ni = 0,
- };
+ struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 };
zstd_in_buffer in_buf = { NULL, 0, 0 };
zstd_out_buffer out_buf = { NULL, 0, 0 };
struct z_erofs_zstd *strm;
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index c3b90abdee37..debf469ad6bd 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -34,7 +34,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
}
if (!dir_emit(ctx, de_name, de_namelen,
- le64_to_cpu(de->nid), d_type))
+ erofs_nid_to_ino64(EROFS_SB(dir->i_sb),
+ le64_to_cpu(de->nid)), d_type))
return 1;
++de;
ctx->pos += sizeof(struct erofs_dirent);
@@ -47,8 +48,12 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct inode *dir = file_inode(f);
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct super_block *sb = dir->i_sb;
+ struct file_ra_state *ra = &f->f_ra;
unsigned long bsz = sb->s_blocksize;
unsigned int ofs = erofs_blkoff(sb, ctx->pos);
+ pgoff_t ra_pages = DIV_ROUND_UP_POW2(
+ EROFS_I_SB(dir)->dir_ra_bytes, PAGE_SIZE);
+ pgoff_t nr_pages = DIV_ROUND_UP_POW2(dir->i_size, PAGE_SIZE);
int err = 0;
bool initial = true;
@@ -58,9 +63,24 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- de = erofs_bread(&buf, dbstart, EROFS_KMAP);
+ if (fatal_signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ /* readahead blocks to enhance performance for large directories */
+ if (ra_pages) {
+ pgoff_t idx = DIV_ROUND_UP_POW2(ctx->pos, PAGE_SIZE);
+ pgoff_t pages = min(nr_pages - idx, ra_pages);
+
+ if (pages > 1 && !ra_has_index(ra, idx))
+ page_cache_sync_readahead(dir->i_mapping, ra,
+ f, idx, pages);
+ }
+
+ de = erofs_bread(&buf, dbstart, true);
if (IS_ERR(de)) {
- erofs_err(sb, "fail to readdir of logical block %u of nid %llu",
+ erofs_err(sb, "failed to readdir of logical block %llu of nid %llu",
erofs_blknr(sb, dbstart), EROFS_I(dir)->nid);
err = PTR_ERR(de);
break;
@@ -88,8 +108,14 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
break;
ctx->pos = dbstart + maxsize;
ofs = 0;
+ cond_resched();
}
erofs_put_metabuf(&buf);
+ if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) {
+ if (!dir_emit_dot(f, ctx))
+ return 0;
+ ++ctx->pos;
+ }
return err < 0 ? err : 0;
}
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 199395ed1c1f..377ee12b8b96 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -15,6 +15,7 @@
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004
+#define EROFS_FEATURE_COMPAT_SHARED_EA_IN_METABOX 0x00000008
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
@@ -30,42 +31,38 @@
#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
#define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040
+#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080
+#define EROFS_FEATURE_INCOMPAT_METABOX 0x00000100
#define EROFS_ALL_FEATURE_INCOMPAT \
- (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
- EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
- EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
- EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
- EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
- EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
- EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
- EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
- EROFS_FEATURE_INCOMPAT_DEDUPE | \
- EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES)
+ ((EROFS_FEATURE_INCOMPAT_METABOX << 1) - 1)
#define EROFS_SB_EXTSLOT_SIZE 16
struct erofs_deviceslot {
u8 tag[64]; /* digest(sha256), etc. */
- __le32 blocks; /* total fs blocks of this device */
- __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
- u8 reserved[56];
+ __le32 blocks_lo; /* total blocks count of this device */
+ __le32 uniaddr_lo; /* unified starting block of this device */
+ __le32 blocks_hi; /* total blocks count MSB */
+ __le16 uniaddr_hi; /* unified starting block MSB */
+ u8 reserved[50];
};
#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
-/* erofs on-disk super block (currently 128 bytes) */
+/* erofs on-disk super block (currently 144 bytes at maximum) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
__le32 checksum; /* crc32c to avoid unexpected on-disk overlap */
__le32 feature_compat;
__u8 blkszbits; /* filesystem block size in bit shift */
__u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
-
- __le16 root_nid; /* nid of root directory */
+ union {
+ __le16 rootnid_2b; /* nid of root directory */
+ __le16 blocks_hi; /* (48BIT on) blocks count MSB */
+ } __packed rb;
__le64 inos; /* total valid ino # (== f_files - f_favail) */
-
- __le64 build_time; /* compact inode time derivation */
- __le32 build_time_nsec; /* compact inode time derivation in ns scale */
- __le32 blocks; /* used for statfs */
+ __le64 epoch; /* base seconds used for compact inodes */
+ __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */
+ __le32 blocks_lo; /* blocks count LSB */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
__u8 uuid[16]; /* 128-bit uuid for volume */
@@ -84,7 +81,12 @@ struct erofs_super_block {
__le32 xattr_prefix_start; /* start of long xattr prefixes */
__le64 packed_nid; /* nid of the special packed inode */
__u8 xattr_filter_reserved; /* reserved for xattr name filter */
- __u8 reserved2[23];
+ __u8 reserved[3];
+ __le32 build_time; /* seconds added to epoch for mkfs time */
+ __le64 rootnid_8b; /* (48BIT on) nid of root directory */
+ __le64 reserved2;
+ __le64 metabox_nid; /* (METABOX on) nid of the metabox inode */
+ __le64 reserved3; /* [align to extslot 1] */
};
/*
@@ -115,19 +117,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
#define EROFS_I_VERSION_MASK 0x01
#define EROFS_I_DATALAYOUT_MASK 0x07
-#define EROFS_I_VERSION_BIT 0
-#define EROFS_I_DATALAYOUT_BIT 1
-#define EROFS_I_ALL_BIT 4
-
-#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1)
+#define EROFS_I_VERSION_BIT 0
+#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */
+#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */
+#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1)
/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
-/* with chunk indexes or just a 4-byte blkaddr array */
+/* with chunk indexes or just a 4-byte block array */
#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+#define EROFS_CHUNK_FORMAT_48BIT 0x0040
-#define EROFS_CHUNK_FORMAT_ALL \
- (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1)
/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
@@ -140,45 +142,40 @@ struct erofs_inode_chunk_info {
};
union erofs_inode_i_u {
- /* total compressed blocks for compressed inodes */
- __le32 compressed_blocks;
-
- /* block address for uncompressed flat inodes */
- __le32 raw_blkaddr;
-
- /* for device files, used to indicate old/new device # */
- __le32 rdev;
-
- /* for chunk-based files, it contains the summary info */
+ __le32 blocks_lo; /* total blocks count (if compressed inodes) */
+ __le32 startblk_lo; /* starting block number (if flat inodes) */
+ __le32 rdev; /* device ID (if special inodes) */
struct erofs_inode_chunk_info c;
};
+union erofs_inode_i_nb {
+ __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */
+ __le16 blocks_hi; /* total blocks count MSB */
+ __le16 startblk_hi; /* starting block number MSB */
+} __packed;
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_nlink;
+ union erofs_inode_i_nb i_nb;
__le32 i_size;
- __le32 i_reserved;
+ __le32 i_mtime;
union erofs_inode_i_u i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
__le16 i_gid;
- __le32 i_reserved2;
+ __le32 i_reserved;
};
/* 64-byte complete form of an ondisk inode */
struct erofs_inode_extended {
__le16 i_format; /* inode format hints */
-
-/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
__le16 i_xattr_icount;
__le16 i_mode;
- __le16 i_reserved;
+ union erofs_inode_i_nb i_nb;
__le64 i_size;
union erofs_inode_i_u i_u;
@@ -248,6 +245,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
if (!i_xattr_icount)
return 0;
+ /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
return sizeof(struct erofs_xattr_ibody_header) +
sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1);
}
@@ -266,13 +264,16 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
/* 4-byte block address array */
#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
-/* 8-byte inode chunk indexes */
+/* 8-byte inode chunk index */
struct erofs_inode_chunk_index {
- __le16 advise; /* always 0, don't care for now */
+ __le16 startblk_hi; /* starting block number MSB */
__le16 device_id; /* back-end storage id (with bits masked) */
- __le32 blkaddr; /* start block address of this inode chunk */
+ __le32 startblk_lo; /* starting block number of this chunk */
};
+#define EROFS_DIRENT_NID_METABOX_BIT 63
+#define EROFS_DIRENT_NID_MASK (BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT) - 1)
+
/* dirent sorts in alphabet order, thus we can do binary search */
struct erofs_dirent {
__le64 nid; /* node number */
@@ -337,21 +338,20 @@ struct z_erofs_zstd_cfgs {
#define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE
/*
- * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
- * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
- * (4B) + 2B + (4B) if compacted 2B is on.
- * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
- * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
- * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
- * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
- * bit 5 : fragment pcluster (0 - off; 1 - on)
+ * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes:
+ * 4B (disabled) vs 4B+2B+4B (enabled)
*/
#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */
+#define Z_EROFS_ADVISE_EXTENTS 0x0001
#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+/* Indicate the record size for each extent if extent metadata is used */
+#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1
+#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3
#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
@@ -363,45 +363,24 @@ struct z_erofs_map_header {
/* indicates the encoded size of tailpacking data */
__le16 h_idata_size;
};
+ __le32 h_extents_lo; /* extent count LSB */
};
__le16 h_advise;
- /*
- * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
- * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
- */
- __u8 h_algorithmtype;
- /*
- * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-6 : reserved;
- * bit 7 : move the whole file into packed inode or not.
- */
- __u8 h_clusterbits;
+ union {
+ struct {
+ /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */
+ __u8 h_algorithmtype;
+ /*
+ * bit 0-3 : logical cluster bits - blkszbits
+ * bit 4-6 : reserved
+ * bit 7 : pack the whole file into packed inode
+ */
+ __u8 h_clusterbits;
+ } __packed;
+ __le16 h_extents_hi; /* extent count MSB */
+ } __packed;
};
-/*
- * On-disk logical cluster type:
- * 0 - literal (uncompressed) lcluster
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * 2 - compressed lcluster (for NONHEAD lclusters)
- *
- * In detail,
- * 0 - literal (uncompressed) lcluster,
- * di_advise = 0
- * di_clusterofs = the literal data offset of the lcluster
- * di_blkaddr = the blkaddr of the literal pcluster
- *
- * 1,3 - compressed lcluster (for HEAD lclusters)
- * di_advise = 1 or 3
- * di_clusterofs = the decompressed data offset of the lcluster
- * di_blkaddr = the blkaddr of the compressed pcluster
- *
- * 2 - compressed lcluster (for NONHEAD lclusters)
- * di_advise = 2
- * di_clusterofs =
- * the decompressed data offset in its own HEAD lcluster
- * di_u.delta[0] = distance to this HEAD lcluster
- * di_u.delta[1] = distance to the next HEAD lcluster
- */
enum {
Z_EROFS_LCLUSTER_TYPE_PLAIN = 0,
Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1,
@@ -415,11 +394,7 @@ enum {
/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
#define Z_EROFS_LI_PARTIAL_REF (1 << 15)
-/*
- * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
- * compressed block count of a compressed extent (in logical clusters, aka.
- * block count of a pcluster).
- */
+/* Set on 1st non-head lcluster to store compressed block counti (in blocks) */
#define Z_EROFS_LI_D0_CBLKCNT (1 << 11)
struct z_erofs_lcluster_index {
@@ -428,19 +403,36 @@ struct z_erofs_lcluster_index {
__le16 di_clusterofs;
union {
- /* for the HEAD lclusters */
- __le32 blkaddr;
+ __le32 blkaddr; /* for the HEAD lclusters */
/*
- * for the NONHEAD lclusters
* [0] - distance to its HEAD lcluster
* [1] - distance to the next HEAD lcluster
*/
- __le16 delta[2];
+ __le16 delta[2]; /* for the NONHEAD lclusters */
} di_u;
};
-#define Z_EROFS_FULL_INDEX_ALIGN(end) \
- (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8)
+#define Z_EROFS_MAP_HEADER_END(end) \
+ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header))
+#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8)
+
+#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27)
+#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28
+#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1)
+struct z_erofs_extent {
+ __le32 plen; /* encoded length */
+ __le32 pstart_lo; /* physical offset */
+ __le32 pstart_hi; /* physical offset MSB */
+ __le32 lstart_lo; /* logical offset */
+ __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */
+ __u8 reserved[12]; /* for future use */
+};
+
+static inline int z_erofs_extent_recsize(unsigned int advise)
+{
+ return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) &
+ Z_EROFS_ADVISE_EXTRECSZ_MASK);
+}
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
@@ -449,7 +441,7 @@ static inline void erofs_check_ondisk_layout_definitions(void)
.h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
};
- BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
+ BUILD_BUG_ON(sizeof(struct erofs_super_block) != 144);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c
index 0ffd1c63beeb..b7b3432a9882 100644
--- a/fs/erofs/fileio.c
+++ b/fs/erofs/fileio.c
@@ -32,11 +32,13 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
ret = 0;
}
if (rq->bio.bi_end_io) {
+ if (ret < 0 && !rq->bio.bi_status)
+ rq->bio.bi_status = errno_to_blk_status(ret);
rq->bio.bi_end_io(&rq->bio);
} else {
bio_for_each_folio_all(fi, &rq->bio) {
DBG_BUGON(folio_test_uptodate(fi.folio));
- erofs_onlinefolio_end(fi.folio, ret);
+ erofs_onlinefolio_end(fi.folio, ret, false);
}
}
bio_uninit(&rq->bio);
@@ -45,6 +47,7 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret)
static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
{
+ const struct cred *old_cred;
struct iov_iter iter;
int ret;
@@ -58,7 +61,9 @@ static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq)
rq->iocb.ki_flags = IOCB_DIRECT;
iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt,
rq->bio.bi_iter.bi_size);
+ old_cred = override_creds(rq->iocb.ki_filp->f_cred);
ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter);
+ revert_creds(old_cred);
if (ret != -EIOCBQUEUED)
erofs_fileio_ki_complete(&rq->iocb, ret);
}
@@ -91,8 +96,6 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
struct erofs_map_blocks *map = &io->map;
unsigned int cur = 0, end = folio_size(folio), len, attached = 0;
loff_t pos = folio_pos(folio), ofs;
- struct iov_iter iter;
- struct bio_vec bv;
int err = 0;
erofs_onlinefolio_init(folio);
@@ -112,18 +115,12 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio)
void *src;
src = erofs_read_metabuf(&buf, inode->i_sb,
- map->m_pa + ofs, EROFS_KMAP);
+ map->m_pa + ofs, erofs_inode_in_metabox(inode));
if (IS_ERR(src)) {
err = PTR_ERR(src);
break;
}
- bvec_set_folio(&bv, folio, len, cur);
- iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len);
- if (copy_to_iter(src, len, &iter) != len) {
- erofs_put_metabuf(&buf);
- err = -EIO;
- break;
- }
+ memcpy_to_folio(folio, cur, src, len);
erofs_put_metabuf(&buf);
} else if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, cur + len);
@@ -145,18 +142,19 @@ io_retry:
if (err)
break;
io->rq = erofs_fileio_rq_alloc(&io->dev);
- io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9;
+ io->rq->bio.bi_iter.bi_sector =
+ (io->dev.m_dif->fsoff + io->dev.m_pa) >> 9;
attached = 0;
}
- if (!attached++)
- erofs_onlinefolio_split(folio);
if (!bio_add_folio(&io->rq->bio, folio, len, cur))
goto io_retry;
+ if (!attached++)
+ erofs_onlinefolio_split(folio);
io->dev.m_pa += len;
}
cur += len;
}
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -178,7 +176,7 @@ static void erofs_fileio_readahead(struct readahead_control *rac)
struct folio *folio;
int err;
- trace_erofs_readpages(inode, readahead_index(rac),
+ trace_erofs_readahead(inode, readahead_index(rac),
readahead_count(rac), true);
while ((folio = readahead_folio(rac))) {
err = erofs_fileio_scan_folio(&io, folio);
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index ce3d8737df85..362acf828279 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
erofs_fscache_req_put(req);
}
-static void erofs_fscache_req_end_io(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_io *io = priv;
struct erofs_fscache_rq *req = io->private;
@@ -180,8 +179,7 @@ struct erofs_fscache_bio {
struct bio_vec bvecs[BIO_MAX_VECS];
};
-static void erofs_fscache_bio_endio(void *priv,
- ssize_t transferred_or_error, bool was_async)
+static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error)
{
struct erofs_fscache_bio *io = priv;
@@ -276,7 +274,8 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
size_t size = map.m_llen;
void *src;
- src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP);
+ src = erofs_read_metabuf(&buf, sb, map.m_pa,
+ erofs_inode_in_metabox(inode));
if (IS_ERR(src))
return PTR_ERR(src);
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index d4b89407822a..9a2f59721522 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -27,29 +27,28 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr,
static int erofs_read_inode(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode));
+ unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode));
+ bool in_mbox = erofs_inode_in_metabox(inode);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_sb_info *sbi = EROFS_SB(sb);
+ erofs_blk_t addrmask = BIT_ULL(48) - 1;
struct erofs_inode *vi = EROFS_I(inode);
- const erofs_off_t inode_loc = erofs_iloc(inode);
- erofs_blk_t blkaddr, nblks = 0;
- void *kaddr;
+ struct erofs_inode_extended *die, copied;
struct erofs_inode_compact *dic;
- struct erofs_inode_extended *die, *copied = NULL;
- union erofs_inode_i_u iu;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
- unsigned int ifmt, ofs;
+ unsigned int ifmt;
+ void *ptr;
int err = 0;
- blkaddr = erofs_blknr(sb, inode_loc);
- ofs = erofs_blkoff(sb, inode_loc);
-
- kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(kaddr));
- return PTR_ERR(kaddr);
+ ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), in_mbox);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to read inode meta block (nid: %llu): %d",
+ vi->nid, err);
+ goto err_out;
}
- dic = kaddr + ofs;
+ dic = ptr + ofs;
ifmt = le16_to_cpu(dic->i_format);
if (ifmt & ~EROFS_I_ALL) {
erofs_err(sb, "unsupported i_format %u of nid %llu",
@@ -73,40 +72,34 @@ static int erofs_read_inode(struct inode *inode)
if (ofs + vi->inode_isize <= sb->s_blocksize) {
ofs += vi->inode_isize;
die = (struct erofs_inode_extended *)dic;
+ copied.i_u = die->i_u;
+ copied.i_nb = die->i_nb;
} else {
const unsigned int gotten = sb->s_blocksize - ofs;
- copied = kmalloc(vi->inode_isize, GFP_KERNEL);
- if (!copied) {
- err = -ENOMEM;
+ memcpy(&copied, dic, gotten);
+ ptr = erofs_read_metabuf(&buf, sb,
+ erofs_pos(sb, blkaddr + 1), in_mbox);
+ if (IS_ERR(ptr)) {
+ err = PTR_ERR(ptr);
+ erofs_err(sb, "failed to read inode payload block (nid: %llu): %d",
+ vi->nid, err);
goto err_out;
}
- memcpy(copied, dic, gotten);
- kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1),
- EROFS_KMAP);
- if (IS_ERR(kaddr)) {
- erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
- vi->nid, PTR_ERR(kaddr));
- kfree(copied);
- return PTR_ERR(kaddr);
- }
ofs = vi->inode_isize - gotten;
- memcpy((u8 *)copied + gotten, kaddr, ofs);
- die = copied;
+ memcpy((u8 *)&copied + gotten, ptr, ofs);
+ die = &copied;
}
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
- iu = die->i_u;
i_uid_write(inode, le32_to_cpu(die->i_uid));
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
- /* each extended inode has its own timestamp */
- inode_set_ctime(inode, le64_to_cpu(die->i_mtime),
+ inode_set_mtime(inode, le64_to_cpu(die->i_mtime),
le32_to_cpu(die->i_mtime_nsec));
inode->i_size = le64_to_cpu(die->i_size);
- kfree(copied);
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
@@ -114,12 +107,20 @@ static int erofs_read_inode(struct inode *inode)
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
- iu = dic->i_u;
+ copied.i_u = dic->i_u;
i_uid_write(inode, le16_to_cpu(dic->i_uid));
i_gid_write(inode, le16_to_cpu(dic->i_gid));
- set_nlink(inode, le16_to_cpu(dic->i_nlink));
- /* use build time for compact inodes */
- inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec);
+ if (!S_ISDIR(inode->i_mode) &&
+ ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) {
+ set_nlink(inode, 1);
+ copied.i_nb = dic->i_nb;
+ } else {
+ set_nlink(inode, le16_to_cpu(dic->i_nb.nlink));
+ copied.i_nb.startblk_hi = 0;
+ addrmask = BIT_ULL(32) - 1;
+ }
+ inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime),
+ sbi->fixed_nsec);
inode->i_size = le32_to_cpu(dic->i_size);
break;
@@ -136,19 +137,26 @@ static int erofs_read_inode(struct inode *inode)
goto err_out;
}
switch (inode->i_mode & S_IFMT) {
- case S_IFREG:
case S_IFDIR:
+ vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1;
+ fallthrough;
+ case S_IFREG:
case S_IFLNK:
- vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr);
+ vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) |
+ ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32);
+ if (vi->datalayout == EROFS_INODE_FLAT_PLAIN &&
+ !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask))
+ vi->startblk = EROFS_NULL_ADDR;
+
if(S_ISLNK(inode->i_mode)) {
- err = erofs_fill_symlink(inode, kaddr, ofs);
+ err = erofs_fill_symlink(inode, ptr, ofs);
if (err)
goto err_out;
}
break;
case S_IFCHR:
case S_IFBLK:
- inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev));
+ inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev));
break;
case S_IFIFO:
case S_IFSOCK:
@@ -161,12 +169,15 @@ static int erofs_read_inode(struct inode *inode)
goto err_out;
}
- /* total blocks for compressed files */
- if (erofs_inode_is_data_compressed(vi->datalayout)) {
- nblks = le32_to_cpu(iu.compressed_blocks);
- } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) <<
+ (sb->s_blocksize_bits - 9);
+ else
+ inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
+
+ if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
/* fill chunked inode summary info */
- vi->chunkformat = le16_to_cpu(iu.c.format);
+ vi->chunkformat = le16_to_cpu(copied.i_u.c.format);
if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
erofs_err(sb, "unsupported chunk format %x of nid %llu",
vi->chunkformat, vi->nid);
@@ -176,22 +187,15 @@ static int erofs_read_inode(struct inode *inode)
vi->chunkbits = sb->s_blocksize_bits +
(vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
}
- inode_set_mtime_to_ts(inode,
- inode_set_atime_to_ts(inode, inode_get_ctime(inode)));
+ inode_set_atime_to_ts(inode,
+ inode_set_ctime_to_ts(inode, inode_get_mtime(inode)));
inode->i_flags &= ~S_DAX;
if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
(vi->datalayout == EROFS_INODE_FLAT_PLAIN ||
vi->datalayout == EROFS_INODE_CHUNK_BASED))
inode->i_flags |= S_DAX;
-
- if (!nblks)
- /* measure inode.i_blocks as generic filesystems */
- inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9;
- else
- inode->i_blocks = nblks << (sb->s_blocksize_bits - 9);
err_out:
- DBG_BUGON(err);
erofs_put_metabuf(&buf);
return err;
}
@@ -202,13 +206,10 @@ static int erofs_fill_inode(struct inode *inode)
int err;
trace_erofs_fill_inode(inode);
-
- /* read inode base data from disk */
err = erofs_read_inode(inode);
if (err)
return err;
- /* setup the new inode */
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
@@ -229,15 +230,10 @@ static int erofs_fill_inode(struct inode *inode)
inode->i_op = &erofs_symlink_iops;
inode_nohighmem(inode);
break;
- case S_IFCHR:
- case S_IFBLK:
- case S_IFIFO:
- case S_IFSOCK:
+ default:
inode->i_op = &erofs_generic_iops;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
return 0;
- default:
- return -EFSCORRUPTED;
}
mapping_set_large_folios(inode->i_mapping);
@@ -269,13 +265,13 @@ static int erofs_fill_inode(struct inode *inode)
* ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
* so that it will fit.
*/
-static ino_t erofs_squash_ino(erofs_nid_t nid)
+static ino_t erofs_squash_ino(struct super_block *sb, erofs_nid_t nid)
{
- ino_t ino = (ino_t)nid;
+ u64 ino64 = erofs_nid_to_ino64(EROFS_SB(sb), nid);
if (sizeof(ino_t) < sizeof(erofs_nid_t))
- ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
- return ino;
+ ino64 ^= ino64 >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8;
+ return (ino_t)ino64;
}
static int erofs_iget5_eq(struct inode *inode, void *opaque)
@@ -287,7 +283,7 @@ static int erofs_iget5_set(struct inode *inode, void *opaque)
{
const erofs_nid_t nid = *(erofs_nid_t *)opaque;
- inode->i_ino = erofs_squash_ino(nid);
+ inode->i_ino = erofs_squash_ino(inode->i_sb, nid);
EROFS_I(inode)->nid = nid;
return 0;
}
@@ -296,7 +292,7 @@ struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
struct inode *inode;
- inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq,
+ inode = iget5_locked(sb, erofs_squash_ino(sb, nid), erofs_iget5_eq,
erofs_iget5_set, &nid);
if (!inode)
return ERR_PTR(-ENOMEM);
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 686d835eb533..4ccc5f0ee8df 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -37,18 +37,17 @@ __printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...);
typedef u64 erofs_nid_t;
typedef u64 erofs_off_t;
-/* data type for filesystem-wide blocks number */
-typedef u32 erofs_blk_t;
+typedef u64 erofs_blk_t;
struct erofs_device_info {
char *path;
struct erofs_fscache *fscache;
struct file *file;
struct dax_device *dax_dev;
- u64 dax_part_off;
+ u64 fsoff, dax_part_off;
- u32 blocks;
- u32 mapped_blkaddr;
+ erofs_blk_t blocks;
+ erofs_blk_t uniaddr;
};
enum {
@@ -126,6 +125,7 @@ struct erofs_sb_info {
struct erofs_sb_lz4_info lz4;
#endif /* CONFIG_EROFS_FS_ZIP */
struct inode *packed_inode;
+ struct inode *metabox_inode;
struct erofs_dev_context *devs;
u64 total_blocks;
@@ -143,23 +143,23 @@ struct erofs_sb_info {
unsigned char blkszbits; /* filesystem block size in bit shift */
u32 sb_size; /* total superblock size */
- u32 build_time_nsec;
- u64 build_time;
+ u32 fixed_nsec;
+ s64 epoch;
/* what we really care is nid, rather than ino.. */
erofs_nid_t root_nid;
erofs_nid_t packed_nid;
+ erofs_nid_t metabox_nid;
/* used for statfs, f_files - f_favail */
u64 inos;
- u8 uuid[16]; /* 128-bit uuid for volume */
- u8 volume_name[16]; /* volume name */
u32 feature_compat;
u32 feature_incompat;
/* sysfs support */
struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
struct completion s_kobj_unregister;
+ erofs_off_t dir_ra_bytes;
/* fscache support */
struct fscache_volume *volume;
@@ -199,21 +199,17 @@ enum {
EROFS_ZIP_CACHE_READAROUND
};
-enum erofs_kmap_type {
- EROFS_NO_KMAP, /* don't map the buffer */
- EROFS_KMAP, /* use kmap_local_page() to map the buffer */
-};
-
struct erofs_buf {
struct address_space *mapping;
struct file *file;
+ u64 off;
struct page *page;
void *base;
};
#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
-#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits))
-#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1))
+#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits))
+#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1))
#define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits)
#define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits)
@@ -233,8 +229,28 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES)
+EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT)
+EROFS_FEATURE_FUNCS(metabox, incompat, INCOMPAT_METABOX)
EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
+EROFS_FEATURE_FUNCS(shared_ea_in_metabox, compat, COMPAT_SHARED_EA_IN_METABOX)
+
+static inline u64 erofs_nid_to_ino64(struct erofs_sb_info *sbi, erofs_nid_t nid)
+{
+ if (!erofs_sb_has_metabox(sbi))
+ return nid;
+
+ /*
+ * When metadata compression is enabled, avoid generating excessively
+ * large inode numbers for metadata-compressed inodes. Shift NIDs in
+ * the 31-62 bit range left by one and move the metabox flag to bit 31.
+ *
+ * Note: on-disk NIDs remain unchanged as they are primarily used for
+ * compatibility with non-LFS 32-bit applications.
+ */
+ return ((nid << 1) & GENMASK_ULL(63, 32)) | (nid & GENMASK(30, 0)) |
+ ((nid >> EROFS_DIRENT_NID_METABOX_BIT) << 31);
+}
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
@@ -244,6 +260,9 @@ EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER)
#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1)
#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2)
+/* default readahead size of directories */
+#define EROFS_DIR_RA_BYTES 16384
+
struct erofs_inode {
erofs_nid_t nid;
@@ -252,6 +271,7 @@ struct erofs_inode {
unsigned char datalayout;
unsigned char inode_isize;
+ bool dot_omitted;
unsigned int xattr_isize;
unsigned int xattr_name_filter;
@@ -259,7 +279,7 @@ struct erofs_inode {
unsigned int *xattr_shared_xattrs;
union {
- erofs_blk_t raw_blkaddr;
+ erofs_blk_t startblk;
struct {
unsigned short chunkformat;
unsigned char chunkbits;
@@ -268,15 +288,13 @@ struct erofs_inode {
struct {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
- unsigned char z_logical_clusterbits;
- unsigned long z_tailextent_headlcn;
+ unsigned char z_lclusterbits;
union {
- struct {
- erofs_off_t z_idataoff;
- unsigned short z_idata_size;
- };
- erofs_off_t z_fragmentoff;
+ u64 z_tailextent_headlcn;
+ u64 z_extents;
};
+ erofs_off_t z_fragmentoff;
+ unsigned short z_idata_size;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -286,12 +304,20 @@ struct erofs_inode {
#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode)
+static inline bool erofs_inode_in_metabox(struct inode *inode)
+{
+ return EROFS_I(inode)->nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT);
+}
+
static inline erofs_off_t erofs_iloc(struct inode *inode)
{
struct erofs_sb_info *sbi = EROFS_I_SB(inode);
+ erofs_nid_t nid_lo = EROFS_I(inode)->nid & EROFS_DIRENT_NID_MASK;
+ if (erofs_inode_in_metabox(inode))
+ return nid_lo << sbi->islotbits;
return erofs_pos(inode->i_sb, sbi->meta_blkaddr) +
- (EROFS_I(inode)->nid << sbi->islotbits);
+ (nid_lo << sbi->islotbits);
}
static inline unsigned int erofs_inode_version(unsigned int ifmt)
@@ -322,10 +348,12 @@ static inline struct folio *erofs_grab_folio_nowait(struct address_space *as,
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED 0x0008
/* Located in the special packed inode */
-#define EROFS_MAP_FRAGMENT 0x0010
+#define __EROFS_MAP_FRAGMENT 0x0010
/* The extent refers to partial decompressed data */
#define EROFS_MAP_PARTIAL_REF 0x0020
+#define EROFS_MAP_FRAGMENT (EROFS_MAP_MAPPED | __EROFS_MAP_FRAGMENT)
+
struct erofs_map_blocks {
struct erofs_buf buf;
@@ -387,18 +415,18 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
erofs_off_t *offset, int *lengthp);
void erofs_unmap_metabuf(struct erofs_buf *buf);
void erofs_put_metabuf(struct erofs_buf *buf);
-void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset,
- enum erofs_kmap_type type);
-void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb);
+void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap);
+int erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ bool in_metabox);
void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
- erofs_off_t offset, enum erofs_kmap_type type);
+ erofs_off_t offset, bool in_metabox);
int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map);
void erofs_onlinefolio_init(struct folio *folio);
void erofs_onlinefolio_split(struct folio *folio);
-void erofs_onlinefolio_end(struct folio *folio, int err);
+void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty);
struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
int erofs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
@@ -448,6 +476,7 @@ int __init erofs_init_shrinker(void);
void erofs_exit_shrinker(void);
int __init z_erofs_init_subsystem(void);
void z_erofs_exit_subsystem(void);
+int z_erofs_init_super(struct super_block *sb);
unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi,
unsigned long nr_shrink);
int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
@@ -457,7 +486,6 @@ void z_erofs_put_gbuf(void *ptr);
int z_erofs_gbuf_growsize(unsigned int nrpages);
int __init z_erofs_gbuf_init(void);
void z_erofs_gbuf_exit(void);
-int erofs_init_managed_cache(struct super_block *sb);
int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
@@ -466,7 +494,7 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_subsystem(void) { return 0; }
static inline void z_erofs_exit_subsystem(void) {}
-static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; }
+static inline int z_erofs_init_super(struct super_block *sb) { return 0; }
#endif /* !CONFIG_EROFS_FS_ZIP */
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index c94d0c1608a8..f7cf4f41af28 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -100,7 +100,7 @@ static void *erofs_find_target_block(struct erofs_buf *target,
struct erofs_dirent *de;
buf.mapping = dir->i_mapping;
- de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP);
+ de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true);
if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff, bsz);
const int ndirents = nameoff / sizeof(*de);
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 827b62665649..1b529ace4db0 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -94,7 +94,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
int len, i, cnt;
*offset = round_up(*offset, 4);
- ptr = erofs_bread(buf, *offset, EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr))
return ptr;
@@ -110,7 +110,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
for (i = 0; i < len; i += cnt) {
cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset),
len - i);
- ptr = erofs_bread(buf, *offset, EROFS_KMAP);
+ ptr = erofs_bread(buf, *offset, true);
if (IS_ERR(ptr)) {
kfree(buffer);
return ptr;
@@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
struct erofs_deviceslot *dis;
struct file *file;
- dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP);
+ dis = erofs_read_metabuf(buf, sb, *pos, false);
if (IS_ERR(dis))
return PTR_ERR(dis);
@@ -165,12 +165,20 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
bdev_file_open_by_path(dif->path,
BLK_OPEN_READ, sb->s_type, NULL);
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ if (file == ERR_PTR(-ENOTBLK))
+ return -EINVAL;
return PTR_ERR(file);
+ }
if (!erofs_is_fileio_mode(sbi)) {
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
&dif->dax_part_off, NULL, NULL);
+ if (!dif->dax_dev && test_opt(&sbi->opt, DAX_ALWAYS)) {
+ erofs_info(sb, "DAX unsupported by %s. Turning off DAX.",
+ dif->path);
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
} else if (!S_ISREG(file_inode(file)->i_mode)) {
fput(file);
return -EINVAL;
@@ -178,8 +186,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
dif->file = file;
}
- dif->blocks = le32_to_cpu(dis->blocks);
- dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ dif->blocks = le32_to_cpu(dis->blocks_lo);
+ dif->uniaddr = le32_to_cpu(dis->uniaddr_lo);
sbi->total_blocks += dif->blocks;
*pos += EROFS_DEVT_SLOT_SIZE;
return 0;
@@ -207,8 +215,13 @@ static int erofs_scan_devices(struct super_block *sb,
ondisk_extradevs, sbi->devs->extra_devices);
return -EINVAL;
}
- if (!ondisk_extradevs)
+ if (!ondisk_extradevs) {
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && !sbi->dif0.dax_dev) {
+ erofs_info(sb, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
return 0;
+ }
if (!sbi->devs->extra_devices && !erofs_is_fscache_mode(sb))
sbi->devs->flatdev = true;
@@ -255,7 +268,7 @@ static int erofs_read_superblock(struct super_block *sb)
void *data;
int ret;
- data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ data = erofs_read_metabuf(&buf, sb, 0, false);
if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
return PTR_ERR(data);
@@ -268,7 +281,7 @@ static int erofs_read_superblock(struct super_block *sb)
goto out;
}
- sbi->blkszbits = dsb->blkszbits;
+ sbi->blkszbits = dsb->blkszbits;
if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) {
erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits);
goto out;
@@ -299,7 +312,7 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->sb_size);
goto out;
}
- sbi->dif0.blocks = le32_to_cpu(dsb->blocks);
+ sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
@@ -308,31 +321,39 @@ static int erofs_read_superblock(struct super_block *sb)
sbi->xattr_filter_reserved = dsb->xattr_filter_reserved;
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
- sbi->root_nid = le16_to_cpu(dsb->root_nid);
+ if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) {
+ sbi->root_nid = le64_to_cpu(dsb->rootnid_8b);
+ sbi->dif0.blocks = sbi->dif0.blocks |
+ ((u64)le16_to_cpu(dsb->rb.blocks_hi) << 32);
+ } else {
+ sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b);
+ }
sbi->packed_nid = le64_to_cpu(dsb->packed_nid);
+ if (erofs_sb_has_metabox(sbi)) {
+ if (sbi->sb_size <= offsetof(struct erofs_super_block,
+ metabox_nid))
+ return -EFSCORRUPTED;
+ sbi->metabox_nid = le64_to_cpu(dsb->metabox_nid);
+ if (sbi->metabox_nid & BIT_ULL(EROFS_DIRENT_NID_METABOX_BIT))
+ return -EFSCORRUPTED; /* self-loop detection */
+ }
sbi->inos = le64_to_cpu(dsb->inos);
- sbi->build_time = le64_to_cpu(dsb->build_time);
- sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec);
-
+ sbi->epoch = (s64)le64_to_cpu(dsb->epoch);
+ sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec);
super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid));
- ret = strscpy(sbi->volume_name, dsb->volume_name,
- sizeof(dsb->volume_name));
- if (ret < 0) { /* -E2BIG */
- erofs_err(sb, "bad volume name without NIL terminator");
- ret = -EFSCORRUPTED;
- goto out;
- }
-
/* parse on-disk compression configurations */
ret = z_erofs_parse_cfgs(sb, dsb);
if (ret < 0)
goto out;
- /* handle multiple devices */
ret = erofs_scan_devices(sb, dsb);
+ if (erofs_sb_has_48bit(sbi))
+ erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!");
+ if (erofs_sb_has_metabox(sbi))
+ erofs_info(sb, "EXPERIMENTAL metadata compression support in use. Use at your own risk!");
if (erofs_is_fscache_mode(sb))
erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. Use at your own risk!");
out:
@@ -357,8 +378,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi)
enum {
Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
- Opt_device, Opt_fsid, Opt_domain_id, Opt_directio,
- Opt_err
+ Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
};
static const struct constant_table erofs_param_cache_strategy[] = {
@@ -385,6 +405,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = {
fsparam_string("fsid", Opt_fsid),
fsparam_string("domain_id", Opt_domain_id),
fsparam_flag_no("directio", Opt_directio),
+ fsparam_u64("fsoffset", Opt_fsoffset),
{}
};
@@ -508,28 +529,59 @@ static int erofs_fc_parse_param(struct fs_context *fc,
errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
#endif
break;
+ case Opt_fsoffset:
+ sbi->dif0.fsoff = result.uint_64;
+ break;
}
return 0;
}
-static struct inode *erofs_nfs_get_inode(struct super_block *sb,
- u64 ino, u32 generation)
+static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
{
- return erofs_iget(sb, ino);
+ erofs_nid_t nid = EROFS_I(inode)->nid;
+ int len = parent ? 6 : 3;
+
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ fh[0] = (u32)(nid >> 32);
+ fh[1] = (u32)(nid & 0xffffffff);
+ fh[2] = inode->i_generation;
+
+ if (parent) {
+ nid = EROFS_I(parent)->nid;
+
+ fh[3] = (u32)(nid >> 32);
+ fh[4] = (u32)(nid & 0xffffffff);
+ fh[5] = parent->i_generation;
+ }
+
+ *max_len = len;
+ return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN;
}
static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if ((fh_type != FILEID_INO64_GEN &&
+ fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[0] << 32) | fid->raw[1]));
}
static struct dentry *erofs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type)
{
- return generic_fh_to_parent(sb, fid, fh_len, fh_type,
- erofs_nfs_get_inode);
+ if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6)
+ return NULL;
+
+ return d_obtain_alias(erofs_iget(sb,
+ ((u64)fid->raw[3] << 32) | fid->raw[4]));
}
static struct dentry *erofs_get_parent(struct dentry *child)
@@ -545,7 +597,7 @@ static struct dentry *erofs_get_parent(struct dentry *child)
}
static const struct export_operations erofs_export_ops = {
- .encode_fh = generic_encode_ino32_fh,
+ .encode_fh = erofs_encode_fh,
.fh_to_dentry = erofs_fh_to_dentry,
.fh_to_parent = erofs_fh_to_parent,
.get_parent = erofs_get_parent,
@@ -620,14 +672,17 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
}
}
- if (test_opt(&sbi->opt, DAX_ALWAYS)) {
- if (!sbi->dif0.dax_dev) {
- errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- } else if (sbi->blkszbits != PAGE_SHIFT) {
- errorfc(fc, "unsupported blocksize for DAX");
- clear_opt(&sbi->opt, DAX_ALWAYS);
- }
+ if (sbi->dif0.fsoff) {
+ if (sbi->dif0.fsoff & (sb->s_blocksize - 1))
+ return invalfc(fc, "fsoffset %llu is not aligned to block size %lu",
+ sbi->dif0.fsoff, sb->s_blocksize);
+ if (erofs_is_fscache_mode(sb))
+ return invalfc(fc, "cannot use fsoffset in fscache mode");
+ }
+
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && sbi->blkszbits != PAGE_SHIFT) {
+ erofs_info(sb, "unsupported blocksize for DAX");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
}
sb->s_time_gran = 1;
@@ -639,9 +694,22 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
else
sb->s_flags &= ~SB_POSIXACL;
-#ifdef CONFIG_EROFS_FS_ZIP
- xa_init(&sbi->managed_pslots);
-#endif
+ err = z_erofs_init_super(sb);
+ if (err)
+ return err;
+
+ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
+ inode = erofs_iget(sb, sbi->packed_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->packed_inode = inode;
+ }
+ if (erofs_sb_has_metabox(sbi)) {
+ inode = erofs_iget(sb, sbi->metabox_nid);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ sbi->metabox_inode = inode;
+ }
inode = erofs_iget(sb, sbi->root_nid);
if (IS_ERR(inode))
@@ -653,24 +721,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
iput(inode);
return -EINVAL;
}
-
sb->s_root = d_make_root(inode);
if (!sb->s_root)
return -ENOMEM;
erofs_shrinker_register(sb);
- if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) {
- sbi->packed_inode = erofs_iget(sb, sbi->packed_nid);
- if (IS_ERR(sbi->packed_inode)) {
- err = PTR_ERR(sbi->packed_inode);
- sbi->packed_inode = NULL;
- return err;
- }
- }
- err = erofs_init_managed_cache(sb);
- if (err)
- return err;
-
err = erofs_xattr_prefixes_init(sb);
if (err)
return err;
@@ -680,6 +735,7 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
if (err)
return err;
+ sbi->dir_ra_bytes = EROFS_DIR_RA_BYTES;
erofs_info(sb, "mounted with root inode @ nid %llu.", sbi->root_nid);
return 0;
}
@@ -806,6 +862,18 @@ static int erofs_init_fs_context(struct fs_context *fc)
return 0;
}
+static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi)
+{
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
+ iput(sbi->metabox_inode);
+ sbi->metabox_inode = NULL;
+#ifdef CONFIG_EROFS_FS_ZIP
+ iput(sbi->managed_cache);
+ sbi->managed_cache = NULL;
+#endif
+}
+
static void erofs_kill_sb(struct super_block *sb)
{
struct erofs_sb_info *sbi = EROFS_SB(sb);
@@ -815,6 +883,7 @@ static void erofs_kill_sb(struct super_block *sb)
kill_anon_super(sb);
else
kill_block_super(sb);
+ erofs_drop_internal_inodes(sbi);
fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
erofs_sb_free(sbi);
@@ -825,17 +894,10 @@ static void erofs_put_super(struct super_block *sb)
{
struct erofs_sb_info *const sbi = EROFS_SB(sb);
- DBG_BUGON(!sbi);
-
erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
erofs_xattr_prefixes_cleanup(sb);
-#ifdef CONFIG_EROFS_FS_ZIP
- iput(sbi->managed_cache);
- sbi->managed_cache = NULL;
-#endif
- iput(sbi->packed_inode);
- sbi->packed_inode = NULL;
+ erofs_drop_internal_inodes(sbi);
erofs_free_dev_context(sbi->devs);
sbi->devs = NULL;
erofs_fscache_unregister_fs(sb);
@@ -951,6 +1013,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root)
if (sbi->domain_id)
seq_printf(seq, ",domain_id=%s", sbi->domain_id);
#endif
+ if (sbi->dif0.fsoff)
+ seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
return 0;
}
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
index 19d586273b70..1e0658a1d95b 100644
--- a/fs/erofs/sysfs.c
+++ b/fs/erofs/sysfs.c
@@ -7,12 +7,14 @@
#include <linux/kobject.h>
#include "internal.h"
+#include "compress.h"
enum {
attr_feature,
attr_drop_caches,
attr_pointer_ui,
attr_pointer_bool,
+ attr_accel,
};
enum {
@@ -60,12 +62,25 @@ static struct erofs_attr erofs_attr_##_name = { \
EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
EROFS_ATTR_FUNC(drop_caches, 0200);
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+EROFS_ATTR_FUNC(accel, 0644);
+#endif
+EROFS_ATTR_RW_UI(dir_ra_bytes, erofs_sb_info);
-static struct attribute *erofs_attrs[] = {
+static struct attribute *erofs_sb_attrs[] = {
#ifdef CONFIG_EROFS_FS_ZIP
ATTR_LIST(sync_decompress),
ATTR_LIST(drop_caches),
#endif
+ ATTR_LIST(dir_ra_bytes),
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_sb);
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ ATTR_LIST(accel),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(erofs);
@@ -81,6 +96,8 @@ EROFS_ATTR_FEATURE(sb_chksum);
EROFS_ATTR_FEATURE(ztailpacking);
EROFS_ATTR_FEATURE(fragments);
EROFS_ATTR_FEATURE(dedupe);
+EROFS_ATTR_FEATURE(48bit);
+EROFS_ATTR_FEATURE(metabox);
static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(zero_padding),
@@ -93,6 +110,8 @@ static struct attribute *erofs_feat_attrs[] = {
ATTR_LIST(ztailpacking),
ATTR_LIST(fragments),
ATTR_LIST(dedupe),
+ ATTR_LIST(48bit),
+ ATTR_LIST(metabox),
NULL,
};
ATTRIBUTE_GROUPS(erofs_feat);
@@ -126,12 +145,14 @@ static ssize_t erofs_attr_show(struct kobject *kobj,
if (!ptr)
return 0;
return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ case attr_accel:
+ return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n');
}
return 0;
}
static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t len)
+ const char *buf, size_t len)
{
struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
s_kobj);
@@ -180,6 +201,19 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1);
return len;
#endif
+#ifdef CONFIG_EROFS_FS_ZIP_ACCEL
+ case attr_accel:
+ buf = skip_spaces(buf);
+ z_erofs_crypto_disable_all_engines();
+ while (*buf) {
+ t = strcspn(buf, "\n");
+ ret = z_erofs_crypto_enable_engine(buf, t);
+ if (ret < 0)
+ return ret;
+ buf += buf[t] != '\0' ? t + 1 : t;
+ }
+ return len;
+#endif
}
return 0;
}
@@ -197,12 +231,13 @@ static const struct sysfs_ops erofs_attr_ops = {
};
static const struct kobj_type erofs_sb_ktype = {
- .default_groups = erofs_groups,
+ .default_groups = erofs_sb_groups,
.sysfs_ops = &erofs_attr_ops,
.release = erofs_sb_release,
};
static const struct kobj_type erofs_ktype = {
+ .default_groups = erofs_groups,
.sysfs_ops = &erofs_attr_ops,
};
@@ -246,6 +281,12 @@ void erofs_unregister_sysfs(struct super_block *sb)
}
}
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
+
int __init erofs_init_sysfs(void)
{
int ret;
@@ -253,24 +294,12 @@ int __init erofs_init_sysfs(void)
kobject_set_name(&erofs_root.kobj, "erofs");
erofs_root.kobj.parent = fs_kobj;
ret = kset_register(&erofs_root);
- if (ret)
- goto root_err;
-
- ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
- NULL, "features");
- if (ret)
- goto feat_err;
- return ret;
-
-feat_err:
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-root_err:
+ if (!ret) {
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (!ret)
+ return 0;
+ erofs_exit_sysfs();
+ }
return ret;
}
-
-void erofs_exit_sysfs(void)
-{
- kobject_put(&erofs_feat);
- kset_unregister(&erofs_root);
-}
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index 7940241d9355..eaa9efd766ee 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -72,16 +72,18 @@ static int erofs_init_inode_xattrs(struct inode *inode)
ret = -EFSCORRUPTED;
goto out_unlock; /* xattr ondisk layout error */
}
- ret = -ENOATTR;
+ ret = -ENODATA;
goto out_unlock;
}
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, sb);
+ ret = erofs_init_metabuf(&it.buf, sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ goto out_unlock;
it.pos = erofs_iloc(inode) + vi->inode_isize;
/* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
ret = PTR_ERR(it.kaddr);
goto out_unlock;
@@ -102,7 +104,7 @@ static int erofs_init_inode_xattrs(struct inode *inode)
it.pos += sizeof(struct erofs_xattr_ibody_header);
for (i = 0; i < vi->xattr_shared_count; ++i) {
- it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP);
+ it.kaddr = erofs_bread(&it.buf, it.pos, true);
if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
@@ -183,7 +185,7 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it,
void *src;
for (processed = 0; processed < len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -266,27 +268,27 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
(entry.e_name_index & EROFS_XATTR_LONG_PREFIX_MASK);
if (pf >= sbi->xattr_prefixes + sbi->xattr_prefix_count)
- return -ENOATTR;
+ return -ENODATA;
if (it->index != pf->prefix->base_index ||
it->name.len != entry.e_name_len + pf->infix_len)
- return -ENOATTR;
+ return -ENODATA;
if (memcmp(it->name.name, pf->prefix->infix, pf->infix_len))
- return -ENOATTR;
+ return -ENODATA;
it->infix_len = pf->infix_len;
} else {
if (it->index != entry.e_name_index ||
it->name.len != entry.e_name_len)
- return -ENOATTR;
+ return -ENODATA;
it->infix_len = 0;
}
/* 2. handle xattr name */
for (processed = 0; processed < entry.e_name_len; processed += slice) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -295,7 +297,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it)
entry.e_name_len - processed);
if (memcmp(it->name.name + it->infix_len + processed,
it->kaddr, slice))
- return -ENOATTR;
+ return -ENODATA;
it->pos += slice;
}
@@ -323,14 +325,17 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
sizeof(u32) * vi->xattr_shared_count;
if (xattr_header_sz >= vi->xattr_isize) {
DBG_BUGON(xattr_header_sz > vi->xattr_isize);
- return -ENOATTR;
+ return -ENODATA;
}
+ ret = erofs_init_metabuf(&it->buf, it->sb, erofs_inode_in_metabox(inode));
+ if (ret)
+ return ret;
remaining = vi->xattr_isize - xattr_header_sz;
it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz;
while (remaining) {
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -347,7 +352,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it,
ret = erofs_getxattr_foreach(it);
else
ret = erofs_listxattr_foreach(it);
- if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
it->pos = next_pos;
@@ -361,13 +366,18 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = it->sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- unsigned int i;
- int ret = -ENOATTR;
+ unsigned int i = 0;
+ int ret;
- for (i = 0; i < vi->xattr_shared_count; ++i) {
+ ret = erofs_init_metabuf(&it->buf, sb,
+ erofs_sb_has_shared_ea_in_metabox(sbi));
+ if (ret)
+ return ret;
+
+ while (i < vi->xattr_shared_count) {
it->pos = erofs_pos(sb, sbi->xattr_blkaddr) +
- vi->xattr_shared_xattrs[i] * sizeof(__le32);
- it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP);
+ vi->xattr_shared_xattrs[i++] * sizeof(__le32);
+ it->kaddr = erofs_bread(&it->buf, it->pos, true);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
@@ -375,10 +385,10 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it,
ret = erofs_getxattr_foreach(it);
else
ret = erofs_listxattr_foreach(it);
- if ((getxattr && ret != -ENOATTR) || (!getxattr && ret))
+ if ((getxattr && ret != -ENODATA) || (!getxattr && ret))
break;
}
- return ret;
+ return i ? ret : -ENODATA;
}
int erofs_getxattr(struct inode *inode, int index, const char *name,
@@ -403,23 +413,22 @@ int erofs_getxattr(struct inode *inode, int index, const char *name,
EROFS_XATTR_FILTER_SEED + index);
hashbit &= EROFS_XATTR_FILTER_BITS - 1;
if (vi->xattr_name_filter & (1U << hashbit))
- return -ENOATTR;
+ return -ENODATA;
}
it.index = index;
- it.name = (struct qstr)QSTR_INIT(name, strlen(name));
+ it.name = QSTR(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
it.sb = inode->i_sb;
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, it.sb);
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
ret = erofs_xattr_iter_inline(&it, inode, true);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
ret = erofs_xattr_iter_shared(&it, inode, true);
erofs_put_metabuf(&it.buf);
return ret ? ret : it.buffer_ofs;
@@ -432,23 +441,22 @@ ssize_t erofs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = d_inode(dentry);
ret = erofs_init_inode_xattrs(inode);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
return 0;
if (ret)
return ret;
it.sb = dentry->d_sb;
it.buf = __EROFS_BUF_INITIALIZER;
- erofs_init_metabuf(&it.buf, it.sb);
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
it.buffer_ofs = 0;
ret = erofs_xattr_iter_inline(&it, inode, false);
- if (!ret || ret == -ENOATTR)
+ if (!ret || ret == -ENODATA)
ret = erofs_xattr_iter_shared(&it, inode, false);
- if (ret == -ENOATTR)
+ if (ret == -ENODATA)
ret = 0;
erofs_put_metabuf(&it.buf);
return ret ? ret : it.buffer_ofs;
@@ -485,7 +493,7 @@ int erofs_xattr_prefixes_init(struct super_block *sb)
if (sbi->packed_inode)
buf.mapping = sbi->packed_inode->i_mapping;
else
- erofs_init_metabuf(&buf, sb);
+ (void)erofs_init_metabuf(&buf, sb, false);
for (i = 0; i < sbi->xattr_prefix_count; i++) {
void *ptr = erofs_read_metadata(sb, &buf, &pos, &len);
@@ -539,7 +547,7 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
rc = erofs_getxattr(inode, prefix, "", value, rc);
}
- if (rc == -ENOATTR)
+ if (rc == -ENODATA)
acl = NULL;
else if (rc < 0)
acl = ERR_PTR(rc);
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index b246cd0e135e..6317caa8413e 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -10,9 +10,6 @@
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
-/* Attribute not found */
-#define ENOATTR ENODATA
-
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 29f8963bb523..2d73297003d2 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -44,8 +44,8 @@ struct z_erofs_pcluster {
/* A: point to next chained pcluster or TAILs */
struct z_erofs_pcluster *next;
- /* I: start block address of this pcluster */
- erofs_off_t index;
+ /* I: start physical position of this pcluster */
+ erofs_off_t pos;
/* L: the maximum decompression size of this round */
unsigned int length;
@@ -73,12 +73,12 @@ struct z_erofs_pcluster {
/* I: compression algorithm format */
unsigned char algorithmformat;
+ /* I: whether compressed data is in-lined or not */
+ bool from_meta;
+
/* L: whether partial decompression or not */
bool partial;
- /* L: indicate several pageofs_outs or not */
- bool multibases;
-
/* L: whether extra buffer allocations are best-effort */
bool besteffort;
@@ -102,14 +102,9 @@ struct z_erofs_decompressqueue {
bool eio, sync;
};
-static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
-{
- return !pcl->index;
-}
-
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
- return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
+ return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT;
}
static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
@@ -133,7 +128,7 @@ struct z_erofs_pcluster_slab {
static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
- _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1)
};
struct z_erofs_bvec_iter {
@@ -267,7 +262,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
- pcl->pclustersize = size;
return pcl;
}
return ERR_PTR(-EINVAL);
@@ -294,6 +288,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly;
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;
+static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0);
static void erofs_destroy_percpu_workers(void)
{
@@ -339,12 +334,8 @@ static int erofs_init_percpu_workers(void)
}
return 0;
}
-#else
-static inline void erofs_destroy_percpu_workers(void) {}
-static inline int erofs_init_percpu_workers(void) { return 0; }
-#endif
-#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
+#ifdef CONFIG_HOTPLUG_CPU
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;
@@ -401,17 +392,56 @@ static void erofs_cpu_hotplug_destroy(void)
if (erofs_cpuhp_state)
cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
-#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
+#else /* !CONFIG_HOTPLUG_CPU */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
-#endif
+#endif/* CONFIG_HOTPLUG_CPU */
+static int z_erofs_init_pcpu_workers(struct super_block *sb)
+{
+ int err;
-void z_erofs_exit_subsystem(void)
+ if (atomic_xchg(&erofs_percpu_workers_initialized, 1))
+ return 0;
+
+ err = erofs_init_percpu_workers();
+ if (err) {
+ erofs_err(sb, "per-cpu workers: failed to allocate.");
+ goto err_init_percpu_workers;
+ }
+
+ err = erofs_cpu_hotplug_init();
+ if (err < 0) {
+ erofs_err(sb, "per-cpu workers: failed CPU hotplug init.");
+ goto err_cpuhp_init;
+ }
+ erofs_info(sb, "initialized per-cpu workers successfully.");
+ return err;
+
+err_cpuhp_init:
+ erofs_destroy_percpu_workers();
+err_init_percpu_workers:
+ atomic_set(&erofs_percpu_workers_initialized, 0);
+ return err;
+}
+
+static void z_erofs_destroy_pcpu_workers(void)
{
+ if (!atomic_xchg(&erofs_percpu_workers_initialized, 0))
+ return;
erofs_cpu_hotplug_destroy();
erofs_destroy_percpu_workers();
+}
+#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */
+static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; }
+static inline void z_erofs_destroy_pcpu_workers(void) {}
+#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */
+
+void z_erofs_exit_subsystem(void)
+{
+ z_erofs_destroy_pcpu_workers();
destroy_workqueue(z_erofs_workqueue);
z_erofs_destroy_pcluster_pool();
+ z_erofs_crypto_disable_all_engines();
z_erofs_exit_decompressor();
}
@@ -433,19 +463,8 @@ int __init z_erofs_init_subsystem(void)
goto err_workqueue_init;
}
- err = erofs_init_percpu_workers();
- if (err)
- goto err_pcpu_worker;
-
- err = erofs_cpu_hotplug_init();
- if (err < 0)
- goto err_cpuhp_init;
return err;
-err_cpuhp_init:
- erofs_destroy_percpu_workers();
-err_pcpu_worker:
- destroy_workqueue(z_erofs_workqueue);
err_workqueue_init:
z_erofs_destroy_pcluster_pool();
err_pcluster_pool:
@@ -516,6 +535,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
struct z_erofs_pcluster *pcl = fe->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
+ pgoff_t poff = pcl->pos >> PAGE_SHIFT;
bool may_bypass = true;
/* Optimistic allocation, as in-place I/O can be used as a fallback */
gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
@@ -532,7 +552,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe)
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- folio = filemap_get_folio(mc, pcl->index + i);
+ folio = filemap_get_folio(mc, poff + i);
if (IS_ERR(folio)) {
may_bypass = false;
if (!shouldalloc)
@@ -575,7 +595,7 @@ static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
struct folio *folio;
int i;
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ DBG_BUGON(pcl->from_meta);
/* Each cached folio contains one page unless bs > ps is supported */
for (i = 0; i < pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page) {
@@ -607,7 +627,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
ret = false;
spin_lock(&pcl->lockref.lock);
if (pcl->lockref.count <= 0) {
- DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ DBG_BUGON(pcl->from_meta);
for (; bvec < end; ++bvec) {
if (bvec->page && page_folio(bvec->page) == folio) {
bvec->page = NULL;
@@ -644,18 +664,24 @@ static const struct address_space_operations z_erofs_cache_aops = {
.invalidate_folio = z_erofs_cache_invalidate_folio,
};
-int erofs_init_managed_cache(struct super_block *sb)
+int z_erofs_init_super(struct super_block *sb)
{
- struct inode *const inode = new_inode(sb);
+ struct inode *inode;
+ int err;
+
+ err = z_erofs_init_pcpu_workers(sb);
+ if (err)
+ return err;
+ inode = new_inode(sb);
if (!inode)
return -ENOMEM;
-
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
+ xa_init(&EROFS_SB(sb)->managed_pslots);
return 0;
}
@@ -667,16 +693,20 @@ static int z_erofs_attach_page(struct z_erofs_frontend *fe,
int ret;
if (exclusive) {
- /* give priority for inplaceio to use file pages first */
- spin_lock(&pcl->lockref.lock);
- while (fe->icur > 0) {
- if (pcl->compressed_bvecs[--fe->icur].page)
- continue;
- pcl->compressed_bvecs[fe->icur] = *bvec;
+ /* Inplace I/O is limited to one page for uncompressed data */
+ if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX ||
+ fe->icur <= 1) {
+ /* Try to prioritize inplace I/O here */
+ spin_lock(&pcl->lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->lockref.lock);
+ return 0;
+ }
spin_unlock(&pcl->lockref.lock);
- return 0;
}
- spin_unlock(&pcl->lockref.lock);
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
@@ -711,27 +741,25 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl, *pre;
+ unsigned int pageofs_in;
int err;
- if (!(map->m_flags & EROFS_MAP_ENCODED) ||
- (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
-
- /* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(map->m_plen);
+ pageofs_in = erofs_blkoff(sb, map->m_pa);
+ pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
- lockref_init(&pcl->lockref, 1); /* one ref for this request */
+ lockref_init(&pcl->lockref); /* one ref for this request */
pcl->algorithmformat = map->m_algorithmformat;
+ pcl->pclustersize = map->m_plen;
pcl->length = 0;
pcl->partial = true;
pcl->next = fe->head;
+ pcl->pos = map->m_pa;
+ pcl->pageofs_in = pageofs_in;
pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ pcl->from_meta = map->m_flags & EROFS_MAP_META;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
@@ -741,13 +769,10 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe)
mutex_init(&pcl->lock);
DBG_BUGON(!mutex_trylock(&pcl->lock));
- if (ztailpacking) {
- pcl->index = 0; /* which indicates ztailpacking */
- } else {
- pcl->index = erofs_blknr(sb, map->m_pa);
+ if (!pcl->from_meta) {
while (1) {
xa_lock(&sbi->managed_pslots);
- pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index,
+ pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos,
NULL, pcl, GFP_KERNEL);
if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) {
xa_unlock(&sbi->managed_pslots);
@@ -779,8 +804,8 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
struct super_block *sb = fe->inode->i_sb;
- erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
struct z_erofs_pcluster *pcl = NULL;
+ void *ptr;
int ret;
DBG_BUGON(fe->pcl);
@@ -790,9 +815,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
if (!(map->m_flags & EROFS_MAP_META)) {
while (1) {
rcu_read_lock();
- pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr);
+ pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa);
if (!pcl || z_erofs_get_pcluster(pcl)) {
- DBG_BUGON(pcl && blknr != pcl->index);
+ DBG_BUGON(pcl && map->m_pa != pcl->pos);
rcu_read_unlock();
break;
}
@@ -826,19 +851,21 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe)
z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
- if (!z_erofs_is_inline_pcluster(fe->pcl)) {
+ if (!fe->pcl->from_meta) {
/* bind cache first when cached decompression is preferred */
z_erofs_bind_cache(fe);
} else {
- void *mptr;
-
- mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP);
- if (IS_ERR(mptr)) {
- ret = PTR_ERR(mptr);
- erofs_err(sb, "failed to get inline data %d", ret);
+ ret = erofs_init_metabuf(&map->buf, sb,
+ erofs_inode_in_metabox(fe->inode));
+ if (ret)
+ return ret;
+ ptr = erofs_bread(&map->buf, map->m_pa, false);
+ if (IS_ERR(ptr)) {
+ ret = PTR_ERR(ptr);
+ erofs_err(sb, "failed to get inline folio %d", ret);
return ret;
}
- get_page(map->buf.page);
+ folio_get(page_folio(map->buf.page));
WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
@@ -871,7 +898,7 @@ static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi,
* It's impossible to fail after the pcluster is freezed, but in order
* to avoid some race conditions, add a DBG_BUGON to observe this.
*/
- DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl);
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl);
lockref_mark_dead(&pcl->lockref);
return true;
@@ -967,7 +994,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio,
buf.mapping = packed_inode->i_mapping;
for (; cur < end; cur += cnt, pos += cnt) {
cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos));
- src = erofs_bread(&buf, pos, EROFS_KMAP);
+ src = erofs_bread(&buf, pos, true);
if (IS_ERR(src)) {
erofs_put_metabuf(&buf);
return PTR_ERR(src);
@@ -1010,7 +1037,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
folio_zero_segment(folio, cur, end);
tight = false;
- } else if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ } else if (map->m_flags & __EROFS_MAP_FRAGMENT) {
erofs_off_t fpos = offset + cur - map->m_la;
err = z_erofs_read_fragment(inode->i_sb, folio, cur,
@@ -1050,8 +1077,6 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
break;
erofs_onlinefolio_split(folio);
- if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
- f->pcl->multibases = true;
if (f->pcl->length < offset + end - map->m_la) {
f->pcl->length = offset + end - map->m_la;
f->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
@@ -1069,7 +1094,7 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f,
tight = (bs == PAGE_SIZE);
}
} while ((end = cur) > 0);
- erofs_onlinefolio_end(folio, err);
+ erofs_onlinefolio_end(folio, err, false);
return err;
}
@@ -1097,7 +1122,6 @@ struct z_erofs_backend {
struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
struct super_block *sb;
struct z_erofs_pcluster *pcl;
-
/* pages with the longest decompressed length for deduplication */
struct page **decompressed_pages;
/* pages to keep the compressed data */
@@ -1106,6 +1130,8 @@ struct z_erofs_backend {
struct list_head decompressed_secondary_bvecs;
struct page **pagepool;
unsigned int onstack_used, nr_pages;
+ /* indicate if temporary copies should be preserved for later use */
+ bool keepxcpy;
};
struct z_erofs_bvec_item {
@@ -1116,18 +1142,20 @@ struct z_erofs_bvec_item {
static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be,
struct z_erofs_bvec *bvec)
{
+ int poff = bvec->offset + be->pcl->pageofs_out;
struct z_erofs_bvec_item *item;
- unsigned int pgnr;
-
- if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
- (bvec->end == PAGE_SIZE ||
- bvec->offset + bvec->end == be->pcl->length)) {
- pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
- DBG_BUGON(pgnr >= be->nr_pages);
- if (!be->decompressed_pages[pgnr]) {
- be->decompressed_pages[pgnr] = bvec->page;
+ struct page **page;
+
+ if (!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE ||
+ bvec->offset + bvec->end == be->pcl->length)) {
+ DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages);
+ page = be->decompressed_pages + (poff >> PAGE_SHIFT);
+ if (!*page) {
+ *page = bvec->page;
return;
}
+ } else {
+ be->keepxcpy = true;
}
/* (cold path) one pcluster is requested multiple times */
@@ -1171,7 +1199,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_backend *be, int err)
cur += len;
}
kunmap_local(dst);
- erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
+ erofs_onlinefolio_end(page_folio(bvi->bvec.page), err, true);
list_del(p);
kfree(bvi);
}
@@ -1221,7 +1249,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped)
}
be->compressed_pages[i] = page;
- if (z_erofs_is_inline_pcluster(pcl) ||
+ if (pcl->from_meta ||
erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
if (!PageUptodate(page))
err = -EIO;
@@ -1284,6 +1312,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
+ .inpages = pclusterpages,
+ .outpages = be->nr_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
.inputsize = pcl->pclustersize,
@@ -1291,16 +1321,15 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
- .fillgaps = pcl->multibases,
+ .fillgaps = be->keepxcpy,
.gfp = pcl->besteffort ? GFP_KERNEL :
GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
/* must handle all compressed pages before actual file pages */
- if (z_erofs_is_inline_pcluster(pcl)) {
- page = pcl->compressed_bvecs[0].page;
+ if (pcl->from_meta) {
+ folio_put(page_folio(pcl->compressed_bvecs[0].page));
WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
- put_page(page);
} else {
/* managed folios are still left in compressed_bvecs[] */
for (i = 0; i < pclusterpages; ++i) {
@@ -1328,7 +1357,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
DBG_BUGON(z_erofs_page_is_invalidated(page));
if (!z_erofs_is_shortlived_page(page)) {
- erofs_onlinefolio_end(page_folio(page), err);
+ erofs_onlinefolio_end(page_folio(page), err, true);
continue;
}
if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) {
@@ -1348,7 +1377,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
pcl->length = 0;
pcl->partial = true;
- pcl->multibases = false;
pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
@@ -1357,7 +1385,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err)
WRITE_ONCE(pcl->next, NULL);
mutex_unlock(&pcl->lock);
- if (z_erofs_is_inline_pcluster(pcl))
+ if (pcl->from_meta)
z_erofs_free_pcluster(pcl);
else
z_erofs_put_pcluster(sbi, pcl, try_free);
@@ -1404,6 +1432,16 @@ static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
}
#endif
+/* Use (kthread_)work in atomic contexts to minimize scheduling overhead */
+static inline bool z_erofs_in_atomic(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPTION) && rcu_preempt_depth())
+ return true;
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return true;
+ return !preemptible();
+}
+
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
int bios)
{
@@ -1418,8 +1456,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
if (atomic_add_return(bios, &io->pending_bios))
return;
- /* Use (kthread_)work and sync decompression for atomic contexts only */
- if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
+ if (z_erofs_in_atomic()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
struct kthread_worker *worker;
@@ -1538,7 +1575,7 @@ out_allocfolio:
folio = page_folio(page);
out_tocache:
if (!tocache || bs != PAGE_SIZE ||
- filemap_add_folio(mc, folio, pcl->index + nr, gfp)) {
+ filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return;
@@ -1655,19 +1692,20 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f,
pcl = next;
next = READ_ONCE(pcl->next);
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (pcl->from_meta) {
z_erofs_move_to_bypass_queue(pcl, next, qtail);
continue;
}
/* no device id here, thus it will always succeed */
mdev = (struct erofs_map_dev) {
- .m_pa = erofs_pos(sb, pcl->index),
+ .m_pa = round_down(pcl->pos, sb->s_blocksize),
};
(void)erofs_map_dev(sb, &mdev);
cur = mdev.m_pa;
- end = cur + pcl->pclustersize;
+ end = round_up(cur + pcl->pageofs_in + pcl->pclustersize,
+ sb->s_blocksize);
do {
bvec.bv_page = NULL;
if (bio && (cur != last_pa ||
@@ -1711,7 +1749,8 @@ drain_io:
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_endio;
- bio->bi_iter.bi_sector = cur >> 9;
+ bio->bi_iter.bi_sector =
+ (mdev.m_dif->fsoff + cur) >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
@@ -1859,13 +1898,12 @@ static void z_erofs_readahead(struct readahead_control *rac)
{
struct inode *const inode = rac->mapping->host;
Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac));
- struct folio *head = NULL, *folio;
unsigned int nrpages = readahead_count(rac);
+ struct folio *head = NULL, *folio;
int err;
+ trace_erofs_readahead(inode, readahead_index(rac), nrpages, false);
z_erofs_pcluster_readmore(&f, rac, true);
- nrpages = readahead_count(rac);
- trace_erofs_readpages(inode, readahead_index(rac), nrpages, false);
while ((folio = readahead_folio(rac))) {
folio->private = head;
head = folio;
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 689437e99a5a..a93efd95c555 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -17,7 +17,7 @@ struct z_erofs_maprecorder {
u16 delta[2];
erofs_blk_t pblk, compressedblks;
erofs_off_t nextpackoff;
- bool partialref;
+ bool partialref, in_mbox;
};
static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
@@ -25,13 +25,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) +
+ const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) +
vi->inode_isize + vi->xattr_isize) +
lcn * sizeof(struct z_erofs_lcluster_index);
struct z_erofs_lcluster_index *di;
unsigned int advise;
- di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
if (IS_ERR(di))
return PTR_ERR(di);
m->lcn = lcn;
@@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
advise = le16_to_cpu(di->di_advise);
m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
- m->clusterofs = 1 << vi->z_logical_clusterbits;
+ m->clusterofs = 1 << vi->z_lclusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) {
if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
@@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
} else {
m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF);
m->clusterofs = le16_to_cpu(di->di_clusterofs);
- if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
+ if (m->clusterofs >= 1 << vi->z_lclusterbits) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
@@ -102,9 +102,9 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
- ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize);
+ const unsigned int lclusterbits = vi->z_lclusterbits;
const unsigned int totalidx = erofs_iblks(inode);
unsigned int compacted_4b_initial, compacted_2b, amortizedshift;
unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
@@ -146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
- in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP);
+ in = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, m->in_mbox);
if (IS_ERR(in))
return PTR_ERR(in);
@@ -240,6 +240,13 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
static int z_erofs_load_lcluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int lcn, bool lookahead)
{
+ if (m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) {
+ erofs_err(m->inode->i_sb, "unknown type %u @ lcn %u of nid %llu",
+ m->type, lcn, EROFS_I(m->inode)->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+
switch (EROFS_I(m->inode)->datalayout) {
case EROFS_INODE_COMPRESSED_FULL:
return z_erofs_load_full_lcluster(m, lcn);
@@ -255,7 +262,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
{
struct super_block *sb = m->inode->i_sb;
struct erofs_inode *const vi = EROFS_I(m->inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
+ const unsigned int lclusterbits = vi->z_lclusterbits;
while (m->lcn >= lookback_distance) {
unsigned long lcn = m->lcn - lookback_distance;
@@ -265,26 +272,17 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
if (err)
return err;
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
lookback_distance = m->delta[0];
if (!lookback_distance)
- goto err_bogus;
+ break;
continue;
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
+ } else {
m->headtype = m->type;
m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
return 0;
- default:
- erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
}
-err_bogus:
erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu",
lookback_distance, m->lcn, vi->nid);
DBG_BUGON(1);
@@ -308,7 +306,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) ||
((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) ||
- (lcn << vi->z_logical_clusterbits) >= inode->i_size)
+ (lcn << vi->z_lclusterbits) >= inode->i_size)
m->compressedblks = 1;
if (m->compressedblks)
@@ -329,35 +327,21 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
DBG_BUGON(lcn == initial_lcn &&
m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD);
- switch (m->type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
- /*
- * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
- * rather than CBLKCNT, it's a 1 block-sized pcluster.
- */
- m->compressedblks = 1;
- break;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
- if (m->delta[0] != 1)
- goto err_bonus_cblkcnt;
- if (m->compressedblks)
- break;
- fallthrough;
- default:
- erofs_err(sb, "cannot found CBLKCNT @ lcn %lu of nid %llu", lcn,
- vi->nid);
+ if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD && m->delta[0] != 1) {
+ erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
DBG_BUGON(1);
return -EFSCORRUPTED;
}
+
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type rather
+ * than CBLKCNT, it's a 1 block-sized pcluster.
+ */
+ if (m->type != Z_EROFS_LCLUSTER_TYPE_NONHEAD || !m->compressedblks)
+ m->compressedblks = 1;
out:
m->map->m_plen = erofs_pos(sb, m->compressedblks);
return 0;
-err_bonus_cblkcnt:
- erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
}
static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
@@ -365,7 +349,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
struct inode *inode = m->inode;
struct erofs_inode *vi = EROFS_I(inode);
struct erofs_map_blocks *map = m->map;
- unsigned int lclusterbits = vi->z_logical_clusterbits;
+ unsigned int lclusterbits = vi->z_lclusterbits;
u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
int err;
@@ -386,17 +370,10 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
m->delta[1] = 1;
DBG_BUGON(1);
}
- } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 ||
- m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
+ } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) {
if (lcn != headlcn)
break; /* ends at the next HEAD lcluster */
m->delta[1] = 1;
- } else {
- erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
- m->type, lcn, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
}
lcn += m->delta[1];
}
@@ -404,23 +381,32 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
return 0;
}
-static int z_erofs_do_map_blocks(struct inode *inode,
+static int z_erofs_map_blocks_fo(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
- struct erofs_inode *const vi = EROFS_I(inode);
- bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ bool ztailpacking = vi->z_idata_size;
+ unsigned int lclusterbits = vi->z_lclusterbits;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
+ .in_mbox = erofs_inode_in_metabox(inode),
};
int err = 0;
- unsigned int lclusterbits, endoff, afmt;
+ unsigned int endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
- lclusterbits = vi->z_logical_clusterbits;
ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+ if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_FRAGMENT;
+ return 0;
+ }
initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
@@ -428,52 +414,38 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (err)
goto unmap_out;
- if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
- vi->z_idataoff = m.nextpackoff;
-
+ if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking)
+ vi->z_fragmentoff = m.nextpackoff;
map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
- switch (m.type) {
- case Z_EROFS_LCLUSTER_TYPE_PLAIN:
- case Z_EROFS_LCLUSTER_TYPE_HEAD1:
- case Z_EROFS_LCLUSTER_TYPE_HEAD2:
- if (endoff >= m.clusterofs) {
- m.headtype = m.type;
- map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
- /*
- * For ztailpacking files, in order to inline data more
- * effectively, special EOF lclusters are now supported
- * which can have three parts at most.
- */
- if (ztailpacking && end > inode->i_size)
- end = inode->i_size;
- break;
- }
- /* m.lcn should be >= 1 if endoff < m.clusterofs */
- if (!m.lcn) {
- erofs_err(inode->i_sb,
- "invalid logical cluster 0 at nid %llu",
- vi->nid);
- err = -EFSCORRUPTED;
- goto unmap_out;
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD && endoff >= m.clusterofs) {
+ m.headtype = m.type;
+ map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
+ } else {
+ if (m.type != Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
+ /* m.lcn should be >= 1 if endoff < m.clusterofs */
+ if (!m.lcn) {
+ erofs_err(sb, "invalid logical cluster 0 at nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
+ end = (m.lcn << lclusterbits) | m.clusterofs;
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ m.delta[0] = 1;
}
- end = (m.lcn << lclusterbits) | m.clusterofs;
- map->m_flags |= EROFS_MAP_FULL_MAPPED;
- m.delta[0] = 1;
- fallthrough;
- case Z_EROFS_LCLUSTER_TYPE_NONHEAD:
/* get the corresponding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
- break;
- default:
- erofs_err(inode->i_sb,
- "unknown type %u @ offset %llu of nid %llu",
- m.type, ofs, vi->nid);
- err = -EOPNOTSUPP;
- goto unmap_out;
}
if (m.partialref)
map->m_flags |= EROFS_MAP_PARTIAL_REF;
@@ -487,12 +459,18 @@ static int z_erofs_do_map_blocks(struct inode *inode,
}
if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
map->m_flags |= EROFS_MAP_META;
- map->m_pa = vi->z_idataoff;
+ map->m_pa = vi->z_fragmentoff;
map->m_plen = vi->z_idata_size;
+ if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map->m_plen);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
} else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
- map->m_flags |= EROFS_MAP_FRAGMENT;
+ map->m_flags = EROFS_MAP_FRAGMENT;
} else {
- map->m_pa = erofs_pos(inode->i_sb, m.pblk);
+ map->m_pa = erofs_pos(sb, m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto unmap_out;
@@ -511,7 +489,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
- erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ erofs_err(sb, "inconsistent algorithmtype %u for nid %llu",
afmt, vi->nid);
err = -EFSCORRUPTED;
goto unmap_out;
@@ -535,13 +513,121 @@ unmap_out:
return err;
}
-static int z_erofs_fill_inode_lazy(struct inode *inode)
+static int z_erofs_map_blocks_ext(struct inode *inode,
+ struct erofs_map_blocks *map, int flags)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER;
+ unsigned int recsz = z_erofs_extent_recsize(vi->z_advise);
+ erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) +
+ vi->inode_isize + vi->xattr_isize), recsz);
+ bool in_mbox = erofs_inode_in_metabox(inode);
+ erofs_off_t lend = inode->i_size;
+ erofs_off_t l, r, mid, pa, la, lstart;
+ struct z_erofs_extent *ext;
+ unsigned int fmt;
+ bool last;
+
+ map->m_flags = 0;
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) {
+ if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ pa = le64_to_cpu(*(__le64 *)ext);
+ pos += sizeof(__le64);
+ lstart = 0;
+ } else {
+ lstart = round_down(map->m_la, 1 << vi->z_lclusterbits);
+ pos += (lstart >> vi->z_lclusterbits) * recsz;
+ pa = EROFS_NULL_ADDR;
+ }
+
+ for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) {
+ ext = erofs_read_metabuf(&map->buf, sb, pos, in_mbox);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+ map->m_plen = le32_to_cpu(ext->plen);
+ if (pa != EROFS_NULL_ADDR) {
+ map->m_pa = pa;
+ pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK;
+ } else {
+ map->m_pa = le32_to_cpu(ext->pstart_lo);
+ }
+ pos += recsz;
+ }
+ last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits));
+ lend = min(lstart, lend);
+ lstart -= 1 << vi->z_lclusterbits;
+ } else {
+ lstart = lend;
+ for (l = 0, r = vi->z_extents; l < r; ) {
+ mid = l + (r - l) / 2;
+ ext = erofs_read_metabuf(&map->buf, sb,
+ pos + mid * recsz, in_mbox);
+ if (IS_ERR(ext))
+ return PTR_ERR(ext);
+
+ la = le32_to_cpu(ext->lstart_lo);
+ pa = le32_to_cpu(ext->pstart_lo) |
+ (u64)le32_to_cpu(ext->pstart_hi) << 32;
+ if (recsz > offsetof(struct z_erofs_extent, lstart_hi))
+ la |= (u64)le32_to_cpu(ext->lstart_hi) << 32;
+
+ if (la > map->m_la) {
+ r = mid;
+ if (la > lend) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lend = la;
+ } else {
+ l = mid + 1;
+ if (map->m_la == la)
+ r = min(l + 1, r);
+ lstart = la;
+ map->m_plen = le32_to_cpu(ext->plen);
+ map->m_pa = pa;
+ }
+ }
+ last = (l >= vi->z_extents);
+ }
+
+ if (lstart < lend) {
+ map->m_la = lstart;
+ if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
+ map->m_flags = EROFS_MAP_FRAGMENT;
+ vi->z_fragmentoff = map->m_plen;
+ if (recsz > offsetof(struct z_erofs_extent, pstart_lo))
+ vi->z_fragmentoff |= map->m_pa << 32;
+ } else if (map->m_plen) {
+ map->m_flags |= EROFS_MAP_MAPPED |
+ EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED;
+ fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT;
+ if (fmt)
+ map->m_algorithmformat = fmt - 1;
+ else if (interlaced && !erofs_blkoff(sb, map->m_pa))
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
+ map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK;
+ }
+ }
+ map->m_llen = lend - map->m_la;
+ return 0;
+}
+
+static int z_erofs_fill_inode(struct inode *inode, struct erofs_map_blocks *map)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
int err, headnr;
erofs_off_t pos;
- struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct z_erofs_map_header *h;
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
@@ -561,7 +647,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto out_unlock;
pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
- h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP);
+ h = erofs_read_metabuf(&map->buf, sb, pos, erofs_inode_in_metabox(inode));
if (IS_ERR(h)) {
err = PTR_ERR(h);
goto out_unlock;
@@ -578,8 +664,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
goto done;
}
vi->z_advise = le16_to_cpu(h->h_advise);
+ vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15);
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) {
+ vi->z_extents = le32_to_cpu(h->h_extents_lo) |
+ ((u64)le16_to_cpu(h->h_extents_hi) << 32);
+ goto done;
+ }
+
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER)
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
headnr = 0;
if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
@@ -587,17 +685,16 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
- goto out_put_metabuf;
+ goto out_unlock;
}
- vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7);
if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto out_put_metabuf;
+ goto out_unlock;
}
if (vi->datalayout == EROFS_INODE_COMPRESSED_COMPACT &&
!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
@@ -605,48 +702,25 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
vi->nid);
err = -EFSCORRUPTED;
- goto out_put_metabuf;
- }
-
- if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
- struct erofs_map_blocks map = {
- .buf = __EROFS_BUF_INITIALIZER
- };
-
- vi->z_idata_size = le16_to_cpu(h->h_idata_size);
- err = z_erofs_do_map_blocks(inode, &map,
- EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
-
- if (!map.m_plen ||
- erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) {
- erofs_err(sb, "invalid tail-packing pclustersize %llu",
- map.m_plen);
- err = -EFSCORRUPTED;
- }
- if (err < 0)
- goto out_put_metabuf;
+ goto out_unlock;
}
- if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
- !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
- struct erofs_map_blocks map = {
+ if (vi->z_idata_size ||
+ (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) {
+ struct erofs_map_blocks tm = {
.buf = __EROFS_BUF_INITIALIZER
};
- vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
- err = z_erofs_do_map_blocks(inode, &map,
+ err = z_erofs_map_blocks_fo(inode, &tm,
EROFS_GET_BLOCKS_FINDTAIL);
- erofs_put_metabuf(&map.buf);
+ erofs_put_metabuf(&tm.buf);
if (err < 0)
- goto out_put_metabuf;
+ goto out_unlock;
}
done:
/* paired with smp_mb() at the beginning of the function */
smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-out_put_metabuf:
- erofs_put_metabuf(&buf);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
@@ -664,17 +738,13 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
map->m_la = inode->i_size;
map->m_flags = 0;
} else {
- err = z_erofs_fill_inode_lazy(inode);
+ err = z_erofs_fill_inode(inode, map);
if (!err) {
- if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
- !vi->z_tailextent_headlcn) {
- map->m_la = 0;
- map->m_llen = inode->i_size;
- map->m_flags = EROFS_MAP_MAPPED |
- EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT;
- } else {
- err = z_erofs_do_map_blocks(inode, map, flags);
- }
+ if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL &&
+ (vi->z_advise & Z_EROFS_ADVISE_EXTENTS))
+ err = z_erofs_map_blocks_ext(inode, map, flags);
+ else
+ err = z_erofs_map_blocks_fo(inode, map, flags);
}
if (!err && (map->m_flags & EROFS_MAP_ENCODED) &&
unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE ||
@@ -704,7 +774,7 @@ static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
iomap->length = map.m_llen;
if (map.m_flags & EROFS_MAP_MAPPED) {
iomap->type = IOMAP_MAPPED;
- iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ iomap->addr = map.m_flags & __EROFS_MAP_FRAGMENT ?
IOMAP_NULL_ADDR : map.m_pa;
} else {
iomap->type = IOMAP_HOLE;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 76129bfcd663..af42b2c7d235 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags)
if (fd < 0)
goto err;
- file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
+ file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
+ ctx, flags, FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(fd);
fd = PTR_ERR(file);
goto err;
}
-
- file->f_mode |= FMODE_NOWAIT;
fd_install(fd, file);
return fd;
err:
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7c0980db77b3..b22d6f819f78 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -218,6 +218,7 @@ struct eventpoll {
/* used to optimize loop detection check */
u64 gen;
struct hlist_head refs;
+ u8 loop_check_depth;
/*
* usage count, used together with epitem->dying to
@@ -438,7 +439,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*
* we must do our busy polling with irqs enabled
*/
-static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
u16 budget = READ_ONCE(ep->busy_poll_budget);
@@ -447,8 +448,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
if (!budget)
budget = BUSY_POLL_BUDGET;
- if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) {
+ napi_busy_loop(napi_id, ep_busy_loop_end,
ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
@@ -492,7 +493,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
* or
* Nothing to do if we already have this ID
*/
- if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+ if (!napi_id_valid(napi_id) || napi_id == ep->napi_id)
return;
/* record NAPI ID for use in next busy poll */
@@ -546,7 +547,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
napi_suspend_irqs(napi_id);
}
@@ -554,13 +555,13 @@ static void ep_resume_napi_irqs(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
- if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll))
+ if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll))
napi_resume_irqs(napi_id);
}
#else
-static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static inline bool ep_busy_loop(struct eventpoll *ep)
{
return false;
}
@@ -883,7 +884,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
kfree_rcu(epi, rcu);
percpu_counter_dec(&ep->user->epoll_watches);
- return ep_refcount_dec_and_test(ep);
+ return true;
}
/*
@@ -891,14 +892,14 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
*/
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
- WARN_ON_ONCE(__ep_remove(ep, epi, false));
+ if (__ep_remove(ep, epi, false))
+ WARN_ON_ONCE(ep_refcount_dec_and_test(ep));
}
static void ep_clear_and_put(struct eventpoll *ep)
{
struct rb_node *rbp, *next;
struct epitem *epi;
- bool dispose;
/* We need to release all tasks waiting for these file */
if (waitqueue_active(&ep->poll_wait))
@@ -931,10 +932,8 @@ static void ep_clear_and_put(struct eventpoll *ep)
cond_resched();
}
- dispose = ep_refcount_dec_and_test(ep);
mutex_unlock(&ep->mtx);
-
- if (dispose)
+ if (ep_refcount_dec_and_test(ep))
ep_free(ep);
}
@@ -1137,7 +1136,7 @@ again:
dispose = __ep_remove(ep, epi, true);
mutex_unlock(&ep->mtx);
- if (dispose)
+ if (dispose && ep_refcount_dec_and_test(ep))
ep_free(ep);
goto again;
}
@@ -1980,6 +1979,30 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
return ret;
}
+static int ep_try_send_events(struct eventpoll *ep,
+ struct epoll_event __user *events, int maxevents)
+{
+ int res;
+
+ /*
+ * Try to transfer events to user space. In case we get 0 events and
+ * there's still timeout left over, we go trying again in search of
+ * more luck.
+ */
+ res = ep_send_events(ep, events, maxevents);
+ if (res > 0)
+ ep_suspend_napi_irqs(ep);
+ return res;
+}
+
+static int ep_schedule_timeout(ktime_t *to)
+{
+ if (to)
+ return ktime_after(*to, ktime_get());
+ else
+ return 1;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller-supplied
* event buffer.
@@ -2031,23 +2054,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
while (1) {
if (eavail) {
- /*
- * Try to transfer events to user space. In case we get
- * 0 events and there's still timeout left over, we go
- * trying again in search of more luck.
- */
- res = ep_send_events(ep, events, maxevents);
- if (res) {
- if (res > 0)
- ep_suspend_napi_irqs(ep);
+ res = ep_try_send_events(ep, events, maxevents);
+ if (res)
return res;
- }
}
if (timed_out)
return 0;
- eavail = ep_busy_loop(ep, timed_out);
+ eavail = ep_busy_loop(ep);
if (eavail)
continue;
@@ -2096,8 +2111,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
write_unlock_irq(&ep->lock);
if (!eavail)
- timed_out = !schedule_hrtimeout_range(to, slack,
- HRTIMER_MODE_ABS);
+ timed_out = !ep_schedule_timeout(to) ||
+ !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
@@ -2125,23 +2141,24 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
}
/**
- * ep_loop_check_proc - verify that adding an epoll file inside another
- * epoll structure does not violate the constraints, in
- * terms of closed loops, or too deep chains (which can
- * result in excessive stack usage).
+ * ep_loop_check_proc - verify that adding an epoll file @ep inside another
+ * epoll file does not create closed loops, and
+ * determine the depth of the subtree starting at @ep
*
* @ep: the &struct eventpoll to be currently checked.
* @depth: Current depth of the path being checked.
*
- * Return: %zero if adding the epoll @file inside current epoll
- * structure @ep does not violate the constraints, or %-1 otherwise.
+ * Return: depth of the subtree, or INT_MAX if we found a loop or went too deep.
*/
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
- int error = 0;
+ int result = 0;
struct rb_node *rbp;
struct epitem *epi;
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+
mutex_lock_nested(&ep->mtx, depth + 1);
ep->gen = loop_check_gen;
for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
@@ -2149,13 +2166,11 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
if (unlikely(is_file_epoll(epi->ffd.file))) {
struct eventpoll *ep_tovisit;
ep_tovisit = epi->ffd.file->private_data;
- if (ep_tovisit->gen == loop_check_gen)
- continue;
if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
- error = -1;
+ result = INT_MAX;
else
- error = ep_loop_check_proc(ep_tovisit, depth + 1);
- if (error != 0)
+ result = max(result, ep_loop_check_proc(ep_tovisit, depth + 1) + 1);
+ if (result > EP_MAX_NESTS)
break;
} else {
/*
@@ -2169,9 +2184,25 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
list_file(epi->ffd.file);
}
}
+ ep->loop_check_depth = result;
mutex_unlock(&ep->mtx);
- return error;
+ return result;
+}
+
+/* ep_get_upwards_depth_proc - determine depth of @ep when traversed upwards */
+static int ep_get_upwards_depth_proc(struct eventpoll *ep, int depth)
+{
+ int result = 0;
+ struct epitem *epi;
+
+ if (ep->gen == loop_check_gen)
+ return ep->loop_check_depth;
+ hlist_for_each_entry_rcu(epi, &ep->refs, fllink)
+ result = max(result, ep_get_upwards_depth_proc(epi->ep, depth + 1) + 1);
+ ep->gen = loop_check_gen;
+ ep->loop_check_depth = result;
+ return result;
}
/**
@@ -2187,8 +2218,22 @@ static int ep_loop_check_proc(struct eventpoll *ep, int depth)
*/
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
+ int depth, upwards_depth;
+
inserting_into = ep;
- return ep_loop_check_proc(to, 0);
+ /*
+ * Check how deep down we can get from @to, and whether it is possible
+ * to loop up to @ep.
+ */
+ depth = ep_loop_check_proc(to, 0);
+ if (depth > EP_MAX_NESTS)
+ return -1;
+ /* Check how far up we can go from @ep. */
+ rcu_read_lock();
+ upwards_depth = ep_get_upwards_depth_proc(ep, 0);
+ rcu_read_unlock();
+
+ return (depth+1+upwards_depth > EP_MAX_NESTS) ? -1 : 0;
}
static void clear_tfile_check_list(void)
@@ -2445,6 +2490,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
+static int ep_check_params(struct file *file, struct epoll_event __user *evs,
+ int maxevents)
+{
+ /* The maximum number of event must be greater than zero */
+ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+ return -EINVAL;
+
+ /* Verify that the area passed by the user is writeable */
+ if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
+ return -EFAULT;
+
+ /*
+ * We have to check that the file structure underneath the fd
+ * the user passed to us _is_ an eventpoll file.
+ */
+ if (!is_file_epoll(file))
+ return -EINVAL;
+
+ return 0;
+}
+
+int epoll_sendevents(struct file *file, struct epoll_event __user *events,
+ int maxevents)
+{
+ struct eventpoll *ep;
+ int ret;
+
+ ret = ep_check_params(file, events, maxevents);
+ if (unlikely(ret))
+ return ret;
+
+ ep = file->private_data;
+ /*
+ * Racy call, but that's ok - it should get retried based on
+ * poll readiness anyway.
+ */
+ if (ep_events_available(ep))
+ return ep_try_send_events(ep, events, maxevents);
+ return 0;
+}
+
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
@@ -2453,26 +2539,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
struct eventpoll *ep;
-
- /* The maximum number of event must be greater than zero */
- if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
- return -EINVAL;
-
- /* Verify that the area passed by the user is writeable */
- if (!access_ok(events, maxevents * sizeof(struct epoll_event)))
- return -EFAULT;
+ int ret;
/* Get the "struct file *" for the eventpoll file */
CLASS(fd, f)(epfd);
if (fd_empty(f))
return -EBADF;
- /*
- * We have to check that the file structure underneath the fd
- * the user passed to us _is_ an eventpoll file.
- */
- if (!is_file_epoll(fd_file(f)))
- return -EINVAL;
+ ret = ep_check_params(fd_file(f), events, maxevents);
+ if (unlikely(ret))
+ return ret;
/*
* At this point it is safe to assume that the "private_data" contains
diff --git a/fs/exec.c b/fs/exec.c
index 506cd411f4ac..2a1e5e4042a1 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -78,6 +78,9 @@
#include <trace/events/sched.h>
+/* For vma exec functions. */
+#include "../mm/internal.h"
+
static int bprm_creds_from_file(struct linux_binprm *bprm);
int suid_dumpable = 0;
@@ -111,70 +114,13 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
bool path_noexec(const struct path *path)
{
+ /* If it's an anonymous inode make sure that we catch any shenanigans. */
+ VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) &&
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC));
return (path->mnt->mnt_flags & MNT_NOEXEC) ||
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
-#ifdef CONFIG_USELIB
-/*
- * Note that a shared library must be both readable and executable due to
- * security reasons.
- *
- * Also note that we take the address to load from the file itself.
- */
-SYSCALL_DEFINE1(uselib, const char __user *, library)
-{
- struct linux_binfmt *fmt;
- struct file *file;
- struct filename *tmp = getname(library);
- int error = PTR_ERR(tmp);
- static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY,
- .acc_mode = MAY_READ | MAY_EXEC,
- .intent = LOOKUP_OPEN,
- .lookup_flags = LOOKUP_FOLLOW,
- };
-
- if (IS_ERR(tmp))
- goto out;
-
- file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
- putname(tmp);
- error = PTR_ERR(file);
- if (IS_ERR(file))
- goto out;
-
- /*
- * Check do_open_execat() for an explanation.
- */
- error = -EACCES;
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
- path_noexec(&file->f_path))
- goto exit;
-
- error = -ENOEXEC;
-
- read_lock(&binfmt_lock);
- list_for_each_entry(fmt, &formats, lh) {
- if (!fmt->load_shlib)
- continue;
- if (!try_module_get(fmt->module))
- continue;
- read_unlock(&binfmt_lock);
- error = fmt->load_shlib(file);
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
- if (error != -ENOEXEC)
- break;
- }
- read_unlock(&binfmt_lock);
-exit:
- fput(file);
-out:
- return error;
-}
-#endif /* #ifdef CONFIG_USELIB */
-
#ifdef CONFIG_MMU
/*
* The nascent bprm->mm is not visible until exec_mmap() but it can
@@ -242,60 +188,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
- int err;
- struct vm_area_struct *vma = NULL;
- struct mm_struct *mm = bprm->mm;
-
- bprm->vma = vma = vm_area_alloc(mm);
- if (!vma)
- return -ENOMEM;
- vma_set_anonymous(vma);
-
- if (mmap_write_lock_killable(mm)) {
- err = -EINTR;
- goto err_free;
- }
-
- /*
- * Need to be called with mmap write lock
- * held, to avoid race with ksmd.
- */
- err = ksm_execve(mm);
- if (err)
- goto err_ksm;
-
- /*
- * Place the stack at the largest stack address the architecture
- * supports. Later, we'll move this to an appropriate place. We don't
- * use STACK_TOP because that can depend on attributes which aren't
- * configured yet.
- */
- BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
- vma->vm_end = STACK_TOP_MAX;
- vma->vm_start = vma->vm_end - PAGE_SIZE;
- vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
- vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
-
- err = insert_vm_struct(mm, vma);
- if (err)
- goto err;
-
- mm->stack_vm = mm->total_vm = 1;
- mmap_write_unlock(mm);
- bprm->p = vma->vm_end - sizeof(void *);
- return 0;
-err:
- ksm_exit(mm);
-err_ksm:
- mmap_write_unlock(mm);
-err_free:
- bprm->vma = NULL;
- vm_area_free(vma);
- return err;
-}
-
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= MAX_ARG_STRLEN;
@@ -348,12 +240,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
{
}
-static int __bprm_mm_init(struct linux_binprm *bprm)
-{
- bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
- return 0;
-}
-
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
return len <= bprm->p;
@@ -382,9 +268,13 @@ static int bprm_mm_init(struct linux_binprm *bprm)
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
- err = __bprm_mm_init(bprm);
+#ifndef CONFIG_MMU
+ bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
+#else
+ err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p);
if (err)
goto err;
+#endif
return 0;
@@ -714,7 +604,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = bprm->vma;
struct vm_area_struct *prev = NULL;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
unsigned long stack_base;
unsigned long stack_size;
unsigned long stack_expand;
@@ -755,8 +645,6 @@ int setup_arg_pages(struct linux_binprm *bprm,
mm->arg_start = bprm->p;
#endif
- if (bprm->loader)
- bprm->loader -= stack_shift;
bprm->exec -= stack_shift;
if (mmap_write_lock_killable(mm))
@@ -896,13 +784,15 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (IS_ERR(file))
return file;
+ if (path_noexec(&file->f_path))
+ return ERR_PTR(-EACCES);
+
/*
* In the past the regular type check was here. It moved to may_open() in
* 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
* an invariant that all non-regular files error out before we get here.
*/
- if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
- path_noexec(&file->f_path))
+ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)))
return ERR_PTR(-EACCES);
err = exe_file_deny_write_access(file);
@@ -1229,13 +1119,12 @@ int begin_new_exec(struct linux_binprm * bprm)
*/
bprm->point_of_no_return = true;
- /*
- * Make this the only thread in the thread group.
- */
+ /* Make this the only thread in the thread group */
retval = de_thread(me);
if (retval)
goto out;
-
+ /* see the comment in check_unsafe_exec() */
+ current->fs->in_exec = 0;
/*
* Cancel any io_uring activity across execve
*/
@@ -1497,6 +1386,8 @@ static void free_bprm(struct linux_binprm *bprm)
}
free_arg_pages(bprm);
if (bprm->cred) {
+ /* in case exec fails before de_thread() succeeds */
+ current->fs->in_exec = 0;
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
@@ -1618,9 +1509,13 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* suid exec because the differently privileged task
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
+ *
+ * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
+ * from another sub-thread until de_thread() succeeds, this
+ * state is protected by cred_guard_mutex we hold.
*/
n_fs = 1;
- spin_lock(&p->fs->lock);
+ read_seqlock_excl(&p->fs->seq);
rcu_read_lock();
for_other_threads(p, t) {
if (t->fs == p->fs)
@@ -1633,7 +1528,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
p->fs->in_exec = 1;
- spin_unlock(&p->fs->lock);
+ read_sequnlock_excl(&p->fs->seq);
}
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
@@ -1861,10 +1756,9 @@ static int bprm_execve(struct linux_binprm *bprm)
goto out;
sched_mm_cid_after_execve(current);
+ rseq_execve(current);
/* execve succeeded */
- current->fs->in_exec = 0;
current->in_execve = 0;
- rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
@@ -1881,7 +1775,7 @@ out:
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
- current->fs->in_exec = 0;
+ rseq_set_notify_resume(current);
current->in_execve = 0;
return retval;
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index ce9be95c9172..cc01556c9d9b 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -141,36 +141,28 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync)
return 0;
}
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
+int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync)
{
int i, b;
unsigned int ent_idx;
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- struct exfat_mount_options *opts = &sbi->options;
if (!is_valid_cluster(sbi, clu))
- return;
+ return -EIO;
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
- clear_bit_le(b, sbi->vol_amap[i]->b_data);
- exfat_update_bh(sbi->vol_amap[i], sync);
+ if (!test_bit_le(b, sbi->vol_amap[i]->b_data))
+ return -EIO;
- if (opts->discard) {
- int ret_discard;
+ clear_bit_le(b, sbi->vol_amap[i]->b_data);
- ret_discard = sb_issue_discard(sb,
- exfat_cluster_to_sector(sbi, clu),
- (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0);
+ exfat_update_bh(sbi->vol_amap[i], sync);
- if (ret_discard == -EOPNOTSUPP) {
- exfat_err(sb, "discard not supported by device, disabling");
- opts->discard = 0;
- }
- }
+ return 0;
}
/*
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 3103b932b674..ee060e26f51d 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -996,6 +996,7 @@ int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
struct exfat_hint_femp candi_empty;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
int num_entries = exfat_calc_num_entries(p_uniname);
+ unsigned int clu_count = 0;
if (num_entries < 0)
return num_entries;
@@ -1133,6 +1134,10 @@ rewind:
} else {
if (exfat_get_next_cluster(sb, &clu.dir))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ goto not_found;
}
}
@@ -1195,6 +1200,7 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
int i, count = 0;
int dentries_per_clu;
unsigned int entry_type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -1227,6 +1233,12 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ if (unlikely(++clu_count > sbi->used_clusters)) {
+ exfat_fs_error(sb, "FAT or bitmap is corrupted");
+ return -EIO;
+ }
+
}
}
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 78be6964a8a0..f8ead4d47ef0 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -14,8 +14,6 @@
#define EXFAT_ROOT_INO 1
-#define EXFAT_CLUSTERS_UNTRACKED (~0u)
-
/*
* exfat error flags
*/
@@ -456,7 +454,7 @@ int exfat_count_num_clusters(struct super_block *sb,
int exfat_load_bitmap(struct super_block *sb);
void exfat_free_bitmap(struct exfat_sb_info *sbi);
int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync);
-void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
+int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync);
unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu);
int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count);
int exfat_trim_fs(struct inode *inode, struct fstrim_range *range);
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index 9e5492ac409b..232cc7f8ab92 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -144,6 +144,20 @@ int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
return 0;
}
+static inline void exfat_discard_cluster(struct super_block *sb,
+ unsigned int clu, unsigned int num_clusters)
+{
+ int ret;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+ ret = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, clu),
+ sbi->sect_per_clus * num_clusters, GFP_NOFS, 0);
+ if (ret == -EOPNOTSUPP) {
+ exfat_err(sb, "discard not supported by device, disabling");
+ sbi->options.discard = 0;
+ }
+}
+
/* This function must be called with bitmap_lock held */
static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain)
{
@@ -175,6 +189,7 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu));
if (p_chain->flags == ALLOC_NO_FAT_CHAIN) {
+ int err;
unsigned int last_cluster = p_chain->dir + p_chain->size - 1;
do {
bool sync = false;
@@ -189,11 +204,18 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
cur_cmap_i = next_cmap_i;
}
- exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ err = exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ if (err)
+ break;
clu++;
num_clusters++;
} while (num_clusters < p_chain->size);
+
+ if (sbi->options.discard)
+ exfat_discard_cluster(sb, p_chain->dir, p_chain->size);
} else {
+ unsigned int nr_clu = 1;
+
do {
bool sync = false;
unsigned int n_clu = clu;
@@ -210,12 +232,23 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
cur_cmap_i = next_cmap_i;
}
- exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)));
+ if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))))
+ break;
+
+ if (sbi->options.discard) {
+ if (n_clu == clu + 1)
+ nr_clu++;
+ else {
+ exfat_discard_cluster(sb, clu - nr_clu + 1, nr_clu);
+ nr_clu = 1;
+ }
+ }
+
clu = n_clu;
num_clusters++;
if (err)
- goto dec_used_clus;
+ break;
if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) {
/*
@@ -229,7 +262,6 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain
} while (clu != EXFAT_EOF_CLUSTER);
}
-dec_used_clus:
sbi->used_clusters -= num_clusters;
return 0;
}
@@ -262,7 +294,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain,
clu = next;
if (exfat_ent_get(sb, clu, &next))
return -EIO;
- } while (next != EXFAT_EOF_CLUSTER);
+ } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size);
if (p_chain->size != count) {
exfat_fs_error(sb,
@@ -458,5 +490,15 @@ int exfat_count_num_clusters(struct super_block *sb,
}
*ret_count = count;
+
+ /*
+ * since exfat_count_used_clusters() is not called, sbi->used_clusters
+ * cannot be used here.
+ */
+ if (unlikely(i == sbi->num_clusters && clu != EXFAT_EOF_CLUSTER)) {
+ exfat_fs_error(sb, "The cluster chain has a loop");
+ return -EIO;
+ }
+
return 0;
}
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index 05b51e721783..538d2b6ac2ec 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -532,11 +532,10 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
-static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
+static int exfat_extend_valid_size(struct inode *inode, loff_t new_valid_size)
{
int err;
loff_t pos;
- struct inode *inode = file_inode(file);
struct exfat_inode_info *ei = EXFAT_I(inode);
struct address_space *mapping = inode->i_mapping;
const struct address_space_operations *ops = mapping->a_ops;
@@ -551,14 +550,14 @@ static int exfat_extend_valid_size(struct file *file, loff_t new_valid_size)
if (pos + len > new_valid_size)
len = new_valid_size - pos;
- err = ops->write_begin(file, mapping, pos, len, &folio, NULL);
+ err = ops->write_begin(NULL, mapping, pos, len, &folio, NULL);
if (err)
goto out;
off = offset_in_folio(folio, pos);
folio_zero_new_buffers(folio, off, off + len);
- err = ops->write_end(file, mapping, pos, len, len, folio, NULL);
+ err = ops->write_end(NULL, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
pos += len;
@@ -582,12 +581,15 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t pos = iocb->ki_pos;
loff_t valid_size;
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
inode_lock(inode);
valid_size = ei->valid_size;
ret = generic_write_checks(iocb, iter);
- if (ret < 0)
+ if (ret <= 0)
goto unlock;
if (iocb->ki_flags & IOCB_DIRECT) {
@@ -601,7 +603,7 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
}
if (pos > valid_size) {
- ret = exfat_extend_valid_size(file, pos);
+ ret = exfat_extend_valid_size(inode, pos);
if (ret < 0 && ret != -ENOSPC) {
exfat_err(inode->i_sb,
"write: fail to zero from %llu to %llu(%zd)",
@@ -620,9 +622,8 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (pos > valid_size)
pos = valid_size;
- if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) {
- ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1,
- iocb->ki_flags & IOCB_SYNC);
+ if (iocb->ki_pos > pos) {
+ ssize_t err = generic_write_sync(iocb, iocb->ki_pos - pos);
if (err < 0)
return err;
}
@@ -635,6 +636,16 @@ unlock:
return ret;
}
+static ssize_t exfat_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (unlikely(exfat_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ return generic_file_read_iter(iocb, iter);
+}
+
static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
{
int err;
@@ -652,7 +663,7 @@ static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf)
start + vma->vm_end - vma->vm_start);
if (ei->valid_size < end) {
- err = exfat_extend_valid_size(file, end);
+ err = exfat_extend_valid_size(inode, end);
if (err < 0) {
inode_unlock(inode);
return vmf_fs_error(err);
@@ -670,24 +681,38 @@ static const struct vm_operations_struct exfat_file_vm_ops = {
.page_mkwrite = exfat_page_mkwrite,
};
-static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int exfat_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
+ if (unlikely(exfat_forced_shutdown(file_inode(desc->file)->i_sb)))
+ return -EIO;
+
file_accessed(file);
- vma->vm_ops = &exfat_file_vm_ops;
+ desc->vm_ops = &exfat_file_vm_ops;
return 0;
}
+static ssize_t exfat_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags)
+{
+ if (unlikely(exfat_forced_shutdown(file_inode(in)->i_sb)))
+ return -EIO;
+
+ return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
const struct file_operations exfat_file_operations = {
.llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
+ .read_iter = exfat_file_read_iter,
.write_iter = exfat_file_write_iter,
.unlocked_ioctl = exfat_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = exfat_compat_ioctl,
#endif
- .mmap = exfat_file_mmap,
+ .mmap_prepare = exfat_file_mmap_prepare,
.fsync = exfat_file_fsync,
- .splice_read = filemap_splice_read,
+ .splice_read = exfat_splice_read,
.splice_write = iter_file_splice_write,
};
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 96952d4acb50..c10844e1e16c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -274,9 +274,11 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
sector_t last_block;
sector_t phys = 0;
sector_t valid_blks;
+ loff_t i_size;
mutex_lock(&sbi->s_lock);
- last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb);
+ i_size = i_size_read(inode);
+ last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size, sb);
if (iblock >= last_block && !create)
goto done;
@@ -305,77 +307,99 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
if (buffer_delay(bh_result))
clear_buffer_delay(bh_result);
- if (create) {
+ /*
+ * In most cases, we just need to set bh_result to mapped, unmapped
+ * or new status as follows:
+ * 1. i_size == valid_size
+ * 2. write case (create == 1)
+ * 3. direct_read (!bh_result->b_folio)
+ * -> the unwritten part will be zeroed in exfat_direct_IO()
+ *
+ * Otherwise, in the case of buffered read, it is necessary to take
+ * care the last nested block if valid_size is not equal to i_size.
+ */
+ if (i_size == ei->valid_size || create || !bh_result->b_folio)
valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb);
+ else
+ valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb);
- if (iblock + max_blocks < valid_blks) {
- /* The range has been written, map it */
- goto done;
- } else if (iblock < valid_blks) {
- /*
- * The range has been partially written,
- * map the written part.
- */
- max_blocks = valid_blks - iblock;
- goto done;
- }
+ /* The range has been fully written, map it */
+ if (iblock + max_blocks < valid_blks)
+ goto done;
- /* The area has not been written, map and mark as new. */
- set_buffer_new(bh_result);
+ /* The range has been partially written, map the written part */
+ if (iblock < valid_blks) {
+ max_blocks = valid_blks - iblock;
+ goto done;
+ }
+ /* The area has not been written, map and mark as new for create case */
+ if (create) {
+ set_buffer_new(bh_result);
ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb);
mark_inode_dirty(inode);
- } else {
- valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb);
+ goto done;
+ }
- if (iblock + max_blocks < valid_blks) {
- /* The range has been written, map it */
+ /*
+ * The area has just one block partially written.
+ * In that case, we should read and fill the unwritten part of
+ * a block with zero.
+ */
+ if (bh_result->b_folio && iblock == valid_blks &&
+ (ei->valid_size & (sb->s_blocksize - 1))) {
+ loff_t size, pos;
+ void *addr;
+
+ max_blocks = 1;
+
+ /*
+ * No buffer_head is allocated.
+ * (1) bmap: It's enough to set blocknr without I/O.
+ * (2) read: The unwritten part should be filled with zero.
+ * If a folio does not have any buffers,
+ * let's returns -EAGAIN to fallback to
+ * block_read_full_folio() for per-bh IO.
+ */
+ if (!folio_buffers(bh_result->b_folio)) {
+ err = -EAGAIN;
goto done;
- } else if (iblock < valid_blks) {
- /*
- * The area has been partially written,
- * map the written part.
- */
- max_blocks = valid_blks - iblock;
+ }
+
+ pos = EXFAT_BLK_TO_B(iblock, sb);
+ size = ei->valid_size - pos;
+ addr = folio_address(bh_result->b_folio) +
+ offset_in_folio(bh_result->b_folio, pos);
+
+ /* Check if bh->b_data points to proper addr in folio */
+ if (bh_result->b_data != addr) {
+ exfat_fs_error_ratelimit(sb,
+ "b_data(%p) != folio_addr(%p)",
+ bh_result->b_data, addr);
+ err = -EINVAL;
goto done;
- } else if (iblock == valid_blks &&
- (ei->valid_size & (sb->s_blocksize - 1))) {
- /*
- * The block has been partially written,
- * zero the unwritten part and map the block.
- */
- loff_t size, off, pos;
-
- max_blocks = 1;
-
- /*
- * For direct read, the unwritten part will be zeroed in
- * exfat_direct_IO()
- */
- if (!bh_result->b_folio)
- goto done;
-
- pos = EXFAT_BLK_TO_B(iblock, sb);
- size = ei->valid_size - pos;
- off = pos & (PAGE_SIZE - 1);
-
- folio_set_bh(bh_result, bh_result->b_folio, off);
- err = bh_read(bh_result, 0);
- if (err < 0)
- goto unlock_ret;
-
- folio_zero_segment(bh_result->b_folio, off + size,
- off + sb->s_blocksize);
- } else {
- /*
- * The range has not been written, clear the mapped flag
- * to only zero the cache and do not read from disk.
- */
- clear_buffer_mapped(bh_result);
}
+
+ /* Read a block */
+ err = bh_read(bh_result, 0);
+ if (err < 0)
+ goto done;
+
+ /* Zero unwritten part of a block */
+ memset(bh_result->b_data + size, 0, bh_result->b_size - size);
+ err = 0;
+ goto done;
}
+
+ /*
+ * The area has not been written, clear mapped for read/bmap cases.
+ * If so, it will be filled with zero without reading from disk.
+ */
+ clear_buffer_mapped(bh_result);
done:
bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb);
+ if (err < 0)
+ clear_buffer_mapped(bh_result);
unlock_ret:
mutex_unlock(&sbi->s_lock);
return err;
@@ -422,9 +446,10 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int exfat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct folio **foliop, void **fsdata)
+static int exfat_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -439,15 +464,16 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int exfat_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct folio *folio, void *fsdata)
+static int exfat_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
struct exfat_inode_info *ei = EXFAT_I(inode);
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
exfat_write_failed(mapping, pos+len);
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 691dd77b6ab5..f5f1c4e8a29f 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -232,7 +232,7 @@ static int exfat_search_empty_slot(struct super_block *sb,
dentry = 0;
}
- while (dentry + num_entries < total_entries &&
+ while (dentry + num_entries <= total_entries &&
clu.dir != EXFAT_EOF_CLUSTER) {
i = dentry & (dentries_per_clu - 1);
@@ -646,6 +646,11 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size);
info->size = le64_to_cpu(ep2->dentry.stream.size);
+ if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) {
+ exfat_fs_error(sb, "data size is invalid(%lld)", info->size);
+ return -EIO;
+ }
+
info->start_clu = le32_to_cpu(ep2->dentry.stream.start_clu);
if (!is_valid_cluster(sbi, info->start_clu) && info->size) {
exfat_warn(sb, "start_clu is invalid cluster(0x%x)",
@@ -835,8 +840,8 @@ unlock:
return err;
}
-static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -846,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
loff_t size = i_size_read(dir);
if (unlikely(exfat_forced_shutdown(sb)))
- return -EIO;
+ return ERR_PTR(-EIO);
mutex_lock(&EXFAT_SB(sb)->s_lock);
exfat_set_volume_dirty(sb);
@@ -877,7 +882,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
unlock:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
static int exfat_check_dir_empty(struct super_block *sb,
@@ -885,6 +890,7 @@ static int exfat_check_dir_empty(struct super_block *sb,
{
int i, dentries_per_clu;
unsigned int type;
+ unsigned int clu_count = 0;
struct exfat_chain clu;
struct exfat_dentry *ep;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -921,6 +927,10 @@ static int exfat_check_dir_empty(struct super_block *sb,
} else {
if (exfat_get_next_cluster(sb, &(clu.dir)))
return -EIO;
+
+ /* break if the cluster chain includes a loop */
+ if (unlikely(++clu_count > EXFAT_DATA_CLUSTER_COUNT(sbi)))
+ break;
}
}
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index d47896a89596..1729bf42eb51 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -801,4 +801,5 @@ load_default:
void exfat_free_upcase_table(struct exfat_sb_info *sbi)
{
kvfree(sbi->vol_utbl);
+ sbi->vol_utbl = NULL;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index bd57844414aa..8926e63f5bb7 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -36,46 +36,18 @@ static void exfat_put_super(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
mutex_lock(&sbi->s_lock);
+ exfat_clear_volume_dirty(sb);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
}
-static int exfat_sync_fs(struct super_block *sb, int wait)
-{
- struct exfat_sb_info *sbi = EXFAT_SB(sb);
- int err = 0;
-
- if (unlikely(exfat_forced_shutdown(sb)))
- return 0;
-
- if (!wait)
- return 0;
-
- /* If there are some dirty buffers in the bdev inode */
- mutex_lock(&sbi->s_lock);
- sync_blockdev(sb->s_bdev);
- if (exfat_clear_volume_dirty(sb))
- err = -EIO;
- mutex_unlock(&sbi->s_lock);
- return err;
-}
-
static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev);
- if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) {
- mutex_lock(&sbi->s_lock);
- if (exfat_count_used_clusters(sb, &sbi->used_clusters)) {
- mutex_unlock(&sbi->s_lock);
- return -EIO;
- }
- mutex_unlock(&sbi->s_lock);
- }
-
buf->f_type = sb->s_magic;
buf->f_bsize = sbi->cluster_size;
buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */
@@ -228,7 +200,6 @@ static const struct super_operations exfat_sops = {
.write_inode = exfat_write_inode,
.evict_inode = exfat_evict_inode,
.put_super = exfat_put_super,
- .sync_fs = exfat_sync_fs,
.statfs = exfat_statfs,
.show_options = exfat_show_options,
.shutdown = exfat_shutdown,
@@ -370,13 +341,12 @@ static void exfat_hash_init(struct super_block *sb)
INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
}
-static int exfat_read_root(struct inode *inode)
+static int exfat_read_root(struct inode *inode, struct exfat_chain *root_clu)
{
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *ei = EXFAT_I(inode);
- struct exfat_chain cdir;
- int num_subdirs, num_clu = 0;
+ int num_subdirs;
exfat_chain_set(&ei->dir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
ei->entry = -1;
@@ -389,12 +359,9 @@ static int exfat_read_root(struct inode *inode)
ei->hint_stat.clu = sbi->root_dir;
ei->hint_femp.eidx = EXFAT_HINT_NONE;
- exfat_chain_set(&cdir, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
- if (exfat_count_num_clusters(sb, &cdir, &num_clu))
- return -EIO;
- i_size_write(inode, num_clu << sbi->cluster_size_bits);
+ i_size_write(inode, EXFAT_CLU_TO_B(root_clu->size, sbi));
- num_subdirs = exfat_count_dir_entries(sb, &cdir);
+ num_subdirs = exfat_count_dir_entries(sb, root_clu);
if (num_subdirs < 0)
return -EIO;
set_nlink(inode, num_subdirs + EXFAT_MIN_SUBDIR);
@@ -531,7 +498,6 @@ static int exfat_read_boot_sector(struct super_block *sb)
sbi->vol_flags = le16_to_cpu(p_boot->vol_flags);
sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE);
sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER;
- sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED;
/* check consistencies */
if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits <
@@ -608,7 +574,8 @@ static int exfat_verify_boot_region(struct super_block *sb)
}
/* mount the file system volume */
-static int __exfat_fill_super(struct super_block *sb)
+static int __exfat_fill_super(struct super_block *sb,
+ struct exfat_chain *root_clu)
{
int ret;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
@@ -625,6 +592,18 @@ static int __exfat_fill_super(struct super_block *sb)
goto free_bh;
}
+ /*
+ * Call exfat_count_num_cluster() before searching for up-case and
+ * bitmap directory entries to avoid infinite loop if they are missing
+ * and the cluster chain includes a loop.
+ */
+ exfat_chain_set(root_clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN);
+ ret = exfat_count_num_clusters(sb, root_clu, &root_clu->size);
+ if (ret) {
+ exfat_err(sb, "failed to count the number of clusters in root");
+ goto free_bh;
+ }
+
ret = exfat_create_upcase_table(sb);
if (ret) {
exfat_err(sb, "failed to load upcase table");
@@ -657,6 +636,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
struct exfat_sb_info *sbi = sb->s_fs_info;
struct exfat_mount_options *opts = &sbi->options;
struct inode *root_inode;
+ struct exfat_chain root_clu;
int err;
if (opts->allow_utime == (unsigned short)-1)
@@ -675,7 +655,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_time_min = EXFAT_MIN_TIMESTAMP_SECS;
sb->s_time_max = EXFAT_MAX_TIMESTAMP_SECS;
- err = __exfat_fill_super(sb);
+ err = __exfat_fill_super(sb, &root_clu);
if (err) {
exfat_err(sb, "failed to recognize exfat type");
goto check_nls_io;
@@ -697,9 +677,9 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
}
if (sbi->options.utf8)
- sb->s_d_op = &exfat_utf8_dentry_ops;
+ set_default_d_op(sb, &exfat_utf8_dentry_ops);
else
- sb->s_d_op = &exfat_dentry_ops;
+ set_default_d_op(sb, &exfat_dentry_ops);
root_inode = new_inode(sb);
if (!root_inode) {
@@ -710,7 +690,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
root_inode->i_ino = EXFAT_ROOT_INO;
inode_set_iversion(root_inode, 1);
- err = exfat_read_root(root_inode);
+ err = exfat_read_root(root_inode, &root_clu);
if (err) {
exfat_err(sb, "failed to initialize root inode");
goto put_inode;
@@ -761,10 +741,14 @@ static void exfat_free(struct fs_context *fc)
static int exfat_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
fc->sb_flags |= SB_NODIRATIME;
- /* volume flag will be updated in exfat_sync_fs */
- sync_filesystem(fc->root->d_sb);
+ sync_filesystem(sb);
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_clear_volume_dirty(sb);
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+
return 0;
}
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0c899cfba578..d3e55de4a2a2 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
int err;
parent = ERR_PTR(-EACCES);
- inode_lock(dentry->d_inode);
if (mnt->mnt_sb->s_export_op->get_parent)
parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
- inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
dprintk("get_parent of %lu failed, err %ld\n",
@@ -145,7 +143,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
if (err)
goto out_err;
dprintk("%s: found name: %s\n", __func__, nbuf);
- tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf));
+ tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent);
if (IS_ERR(tmp)) {
dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
err = PTR_ERR(tmp);
@@ -286,6 +284,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
};
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
+ .ctx.count = INT_MAX,
.name = name,
};
@@ -550,16 +549,13 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
goto err_result;
}
- inode_lock(target_dir->d_inode);
- nresult = lookup_one(mnt_idmap(mnt), nbuf,
- target_dir, strlen(nbuf));
+ nresult = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), target_dir);
if (!IS_ERR(nresult)) {
if (unlikely(nresult->d_inode != result->d_inode)) {
dput(nresult);
nresult = ERR_PTR(-ESTALE);
}
}
- inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
@@ -610,4 +606,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
}
EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+MODULE_DESCRIPTION("Code mapping from inodes to file handles");
MODULE_LICENSE("GPL");
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 402fecf90a44..b07b3b369710 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -87,7 +87,7 @@ static void ext2_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index f38bdd46e4f7..cf97b76e9fd3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -368,6 +368,7 @@ struct ext2_inode {
#define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */
#define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */
#define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */
+#define EXT2_MOUNT_ERRORS_MASK 0x000070
#define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */
#define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */
#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
@@ -749,9 +750,9 @@ extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
/* ioctl.c */
-extern int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern int ext2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
extern long ext2_ioctl(struct file *, unsigned int, unsigned long);
extern long ext2_compat_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 10b061ac5bc0..76bddce462fc 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -122,17 +122,19 @@ static const struct vm_operations_struct ext2_dax_vm_ops = {
.pfn_mkwrite = ext2_dax_fault,
};
-static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext2_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
if (!IS_DAX(file_inode(file)))
- return generic_file_mmap(file, vma);
+ return generic_file_mmap_prepare(desc);
file_accessed(file);
- vma->vm_ops = &ext2_dax_vm_ops;
+ desc->vm_ops = &ext2_dax_vm_ops;
return 0;
}
#else
-#define ext2_file_mmap generic_file_mmap
+#define ext2_file_mmap_prepare generic_file_mmap_prepare
#endif
/*
@@ -316,7 +318,7 @@ const struct file_operations ext2_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = ext2_file_mmap,
+ .mmap_prepare = ext2_file_mmap_prepare,
.open = ext2_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 30f8201c155f..e10c376843d7 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -895,9 +895,19 @@ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret;
+ loff_t i_size;
inode_lock(inode);
- len = min_t(u64, len, i_size_read(inode));
+ i_size = i_size_read(inode);
+ /*
+ * iomap_fiemap() returns EINVAL for 0 length. Make sure we don't trim
+ * length to 0 but still trim the range as much as possible since
+ * ext2_get_blocks() iterates unmapped space block by block which is
+ * slow.
+ */
+ if (i_size == 0)
+ i_size = 1;
+ len = min_t(u64, len, i_size);
ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops);
inode_unlock(inode);
@@ -915,7 +925,7 @@ static void ext2_readahead(struct readahead_control *rac)
}
static int
-ext2_write_begin(struct file *file, struct address_space *mapping,
+ext2_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
@@ -926,13 +936,14 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int ext2_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int ext2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ext2_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 44e04484e570..c3fea55b8efa 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -18,7 +18,7 @@
#include <linux/uaccess.h>
#include <linux/fileattr.h>
-int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct ext2_inode_info *ei = EXT2_I(d_inode(dentry));
@@ -28,7 +28,7 @@ int ext2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ext2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct ext2_inode_info *ei = EXT2_I(inode);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 8346ab9534c1..bde617a66cec 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -225,15 +225,16 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
return err;
}
-static int ext2_mkdir(struct mnt_idmap * idmap,
- struct inode * dir, struct dentry * dentry, umode_t mode)
+static struct dentry *ext2_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir, struct dentry * dentry,
+ umode_t mode)
{
struct inode * inode;
int err;
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
inode_inc_link_count(dir);
@@ -258,7 +259,7 @@ static int ext2_mkdir(struct mnt_idmap * idmap,
d_instantiate_new(dentry, inode);
out:
- return err;
+ return ERR_PTR(err);
out_fail:
inode_dec_link_count(inode);
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 37f7ce56adce..121e634c792a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -23,7 +23,8 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/blkdev.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/random.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
@@ -40,7 +41,6 @@
#include "acl.h"
static void ext2_write_super(struct super_block *sb);
-static int ext2_remount (struct super_block * sb, int * flags, char * data);
static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
static int ext2_sync_fs(struct super_block *sb, int wait);
static int ext2_freeze(struct super_block *sb);
@@ -81,6 +81,33 @@ void ext2_error(struct super_block *sb, const char *function,
}
}
+static void ext2_msg_fc(struct fs_context *fc, const char *prefix,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ const char *s_id;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ s_id = fc->root->d_sb->s_id;
+ } else {
+ /* get last path component of source */
+ s_id = strrchr(fc->source, '/');
+ if (s_id)
+ s_id++;
+ else
+ s_id = fc->source;
+ }
+ va_start(args, fmt);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ printk("%sEXT2-fs (%s): %pV\n", prefix, s_id, &vaf);
+
+ va_end(args);
+}
+
void ext2_msg(struct super_block *sb, const char *prefix,
const char *fmt, ...)
{
@@ -346,7 +373,6 @@ static const struct super_operations ext2_sops = {
.freeze_fs = ext2_freeze,
.unfreeze_fs = ext2_unfreeze,
.statfs = ext2_statfs,
- .remount_fs = ext2_remount,
.show_options = ext2_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext2_quota_read,
@@ -402,230 +428,218 @@ static const struct export_operations ext2_export_ops = {
.get_parent = ext2_get_parent,
};
-static unsigned long get_sb_block(void **data)
-{
- unsigned long sb_block;
- char *options = (char *) *data;
-
- if (!options || strncmp(options, "sb=", 3) != 0)
- return 1; /* Default location */
- options += 3;
- sb_block = simple_strtoul(options, &options, 0);
- if (*options && *options != ',') {
- printk("EXT2-fs: Invalid sb specification: %s\n",
- (char *) *data);
- return 1;
- }
- if (*options == ',')
- options++;
- *data = (void *) options;
- return sb_block;
-}
-
enum {
- Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
- Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
- Opt_err_ro, Opt_nouid32, Opt_debug,
- Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
- Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
- Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
+ Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid,
+ Opt_sb, Opt_errors, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
+ Opt_nobh, Opt_user_xattr, Opt_acl, Opt_xip, Opt_dax, Opt_ignore,
+ Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation,
};
-static const match_table_t tokens = {
- {Opt_bsd_df, "bsddf"},
- {Opt_minix_df, "minixdf"},
- {Opt_grpid, "grpid"},
- {Opt_grpid, "bsdgroups"},
- {Opt_nogrpid, "nogrpid"},
- {Opt_nogrpid, "sysvgroups"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_sb, "sb=%u"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_panic, "errors=panic"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_nouid32, "nouid32"},
- {Opt_debug, "debug"},
- {Opt_oldalloc, "oldalloc"},
- {Opt_orlov, "orlov"},
- {Opt_nobh, "nobh"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_xip, "xip"},
- {Opt_dax, "dax"},
- {Opt_grpquota, "grpquota"},
- {Opt_ignore, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_reservation, "reservation"},
- {Opt_noreservation, "noreservation"},
- {Opt_err, NULL}
+static const struct constant_table ext2_param_errors[] = {
+ {"continue", EXT2_MOUNT_ERRORS_CONT},
+ {"panic", EXT2_MOUNT_ERRORS_PANIC},
+ {"remount-ro", EXT2_MOUNT_ERRORS_RO},
+ {}
+};
+
+static const struct fs_parameter_spec ext2_param_spec[] = {
+ fsparam_flag ("bsddf", Opt_bsd_df),
+ fsparam_flag ("minixdf", Opt_minix_df),
+ fsparam_flag ("grpid", Opt_grpid),
+ fsparam_flag ("bsdgroups", Opt_grpid),
+ fsparam_flag ("nogrpid", Opt_nogrpid),
+ fsparam_flag ("sysvgroups", Opt_nogrpid),
+ fsparam_gid ("resgid", Opt_resgid),
+ fsparam_uid ("resuid", Opt_resuid),
+ fsparam_u32 ("sb", Opt_sb),
+ fsparam_enum ("errors", Opt_errors, ext2_param_errors),
+ fsparam_flag ("nouid32", Opt_nouid32),
+ fsparam_flag ("debug", Opt_debug),
+ fsparam_flag ("oldalloc", Opt_oldalloc),
+ fsparam_flag ("orlov", Opt_orlov),
+ fsparam_flag ("nobh", Opt_nobh),
+ fsparam_flag_no ("user_xattr", Opt_user_xattr),
+ fsparam_flag_no ("acl", Opt_acl),
+ fsparam_flag ("xip", Opt_xip),
+ fsparam_flag ("dax", Opt_dax),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag_no ("reservation", Opt_reservation),
+ {}
+};
+
+#define EXT2_SPEC_s_resuid (1 << 0)
+#define EXT2_SPEC_s_resgid (1 << 1)
+
+struct ext2_fs_context {
+ unsigned long vals_s_flags; /* Bits to set in s_flags */
+ unsigned long mask_s_flags; /* Bits changed in s_flags */
+ unsigned int vals_s_mount_opt;
+ unsigned int mask_s_mount_opt;
+ kuid_t s_resuid;
+ kgid_t s_resgid;
+ unsigned long s_sb_block;
+ unsigned int spec;
+
};
-static int parse_options(char *options, struct super_block *sb,
- struct ext2_mount_options *opts)
+static inline void ctx_set_mount_opt(struct ext2_fs_context *ctx,
+ unsigned long flag)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- kuid_t uid;
- kgid_t gid;
-
- if (!options)
- return 1;
-
- while ((p = strsep (&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bsd_df:
- clear_opt (opts->s_mount_opt, MINIX_DF);
- break;
- case Opt_minix_df:
- set_opt (opts->s_mount_opt, MINIX_DF);
- break;
- case Opt_grpid:
- set_opt (opts->s_mount_opt, GRPID);
- break;
- case Opt_nogrpid:
- clear_opt (opts->s_mount_opt, GRPID);
- break;
- case Opt_resuid:
- if (match_int(&args[0], &option))
- return 0;
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid)) {
- ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option);
- return 0;
-
- }
- opts->s_resuid = uid;
- break;
- case Opt_resgid:
- if (match_int(&args[0], &option))
- return 0;
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid)) {
- ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option);
- return 0;
- }
- opts->s_resgid = gid;
- break;
- case Opt_sb:
- /* handled by get_sb_block() instead of here */
- /* *sb_block = match_int(&args[0]); */
- break;
- case Opt_err_panic:
- clear_opt (opts->s_mount_opt, ERRORS_CONT);
- clear_opt (opts->s_mount_opt, ERRORS_RO);
- set_opt (opts->s_mount_opt, ERRORS_PANIC);
- break;
- case Opt_err_ro:
- clear_opt (opts->s_mount_opt, ERRORS_CONT);
- clear_opt (opts->s_mount_opt, ERRORS_PANIC);
- set_opt (opts->s_mount_opt, ERRORS_RO);
- break;
- case Opt_err_cont:
- clear_opt (opts->s_mount_opt, ERRORS_RO);
- clear_opt (opts->s_mount_opt, ERRORS_PANIC);
- set_opt (opts->s_mount_opt, ERRORS_CONT);
- break;
- case Opt_nouid32:
- set_opt (opts->s_mount_opt, NO_UID32);
- break;
- case Opt_debug:
- set_opt (opts->s_mount_opt, DEBUG);
- break;
- case Opt_oldalloc:
- set_opt (opts->s_mount_opt, OLDALLOC);
- break;
- case Opt_orlov:
- clear_opt (opts->s_mount_opt, OLDALLOC);
- break;
- case Opt_nobh:
- ext2_msg(sb, KERN_INFO,
- "nobh option not supported");
- break;
+ ctx->mask_s_mount_opt |= flag;
+ ctx->vals_s_mount_opt |= flag;
+}
+
+static inline void ctx_clear_mount_opt(struct ext2_fs_context *ctx,
+ unsigned long flag)
+{
+ ctx->mask_s_mount_opt |= flag;
+ ctx->vals_s_mount_opt &= ~flag;
+}
+
+static inline unsigned long
+ctx_test_mount_opt(struct ext2_fs_context *ctx, unsigned long flag)
+{
+ return (ctx->vals_s_mount_opt & flag);
+}
+
+static inline bool
+ctx_parsed_mount_opt(struct ext2_fs_context *ctx, unsigned long flag)
+{
+ return (ctx->mask_s_mount_opt & flag);
+}
+
+static void ext2_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ int opt;
+ struct fs_parse_result result;
+
+ opt = fs_parse(fc, ext2_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_bsd_df:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_MINIX_DF);
+ break;
+ case Opt_minix_df:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_MINIX_DF);
+ break;
+ case Opt_grpid:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPID);
+ break;
+ case Opt_nogrpid:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_GRPID);
+ break;
+ case Opt_resuid:
+ ctx->s_resuid = result.uid;
+ ctx->spec |= EXT2_SPEC_s_resuid;
+ break;
+ case Opt_resgid:
+ ctx->s_resgid = result.gid;
+ ctx->spec |= EXT2_SPEC_s_resgid;
+ break;
+ case Opt_sb:
+ /* Note that this is silently ignored on remount */
+ ctx->s_sb_block = result.uint_32;
+ break;
+ case Opt_errors:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK);
+ ctx_set_mount_opt(ctx, result.uint_32);
+ break;
+ case Opt_nouid32:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_NO_UID32);
+ break;
+ case Opt_debug:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_DEBUG);
+ break;
+ case Opt_oldalloc:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_OLDALLOC);
+ break;
+ case Opt_orlov:
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_OLDALLOC);
+ break;
+ case Opt_nobh:
+ ext2_msg_fc(fc, KERN_INFO, "nobh option not supported\n");
+ break;
#ifdef CONFIG_EXT2_FS_XATTR
- case Opt_user_xattr:
- set_opt (opts->s_mount_opt, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt (opts->s_mount_opt, XATTR_USER);
- break;
+ case Opt_user_xattr:
+ if (!result.negated)
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_XATTR_USER);
+ else
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_XATTR_USER);
+ break;
#else
- case Opt_user_xattr:
- case Opt_nouser_xattr:
- ext2_msg(sb, KERN_INFO, "(no)user_xattr options"
- "not supported");
- break;
+ case Opt_user_xattr:
+ ext2_msg_fc(fc, KERN_INFO, "(no)user_xattr options not supported");
+ break;
#endif
#ifdef CONFIG_EXT2_FS_POSIX_ACL
- case Opt_acl:
- set_opt(opts->s_mount_opt, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(opts->s_mount_opt, POSIX_ACL);
- break;
+ case Opt_acl:
+ if (!result.negated)
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL);
+ else
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL);
+ break;
#else
- case Opt_acl:
- case Opt_noacl:
- ext2_msg(sb, KERN_INFO,
- "(no)acl options not supported");
- break;
+ case Opt_acl:
+ ext2_msg_fc(fc, KERN_INFO, "(no)acl options not supported");
+ break;
#endif
- case Opt_xip:
- ext2_msg(sb, KERN_INFO, "use dax instead of xip");
- set_opt(opts->s_mount_opt, XIP);
- fallthrough;
- case Opt_dax:
+ case Opt_xip:
+ ext2_msg_fc(fc, KERN_INFO, "use dax instead of xip");
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_XIP);
+ fallthrough;
+ case Opt_dax:
#ifdef CONFIG_FS_DAX
- ext2_msg(sb, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
- set_opt(opts->s_mount_opt, DAX);
+ ext2_msg_fc(fc, KERN_WARNING,
+ "DAX enabled. Warning: DAX support in ext2 driver is deprecated"
+ " and will be removed at the end of 2025. Please use ext4 driver instead.");
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX);
#else
- ext2_msg(sb, KERN_INFO, "dax option not supported");
+ ext2_msg_fc(fc, KERN_INFO, "dax option not supported");
#endif
- break;
+ break;
#if defined(CONFIG_QUOTA)
- case Opt_quota:
- case Opt_usrquota:
- set_opt(opts->s_mount_opt, USRQUOTA);
- break;
-
- case Opt_grpquota:
- set_opt(opts->s_mount_opt, GRPQUOTA);
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_USRQUOTA);
+ break;
+
+ case Opt_grpquota:
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPQUOTA);
+ break;
#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- ext2_msg(sb, KERN_INFO,
- "quota operations not supported");
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ case Opt_grpquota:
+ ext2_msg_fc(fc, KERN_INFO, "quota operations not supported");
+ break;
#endif
-
- case Opt_reservation:
- set_opt(opts->s_mount_opt, RESERVATION);
- ext2_msg(sb, KERN_INFO, "reservations ON");
- break;
- case Opt_noreservation:
- clear_opt(opts->s_mount_opt, RESERVATION);
- ext2_msg(sb, KERN_INFO, "reservations OFF");
- break;
- case Opt_ignore:
- break;
- default:
- return 0;
+ case Opt_reservation:
+ if (!result.negated) {
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION);
+ ext2_msg_fc(fc, KERN_INFO, "reservations ON");
+ } else {
+ ctx_clear_mount_opt(ctx, EXT2_MOUNT_RESERVATION);
+ ext2_msg_fc(fc, KERN_INFO, "reservations OFF");
}
+ break;
+ case Opt_ignore:
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int ext2_setup_super (struct super_block * sb,
@@ -801,24 +815,83 @@ static unsigned long descriptor_loc(struct super_block *sb,
return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg);
}
-static int ext2_fill_super(struct super_block *sb, void *data, int silent)
+/*
+ * Set all mount options either from defaults on disk, or from parsed
+ * options. Parsed/specified options override on-disk defaults.
+ */
+static void ext2_set_options(struct fs_context *fc, struct ext2_sb_info *sbi)
+{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ struct ext2_super_block *es = sbi->s_es;
+ unsigned long def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
+
+ /* Copy parsed mount options to sbi */
+ sbi->s_mount_opt = ctx->vals_s_mount_opt;
+
+ /* Use in-superblock defaults only if not specified during parsing */
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_DEBUG) &&
+ def_mount_opts & EXT2_DEFM_DEBUG)
+ set_opt(sbi->s_mount_opt, DEBUG);
+
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_GRPID) &&
+ def_mount_opts & EXT2_DEFM_BSDGROUPS)
+ set_opt(sbi->s_mount_opt, GRPID);
+
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_NO_UID32) &&
+ def_mount_opts & EXT2_DEFM_UID16)
+ set_opt(sbi->s_mount_opt, NO_UID32);
+
+#ifdef CONFIG_EXT2_FS_XATTR
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_XATTR_USER) &&
+ def_mount_opts & EXT2_DEFM_XATTR_USER)
+ set_opt(sbi->s_mount_opt, XATTR_USER);
+#endif
+#ifdef CONFIG_EXT2_FS_POSIX_ACL
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL) &&
+ def_mount_opts & EXT2_DEFM_ACL)
+ set_opt(sbi->s_mount_opt, POSIX_ACL);
+#endif
+
+ if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK)) {
+ if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC)
+ set_opt(sbi->s_mount_opt, ERRORS_PANIC);
+ else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE)
+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
+ else
+ set_opt(sbi->s_mount_opt, ERRORS_RO);
+ }
+
+ if (ctx->spec & EXT2_SPEC_s_resuid)
+ sbi->s_resuid = ctx->s_resuid;
+ else
+ sbi->s_resuid = make_kuid(&init_user_ns,
+ le16_to_cpu(es->s_def_resuid));
+
+ if (ctx->spec & EXT2_SPEC_s_resgid)
+ sbi->s_resgid = ctx->s_resgid;
+ else
+ sbi->s_resgid = make_kgid(&init_user_ns,
+ le16_to_cpu(es->s_def_resgid));
+}
+
+static int ext2_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct buffer_head * bh;
struct ext2_sb_info * sbi;
struct ext2_super_block * es;
struct inode *root;
unsigned long block;
- unsigned long sb_block = get_sb_block(&data);
+ unsigned long sb_block = ctx->s_sb_block;
unsigned long logic_sb_block;
unsigned long offset = 0;
- unsigned long def_mount_opts;
long ret = -ENOMEM;
int blocksize = BLOCK_SIZE;
int db_count;
int i, j;
__le32 features;
int err;
- struct ext2_mount_options opts;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
@@ -877,42 +950,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_magic != EXT2_SUPER_MAGIC)
goto cantfind_ext2;
- opts.s_mount_opt = 0;
- /* Set defaults before we parse the mount options */
- def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
- if (def_mount_opts & EXT2_DEFM_DEBUG)
- set_opt(opts.s_mount_opt, DEBUG);
- if (def_mount_opts & EXT2_DEFM_BSDGROUPS)
- set_opt(opts.s_mount_opt, GRPID);
- if (def_mount_opts & EXT2_DEFM_UID16)
- set_opt(opts.s_mount_opt, NO_UID32);
-#ifdef CONFIG_EXT2_FS_XATTR
- if (def_mount_opts & EXT2_DEFM_XATTR_USER)
- set_opt(opts.s_mount_opt, XATTR_USER);
-#endif
-#ifdef CONFIG_EXT2_FS_POSIX_ACL
- if (def_mount_opts & EXT2_DEFM_ACL)
- set_opt(opts.s_mount_opt, POSIX_ACL);
-#endif
-
- if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC)
- set_opt(opts.s_mount_opt, ERRORS_PANIC);
- else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE)
- set_opt(opts.s_mount_opt, ERRORS_CONT);
- else
- set_opt(opts.s_mount_opt, ERRORS_RO);
-
- opts.s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid));
- opts.s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid));
-
- set_opt(opts.s_mount_opt, RESERVATION);
-
- if (!parse_options((char *) data, sb, &opts))
- goto failed_mount;
-
- sbi->s_mount_opt = opts.s_mount_opt;
- sbi->s_resuid = opts.s_resuid;
- sbi->s_resgid = opts.s_resgid;
+ ext2_set_options(fc, sbi);
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -1324,23 +1362,21 @@ static void ext2_write_super(struct super_block *sb)
ext2_sync_fs(sb, 1);
}
-static int ext2_remount (struct super_block * sb, int * flags, char * data)
+static int ext2_reconfigure(struct fs_context *fc)
{
+ struct ext2_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_super_block * es;
struct ext2_mount_options new_opts;
+ int flags = fc->sb_flags;
int err;
sync_filesystem(sb);
- spin_lock(&sbi->s_lock);
- new_opts.s_mount_opt = sbi->s_mount_opt;
- new_opts.s_resuid = sbi->s_resuid;
- new_opts.s_resgid = sbi->s_resgid;
- spin_unlock(&sbi->s_lock);
-
- if (!parse_options(data, sb, &new_opts))
- return -EINVAL;
+ new_opts.s_mount_opt = ctx->vals_s_mount_opt;
+ new_opts.s_resuid = ctx->s_resuid;
+ new_opts.s_resgid = ctx->s_resgid;
spin_lock(&sbi->s_lock);
es = sbi->s_es;
@@ -1349,9 +1385,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
"dax flag with busy inodes while remounting");
new_opts.s_mount_opt ^= EXT2_MOUNT_DAX;
}
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(flags & SB_RDONLY) == sb_rdonly(sb))
goto out_set;
- if (*flags & SB_RDONLY) {
+ if (flags & SB_RDONLY) {
if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
!(sbi->s_mount_state & EXT2_VALID_FS))
goto out_set;
@@ -1470,10 +1506,9 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
return 0;
}
-static struct dentry *ext2_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int ext2_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+ return get_tree_bdev(fc, ext2_fill_super);
}
#ifdef CONFIG_QUOTA
@@ -1556,7 +1591,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
unlock_buffer(bh);
@@ -1624,12 +1659,49 @@ out:
#endif
+static const struct fs_context_operations ext2_context_ops = {
+ .parse_param = ext2_parse_param,
+ .get_tree = ext2_get_tree,
+ .reconfigure = ext2_reconfigure,
+ .free = ext2_free_fc,
+};
+
+static int ext2_init_fs_context(struct fs_context *fc)
+{
+ struct ext2_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+ spin_lock(&sbi->s_lock);
+ ctx->vals_s_mount_opt = sbi->s_mount_opt;
+ ctx->vals_s_flags = sb->s_flags;
+ ctx->s_resuid = sbi->s_resuid;
+ ctx->s_resgid = sbi->s_resgid;
+ spin_unlock(&sbi->s_lock);
+ } else {
+ ctx->s_sb_block = 1;
+ ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION);
+ }
+
+ fc->fs_private = ctx;
+ fc->ops = &ext2_context_ops;
+
+ return 0;
+}
+
static struct file_system_type ext2_fs_type = {
.owner = THIS_MODULE,
.name = "ext2",
- .mount = ext2_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = ext2_init_fs_context,
+ .parameters = ext2_param_spec,
};
MODULE_ALIAS_FS("ext2");
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 8042ad873808..c9329ed5c094 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
- capable(CAP_SYS_RESOURCE) ||
- (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
+ (flags & EXT4_MB_USE_ROOT_BLOCKS) ||
+ capable(CAP_SYS_RESOURCE)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
@@ -703,7 +703,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
* possible we just missed a transaction commit that did so
*/
smp_mb();
- if (sbi->s_mb_free_pending == 0) {
+ if (atomic_read(&sbi->s_mb_free_pending) == 0) {
if (test_opt(sb, DISCARD)) {
atomic_inc(&sbi->s_retry_alloc_pending);
flush_work(&sbi->s_discard_work);
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 2a135075468d..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -25,12 +25,12 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
@@ -48,11 +48,11 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
@@ -67,11 +67,11 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
@@ -89,10 +89,10 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 87ee3a17bd29..e8c5525afc67 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ if (journal && inode == journal->j_inode)
return 0;
while (bref < p+max) {
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 02d47a64e8d1..d4164c507a90 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
dir->i_sb->s_blocksize);
const int next_offset = ((char *) de - buf) + rlen;
bool fake = is_fake_dir_entry(de);
- bool has_csum = ext4_has_metadata_csum(dir->i_sb);
+ bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
@@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
+ else if (unlikely(next_offset == size && de->name_len == 1 &&
+ de->name[0] == '.'))
+ error_msg = "'.' directory cannot be the last in data block";
else
return 0;
@@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return err;
/* Can we just clear INDEX flag to ignore htree information? */
- if (!ext4_has_metadata_csum(sb)) {
+ if (!ext4_has_feature_metadata_csum(sb)) {
/*
* We don't set the inode dirty flag since it's not
* critical that it gets flushed back to the disk.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4e7de7eaa374..01a6e2de7fc3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -157,7 +157,7 @@ enum criteria {
/*
* Reads each block group sequentially, performing disk IO if
- * necessary, to find find_suitable block group. Tries to
+ * necessary, to find suitable block group. Tries to
* allocate goal length but might trim the request if nothing
* is found after enough tries.
*/
@@ -185,14 +185,8 @@ enum criteria {
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 0x0001
-/* blocks already reserved */
-#define EXT4_MB_HINT_RESERVED 0x0002
-/* metadata is being allocated */
-#define EXT4_MB_HINT_METADATA 0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST 0x0008
-/* search for the best chunk */
-#define EXT4_MB_HINT_BEST 0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA 0x0020
/* don't preallocate (for tails) */
@@ -213,15 +207,6 @@ enum criteria {
#define EXT4_MB_USE_RESERVED 0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
-/* Large fragment size list lookup succeeded at least once for
- * CR_POWER2_ALIGNED */
-#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_GOAL_LEN_FAST */
-#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
-/* Avg fragment size rb tree lookup succeeded at least once for
- * CR_BEST_AVAIL_LEN */
-#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -256,9 +241,19 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_DELAYED BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have a physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is last in leaf. If yes, then lookup for
+ * next in leaf block in ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_DELAYED)
+ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -278,7 +273,10 @@ struct ext4_system_blocks {
/*
* Flags for ext4_io_end->flags
*/
-#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_UNWRITTEN 0x0001
+#define EXT4_IO_END_FAILED 0x0002
+
+#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)
struct ext4_io_end_vec {
struct list_head list; /* list of io_end_vec */
@@ -367,6 +365,8 @@ struct ext4_io_submit {
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \
blkbits))
+#define EXT4_B_TO_LBLK(inode, offset) \
+ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits)
@@ -701,9 +701,6 @@ enum {
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
- /* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -715,11 +712,23 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
- /* Caller will submit data before dropping transaction handle. This
- * allows jbd2 to avoid submitting data before commit. */
+ /* Caller is in the context of data submission, such as writeback,
+ * fsync, etc. Especially, in the generic writeback path, caller will
+ * submit data before dropping transaction handle. This allows jbd2
+ * to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+ EXT4_GET_BLOCKS_IO_SUBMIT)
/* Caller is in the atomic contex, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/*
+ * Atomic write caller needs this to query in the slow path of mixed mapping
+ * case, when a contiguous extent can be split across two adjacent leaf nodes.
+ * Look EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -733,6 +742,13 @@ enum {
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
#define EXT4_EX_NOFAIL 0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to filter the flags needed to
+ * pass while lookup/querying of on disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+ EXT4_EX_NOFAIL |\
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
/*
* Flags used by ext4_free_blocks
@@ -1056,15 +1072,16 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
- /* Number of ongoing updates on this inode */
- atomic_t i_fc_updates;
- atomic_t i_unwritten; /* Nr. of inflight conversions pending */
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
- /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
- struct mutex i_fc_lock;
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1097,8 +1114,6 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
- spinlock_t i_raw_lock; /* protects updates to the raw inode */
-
/*
* File creation time. Its function is same as that of
* struct timespec64 i_{a,c,m}time in the generic inode.
@@ -1141,6 +1156,7 @@ struct ext4_inode_info {
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
#endif
+ spinlock_t i_block_reservation_lock;
/* Lock protecting lists below */
spinlock_t i_completed_io_lock;
@@ -1151,8 +1167,6 @@ struct ext4_inode_info {
struct list_head i_rsv_conversion_list;
struct work_struct i_rsv_conversion_work;
- spinlock_t i_block_reservation_lock;
-
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
@@ -1579,16 +1593,14 @@ struct ext4_sb_info {
unsigned short *s_mb_offsets;
unsigned int *s_mb_maxs;
unsigned int s_group_info_size;
- unsigned int s_mb_free_pending;
+ atomic_t s_mb_free_pending;
struct list_head s_freed_data_list[2]; /* List of blocks to be freed
after commit completed */
struct list_head s_discard_list;
struct work_struct s_discard_work;
atomic_t s_retry_alloc_pending;
- struct list_head *s_mb_avg_fragment_size;
- rwlock_t *s_mb_avg_fragment_size_locks;
- struct list_head *s_mb_largest_free_orders;
- rwlock_t *s_mb_largest_free_orders_locks;
+ struct xarray *s_mb_avg_fragment_size;
+ struct xarray *s_mb_largest_free_orders;
/* tunables */
unsigned long s_stripe;
@@ -1600,12 +1612,15 @@ struct ext4_sb_info {
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
- /* where last allocation was done - for stream allocation */
- unsigned long s_mb_last_group;
- unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
unsigned int s_mb_best_avail_max_trim_order;
+ unsigned int s_sb_update_sec;
+ unsigned int s_sb_update_kb;
+
+ /* where last allocation was done - for stream allocation */
+ ext4_group_t *s_mb_last_groups;
+ unsigned int s_mb_nr_global_goals;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
@@ -1615,12 +1630,10 @@ struct ext4_sb_info {
atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */
atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- atomic_t s_bal_p2_aligned_bad_suggestions;
- atomic_t s_bal_goal_fast_bad_suggestions;
- atomic_t s_bal_best_avail_bad_suggestions;
atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
@@ -1749,7 +1762,7 @@ struct ext4_sb_info {
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*/
- spinlock_t s_fc_lock;
+ struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
tid_t s_fc_ineligible_tid;
@@ -1821,7 +1834,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
*/
enum {
EXT4_MF_MNTDIR_SAMPLED,
- EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */
+ EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */
+ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */
};
static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
@@ -1907,6 +1921,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
@@ -2232,15 +2247,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly);
/*
* Superblock flags
*/
-#define EXT4_FLAGS_RESIZING 0
-#define EXT4_FLAGS_SHUTDOWN 1
-#define EXT4_FLAGS_BDEV_IS_DAX 2
+enum {
+ EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */
+ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */
+ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */
+ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */
+};
static inline int ext4_forced_shutdown(struct super_block *sb)
{
return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}
+static inline int ext4_emergency_ro(struct super_block *sb)
+{
+ return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
+}
+
+static inline int ext4_emergency_state(struct super_block *sb)
+{
+ if (unlikely(ext4_forced_shutdown(sb)))
+ return -EIO;
+ if (unlikely(ext4_emergency_ro(sb)))
+ return -EROFS;
+ return 0;
+}
+
/*
* Default values for user and/or group using reserved blocks
*/
@@ -2272,10 +2304,19 @@ static inline int ext4_forced_shutdown(struct super_block *sb)
#define EXT4_DEFM_NODELALLOC 0x0800
/*
- * Default journal batch times
+ * Default journal batch times and ioprio.
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
+
+/*
+ * Default values for superblock update
+ */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
+#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */
+
/*
* Minimum number of groups in a flexgroup before we separate out
@@ -2457,8 +2498,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_SIPHASH 6
#define DX_HASH_LAST DX_HASH_SIPHASH
-static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length)
{
return crc32c(crc, address, length);
}
@@ -2810,8 +2850,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct ext4_dir_entry_2 *dirent,
struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
-extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de);
@@ -2893,8 +2932,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
-void ext4_fc_start_update(struct inode *inode);
-void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
@@ -2944,6 +2981,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr)
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
+void ext4_check_map_extents_env(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
@@ -2964,6 +3002,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+void ext4_set_inode_mapping_order(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3001,13 +3040,17 @@ extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
+extern int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
-extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
+extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
@@ -3019,6 +3062,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum);
+}
+
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3031,8 +3085,8 @@ extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);
@@ -3088,8 +3142,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
-extern __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es);
+extern __le32 ext4_superblock_csum(struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -3259,14 +3312,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
extern int ext4_register_li_request(struct super_block *sb,
ext4_group_t first_not_zeroed);
-static inline int ext4_has_metadata_csum(struct super_block *sb)
-{
- return ext4_has_feature_metadata_csum(sb);
-}
-
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb);
+ return ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb);
}
#define ext4_read_incompat_64bit_val(es, name) \
@@ -3351,6 +3400,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3415,8 +3471,6 @@ struct ext4_group_info {
void *bb_bitmap;
#endif
struct rw_semaphore alloc_sem;
- struct list_head bb_avg_fragment_size_node;
- struct list_head bb_largest_free_order_node;
ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block
* regions, index is order.
* bb_counters[3] = 5 means
@@ -3467,23 +3521,28 @@ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD);
}
+static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group)
+{
+ if (!spin_trylock(ext4_group_lock_ptr(sb, group)))
+ return false;
+ /*
+ * We're able to grab the lock right away, so drop the lock
+ * contention counter.
+ */
+ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
+ return true;
+}
+
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
- spinlock_t *lock = ext4_group_lock_ptr(sb, group);
- if (spin_trylock(lock))
- /*
- * We're able to grab the lock right away, so drop the
- * lock contention counter.
- */
- atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
- else {
+ if (!ext4_try_lock_group(sb, group)) {
/*
* The lock is busy, so bump the contention counter,
* and then wait on the spin lock.
*/
atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1,
EXT4_MAX_CONTENTION);
- spin_lock(lock);
+ spin_lock(ext4_group_lock_ptr(sb, group));
}
}
@@ -3538,6 +3597,7 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+extern void ext4_update_final_de(void *de_buf, int old_size, int new_size);
int ext4_readpage_inline(struct inode *inode, struct folio *folio);
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
@@ -3546,11 +3606,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct folio *folio);
-extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop,
- void **fsdata);
+extern int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da);
extern int ext4_try_add_inline_entry(handle_t *handle,
struct ext4_filename *fname,
struct inode *dir, struct inode *inode);
@@ -3597,10 +3657,10 @@ static inline int ext4_has_inline_data(struct inode *inode)
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
-extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len);
+extern int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *dir_block,
+ unsigned int parent_ino, void *inline_buf,
+ int inline_size);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
@@ -3683,6 +3743,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+ struct inode *inode, loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
@@ -3785,34 +3847,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
}
-/* For ioend & aio unwritten conversion wait queues */
-#define EXT4_WQ_HASH_SZ 37
-#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
- EXT4_WQ_HASH_SZ])
-extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);
-static inline void ext4_set_io_unwritten_flag(struct inode *inode,
- struct ext4_io_end *io_end)
+static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end)
{
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+ if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
io_end->flag |= EXT4_IO_END_UNWRITTEN;
- atomic_inc(&EXT4_I(inode)->i_unwritten);
- }
}
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
- struct inode *inode = io_end->inode;
-
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
- /* Wake up anyone waiting on unwritten extent conversion */
- if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(inode));
- }
}
extern const struct iomap_ops ext4_iomap_ops;
@@ -3835,7 +3882,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
- return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+ return S_ISREG(inode->i_mode) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 26435f3a3094..c484125d963f 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -31,13 +31,6 @@
#define CHECK_BINSEARCH__
/*
- * If EXT_STATS is defined then stats numbers are collected.
- * These number will be displayed at umount time.
- */
-#define EXT_STATS_
-
-
-/*
* ext4_inode has i_block array (60 bytes total).
* The first 12 bytes store ext4_extent_header;
* the remainder stores an array of ext4_extent.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index da4a82456383..b3e9b7bd7978 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
+ !test_opt(inode->i_sb, DELALLOC) &&
+ !mapping_large_folio_support(inode->i_mapping))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
@@ -63,12 +64,14 @@ static void ext4_put_nojournal(handle_t *handle)
*/
static int ext4_journal_check_start(struct super_block *sb)
{
+ int ret;
journal_t *journal;
might_sleep();
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
if (WARN_ON_ONCE(sb_rdonly(sb)))
return -EROFS;
@@ -244,7 +247,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
}
} else
ext4_check_bdev_write_error(sb);
- if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
@@ -331,7 +335,8 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
err);
return err;
}
- if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
+ if (trigger_type == EXT4_JTR_NONE ||
+ !ext4_has_feature_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
jbd2_journal_set_triggers(bh,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..63d17c5201b5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,90 +122,6 @@
#define EXT4_HT_EXT_CONVERT 11
#define EXT4_HT_MAX 12
-/**
- * struct ext4_journal_cb_entry - Base structure for callback information.
- *
- * This struct is a 'seed' structure for a using with your own callback
- * structs. If you are using callbacks you must allocate one of these
- * or another struct of your own definition which has this struct
- * as it's first element and pass it to ext4_journal_callback_add().
- */
-struct ext4_journal_cb_entry {
- /* list information for other callbacks attached to the same handle */
- struct list_head jce_list;
-
- /* Function to call with this callback structure */
- void (*jce_func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce, int error);
-
- /* user data goes here */
-};
-
-/**
- * ext4_journal_callback_add: add a function to call after transaction commit
- * @handle: active journal transaction handle to register callback on
- * @func: callback function to call after the transaction has committed:
- * @sb: superblock of current filesystem for transaction
- * @jce: returned journal callback data
- * @rc: journal state at commit (0 = transaction committed properly)
- * @jce: journal callback data (internal and function private data struct)
- *
- * The registered function will be called in the context of the journal thread
- * after the transaction for which the handle was created has completed.
- *
- * No locks are held when the callback function is called, so it is safe to
- * call blocking functions from within the callback, but the callback should
- * not block or run for too long, or the filesystem will be blocked waiting for
- * the next transaction to commit. No journaling functions can be used, or
- * there is a risk of deadlock.
- *
- * There is no guaranteed calling order of multiple registered callbacks on
- * the same transaction.
- */
-static inline void _ext4_journal_callback_add(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- /* Add the jce to transaction's private list */
- list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
-}
-
-static inline void ext4_journal_callback_add(handle_t *handle,
- void (*func)(struct super_block *sb,
- struct ext4_journal_cb_entry *jce,
- int rc),
- struct ext4_journal_cb_entry *jce)
-{
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- /* Add the jce to transaction's private list */
- jce->jce_func = func;
- spin_lock(&sbi->s_md_lock);
- _ext4_journal_callback_add(handle, jce);
- spin_unlock(&sbi->s_md_lock);
-}
-
-
-/**
- * ext4_journal_callback_del: delete a registered callback
- * @handle: active journal transaction handle on which callback was registered
- * @jce: registered journal callback entry to unregister
- * Return true if object was successfully removed
- */
-static inline bool ext4_journal_callback_try_del(handle_t *handle,
- struct ext4_journal_cb_entry *jce)
-{
- bool deleted;
- struct ext4_sb_info *sbi =
- EXT4_SB(handle->h_transaction->t_journal->j_private);
-
- spin_lock(&sbi->s_md_lock);
- deleted = !list_empty(&jce->jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- return deleted;
-}
-
int
ext4_mark_iloc_dirty(handle_t *handle,
struct inode *inode,
@@ -403,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
revoke_creds, 0);
}
-static inline int ext4_journal_blocks_per_page(struct inode *inode)
+static inline int ext4_journal_blocks_per_folio(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
- return jbd2_journal_blocks_per_page(inode);
+ return jbd2_journal_blocks_per_folio(inode);
return 0;
}
@@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 1;
}
+/*
+ * Pass journal explicitly as it may not be cached in the sbi->s_journal in some
+ * cases
+ */
+static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
+{
+ int err = 0;
+
+ /*
+ * At this point only two things can be operating on the journal.
+ * JBD2 thread performing transaction commit and s_sb_upd_work
+ * issuing sb update through the journal. Once we set
+ * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not
+ * queue s_sb_upd_work and ext4_force_commit() makes sure any
+ * ext4_handle_error() calls from the running transaction commit are
+ * finished. Hence no new s_sb_upd_work can be queued after we
+ * flush it here.
+ */
+ ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);
+
+ ext4_force_commit(sbi->s_sb);
+ flush_work(&sbi->s_sb_upd_work);
+
+ err = jbd2_journal_destroy(journal);
+ sbi->s_journal = NULL;
+
+ return err;
+}
+
#endif /* _EXT4_JBD2_H */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a07a98a4b97a..ca5499e9412b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
@@ -63,7 +62,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
et = find_ext4_extent_tail(eh);
@@ -77,7 +76,7 @@ static void ext4_extent_block_csum_set(struct inode *inode,
{
struct ext4_extent_tail *et;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
et = find_ext4_extent_tail(eh);
@@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
+ ext4_check_map_extents_env(inode);
+
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
@@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode,
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent *ret_ex)
+ struct ext4_extent *ret_ex, int flags)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1604,7 +1605,8 @@ got_index:
ix++;
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
+ flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1612,7 +1614,7 @@ got_index:
put_bh(bh);
}
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents;
else
- index = depth * 3;
+ index = (EXT4_MAX_EXTENT_DEPTH * 3) +
+ DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0));
return index;
}
@@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;
partial.pclu = 0;
partial.lblk = 0;
@@ -2851,8 +2856,7 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL,
- EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, end, NULL, flags);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2918,7 +2922,7 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- NULL);
+ NULL, flags);
if (err < 0)
goto out;
if (pblk) {
@@ -2994,8 +2998,7 @@ again:
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
bh = read_extent_tree_block(inode, path[i].p_idx,
- depth - i - 1,
- EXT4_EX_NOCACHE);
+ depth - i - 1, flags);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
+ &ex2, flags);
if (err < 0)
goto out;
@@ -4433,6 +4437,20 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
+ /*
+ * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag.
+ * So we know that the depth used here is correct, since there was no
+ * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
+ * If tomorrow we start using this QUERY flag with CREATE, then we will
+ * need to re-calculate the depth as it might have changed due to block
+ * allocation.
+ */
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) {
+ WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE);
+ if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+ map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
+ }
+
ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
@@ -4483,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4496,6 +4516,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
+ * Do the actual write zero during a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
+ /*
* credits to insert 1 extent into extent tree
*/
credits = ext4_chunk_trans_blocks(inode, len);
@@ -4531,9 +4562,7 @@ retry:
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ epos = (loff_t)(map.m_lblk + ret) << blkbits;
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -4553,6 +4582,21 @@ retry:
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4568,131 +4612,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
{
struct inode *inode = file_inode(file);
- struct address_space *mapping = file->f_mapping;
handle_t *handle = NULL;
- unsigned int max_blocks;
loff_t new_size = 0;
- int ret = 0;
- int flags;
- int credits;
- int partial_begin, partial_end;
- loff_t start, end;
- ext4_lblk_t lblk;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
+ unsigned int blocksize = i_blocksize(inode);
unsigned int blkbits = inode->i_blkbits;
+ int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
- /*
- * Round up offset. This is not fallocate, we need to zero out
- * blocks, so convert interior block aligned part of the range to
- * unwritten and possibly manually zero out unaligned parts of the
- * range. Here, start and partial_begin are inclusive, end and
- * partial_end are exclusive.
- */
- start = round_up(offset, 1 << blkbits);
- end = round_down((offset + len), 1 << blkbits);
-
- if (start < offset || end > offset + len)
- return -EINVAL;
- partial_begin = offset & ((1 << blkbits) - 1);
- partial_end = (offset + len) & ((1 << blkbits) - 1);
-
- lblk = start >> blkbits;
- max_blocks = (end >> blkbits);
- if (max_blocks < lblk)
- max_blocks = 0;
- else
- max_blocks -= lblk;
-
- inode_lock(inode);
-
- /*
- * Indirect files do not support unwritten extents
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
+ /* Indirect files do not support unwritten extents */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ return -EOPNOTSUPP;
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
ret = inode_newsize_ok(inode, new_size);
if (ret)
- goto out_mutex;
+ return ret;
}
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
-
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
/* Preallocate the range including the unaligned edges */
- if (partial_begin || partial_end) {
- ret = ext4_alloc_file_blocks(file,
- round_down(offset, 1 << blkbits) >> blkbits,
- (round_up((offset + len), 1 << blkbits) -
- round_down(offset, 1 << blkbits)) >> blkbits,
- new_size, flags);
- if (ret)
- goto out_mutex;
+ if (!IS_ALIGNED(offset | end, blocksize)) {
+ ext4_lblk_t alloc_lblk = offset >> blkbits;
+ ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
+ ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
+ new_size, flags);
+ if (ret)
+ return ret;
}
- /* Zero range excluding the unaligned edges */
- if (max_blocks > 0) {
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
- EXT4_EX_NOCACHE);
-
- /*
- * Prevent page faults from reinstantiating pages we have
- * released from page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
-
- ret = ext4_update_disksize_before_punch(inode, offset, len);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
+ ret = ext4_update_disksize_before_punch(inode, offset, len);
+ if (ret)
+ return ret;
- /*
- * For journalled data we need to write (and checkpoint) pages
- * before discarding page cache to avoid inconsitent data on
- * disk in case of crash before zeroing trans is committed.
- */
- if (ext4_should_journal_data(inode)) {
- ret = filemap_write_and_wait_range(mapping, start,
- end - 1);
- if (ret) {
- filemap_invalidate_unlock(mapping);
- goto out_mutex;
- }
- }
+ /* Now release the pages and zero block aligned part of pages */
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
- /* Now release the pages and zero block aligned part of pages */
- truncate_pagecache_range(inode, start, end - 1);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ /* Zero range excluding the unaligned edges */
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> blkbits;
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t zero_blks = end_lblk - start_lblk;
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
- flags);
- filemap_invalidate_unlock(mapping);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
+ ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
+ new_size, flags);
if (ret)
- goto out_mutex;
+ return ret;
}
- if (!partial_begin && !partial_end)
- goto out_mutex;
+ /* Finish zeroing out if it doesn't contain partial block */
+ if (IS_ALIGNED(offset | end, blocksize))
+ return ret;
/*
* In worst case we have to writeout two nonadjacent unwritten
@@ -4705,27 +4687,69 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
- goto out_mutex;
+ return ret;
}
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ if (ret)
+ goto out_handle;
+
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
if (unlikely(ret))
goto out_handle;
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
out_handle:
ext4_journal_stop(handle);
-out_mutex:
- inode_unlock(inode);
+ return ret;
+}
+
+static long ext4_do_fallocate(struct file *file, loff_t offset,
+ loff_t len, int mode)
+{
+ struct inode *inode = file_inode(file);
+ loff_t end = offset + len;
+ loff_t new_size = 0;
+ ext4_lblk_t start_lblk, len_lblk;
+ int ret;
+
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
+
+ /* We only support preallocation for extent-based files only. */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) {
+ new_size = end;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
+ }
+
+ ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ if (ret)
+ goto out;
+
+ if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
+ ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
+ EXT4_I(inode)->i_sync_tid);
+ }
+out:
+ trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
return ret;
}
@@ -4739,12 +4763,8 @@ out_mutex:
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- loff_t new_size = 0;
- unsigned int max_blocks;
- int ret = 0;
- int flags;
- ext4_lblk_t lblk;
- unsigned int blkbits = inode->i_blkbits;
+ struct address_space *mapping = file->f_mapping;
+ int ret;
/*
* Encrypted inodes can't handle collapse range or insert
@@ -4755,83 +4775,158 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * enable the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
inode_lock(inode);
ret = ext4_convert_inline_data(inode);
- inode_unlock(inode);
if (ret)
- goto exit;
+ goto out_inode_lock;
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(file, offset, len);
- goto exit;
- }
+ /* Wait all existing dio workers, newcomers will block on i_rwsem */
+ inode_dio_wait(inode);
- if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(file, offset, len);
- goto exit;
- }
+ ret = file_modified(file);
+ if (ret)
+ goto out_inode_lock;
- if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(file, offset, len);
- goto exit;
+ if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) {
+ ret = ext4_do_fallocate(file, offset, len, mode);
+ goto out_inode_lock;
}
- if (mode & FALLOC_FL_ZERO_RANGE) {
+ /*
+ * Follow-up operations will drop page cache, hold invalidate lock
+ * to prevent page faults from reinstantiating pages we have
+ * released from page cache.
+ */
+ filemap_invalidate_lock(mapping);
+
+ ret = ext4_break_layouts(inode);
+ if (ret)
+ goto out_invalidate_lock;
+
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
+ ret = ext4_punch_hole(file, offset, len);
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
+ ret = ext4_collapse_range(file, offset, len);
+ break;
+ case FALLOC_FL_INSERT_RANGE:
+ ret = ext4_insert_range(file, offset, len);
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
ret = ext4_zero_range(file, offset, len, mode);
- goto exit;
+ break;
+ default:
+ ret = -EOPNOTSUPP;
}
- trace_ext4_fallocate_enter(inode, offset, len, mode);
- lblk = offset >> blkbits;
+out_invalidate_lock:
+ filemap_invalidate_unlock(mapping);
+out_inode_lock:
+ inode_unlock(inode);
+ return ret;
+}
+
+/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. all unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end io call back function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We mainly only allocate
+ * unwritten extents either on a hole on a pre-exiting unwritten extent range in
+ * ext4_map_blocks_atomic_write(). The only case where we can have multiple
+ * unwritten extents in a range [offset, offset+len) is when there is a split
+ * unwritten extent between two leaf nodes which was cached in extent status
+ * cache during ext4_iomap_alloc() time. That will allow
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going
+ * into the slow path. That means we might need a loop for conversion of this
+ * unwritten extent split across leaf block within a single journal transaction.
+ * Split extents across leaf nodes is a rare case, but let's still handle that
+ * to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0, ret2 = 0, ret3 = 0;
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
+ int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+ map.m_lblk = offset >> blkbits;
max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
- flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
- inode_lock(inode);
+ if (!handle) {
+ /*
+ * TODO: An optimization can be added later by having an extent
+ * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+ * it can tell if the extent in the cache is a split extent.
+ * But for now let's assume pextents as 2 always.
+ */
+ credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+ }
- /*
- * We only support preallocation for extent-based files only
- */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- ret = -EOPNOTSUPP;
- goto out;
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- (offset + len > inode->i_size ||
- offset + len > EXT4_I(inode)->i_disksize)) {
- new_size = offset + len;
- ret = inode_newsize_ok(inode, new_size);
- if (ret)
- goto out;
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret != max_blocks)
+ ext4_msg(inode->i_sb, KERN_INFO,
+ "inode #%lu: block %u: len %u: "
+ "split block mapping found for atomic write, "
+ "ret = %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ if (ret <= 0)
+ break;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
- ret = file_modified(file);
- if (ret)
- goto out;
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
- ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
- if (ret)
- goto out;
+ if (ret <= 0 || ret2)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "returned %d or %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret, ret2);
- if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
- ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
- EXT4_I(inode)->i_sync_tid);
- }
-out:
- inode_unlock(inode);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
-exit:
- return ret;
+ return ret > 0 ? ret2 : ret;
}
/*
@@ -4873,8 +4968,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
break;
}
}
+ /*
+ * Do not cache any unrelated extents, as it does not hold the
+ * i_rwsem or invalidate_lock, which could corrupt the extent
+ * status tree.
+ */
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+ EXT4_EX_NOCACHE);
if (ret <= 0)
ext4_warning(inode->i_sb,
"inode #%lu: block %u: len %u: "
@@ -4985,12 +5086,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -5010,10 +5106,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
int error = 0;
+ inode_lock_shared(inode);
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
- return error;
+ goto unlock;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
@@ -5024,15 +5121,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
error = ext4_fiemap_check_ranges(inode, start, &len);
if (error)
- return error;
+ goto unlock;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
- return iomap_fiemap(inode, fieinfo, start, len,
- &ext4_iomap_xattr_ops);
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
}
-
- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
+unlock:
+ inode_unlock_shared(inode);
+ return error;
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5053,7 +5154,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ inode_lock_shared(inode);
error = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
if (error)
return error;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
@@ -5112,7 +5215,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
credits = depth + 2;
}
- restart_credits = ext4_writepage_trans_blocks(inode);
+ restart_credits = ext4_chunk_trans_extent(inode, 0);
err = ext4_datasem_ensure_credits(handle, inode, credits,
restart_credits, 0);
if (err) {
@@ -5332,109 +5435,74 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
- ext4_lblk_t punch_start, punch_stop;
+ loff_t end = offset + len;
+ ext4_lblk_t start_lblk, end_lblk;
handle_t *handle;
unsigned int credits;
- loff_t new_size, ioffset;
+ loff_t start, new_size;
int ret;
- /*
- * We need to test this early because xfstests assumes that a
- * collapse range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support collapse range.
- */
+ trace_ext4_collapse_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Collapse range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_collapse_range(inode, offset, len);
-
- punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
/*
* There is no need to overlap collapse range with EOF, in which case
* it is effectively a truncate operation
*/
- if (offset + len >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_mmap;
+ if (end >= inode->i_size)
+ return -EINVAL;
/*
+ * Write tail of the last page before removed range and data that
+ * will be shifted since they will get removed from the page cache
+ * below. We are also protected from pages becoming dirty by
+ * i_rwsem and invalidate_lock.
* Need to round down offset to be aligned with page size boundary
* for page size > block size.
*/
- ioffset = round_down(offset, PAGE_SIZE);
- /*
- * Write tail of the last page before removed range since it will get
- * removed from the page cache below.
- */
- ret = filemap_write_and_wait_range(mapping, ioffset, offset);
- if (ret)
- goto out_mmap;
- /*
- * Write data that will be shifted to preserve them when discarding
- * page cache below. We are also protected from pages becoming dirty
- * by i_rwsem and invalidate_lock.
- */
- ret = filemap_write_and_wait_range(mapping, offset + len,
- LLONG_MAX);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, offset);
+ if (!ret)
+ ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ return ret;
+
+ truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
+ start_lblk = offset >> inode->i_blkbits;
+ end_lblk = (offset + len) >> inode->i_blkbits;
+
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
- ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
ext4_discard_preallocations(inode);
- ret = ext4_ext_shift_extents(inode, handle, punch_stop,
- punch_stop - punch_start, SHIFT_LEFT);
+ ret = ext4_ext_shift_extents(inode, handle, end_lblk,
+ end_lblk - start_lblk, SHIFT_LEFT);
if (ret) {
up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
+ goto out_handle;
}
new_size = inode->i_size - len;
@@ -5442,18 +5510,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
EXT4_I(inode)->i_disksize = new_size;
up_write(&EXT4_I(inode)->i_data_sem);
- if (IS_SYNC(inode))
- ext4_handle_sync(handle);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
+ if (ret)
+ goto out_handle;
+
ext4_update_inode_fsync_trans(handle, inode, 1);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -5473,100 +5539,65 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
handle_t *handle;
struct ext4_ext_path *path;
struct ext4_extent *extent;
- ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
+ ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0;
unsigned int credits, ee_len;
- int ret = 0, depth, split_flag = 0;
- loff_t ioffset;
+ int ret, depth, split_flag = 0;
+ loff_t start;
- /*
- * We need to test this early because xfstests assumes that an
- * insert range of (0, 1) will return EOPNOTSUPP if the file
- * system does not support insert range.
- */
+ trace_ext4_insert_range(inode, offset, len);
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ /* Currently just for extent based files */
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return -EOPNOTSUPP;
-
/* Insert range works only on fs cluster size aligned regions. */
if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb)))
return -EINVAL;
-
- trace_ext4_insert_range(inode, offset, len);
-
- offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
- len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
-
- inode_lock(inode);
- /* Currently just for extent based files */
- if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- ret = -EOPNOTSUPP;
- goto out_mutex;
- }
-
- /* Check whether the maximum file size would be exceeded */
- if (len > inode->i_sb->s_maxbytes - inode->i_size) {
- ret = -EFBIG;
- goto out_mutex;
- }
-
/* Offset must be less than i_size */
- if (offset >= inode->i_size) {
- ret = -EINVAL;
- goto out_mutex;
- }
-
- /* Wait for existing dio to complete */
- inode_dio_wait(inode);
-
- ret = file_modified(file);
- if (ret)
- goto out_mutex;
+ if (offset >= inode->i_size)
+ return -EINVAL;
+ /* Check whether the maximum file size would be exceeded */
+ if (len > inode->i_sb->s_maxbytes - inode->i_size)
+ return -EFBIG;
/*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
+ * Write out all dirty pages. Need to round down to align start offset
+ * to page size boundary for page size > block size.
*/
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
+ start = round_down(offset, PAGE_SIZE);
+ ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX);
if (ret)
- goto out_mmap;
+ return ret;
- /*
- * Need to round down to align start offset to page size boundary
- * for page size > block size.
- */
- ioffset = round_down(offset, PAGE_SIZE);
- /* Write out all dirty pages */
- ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
- LLONG_MAX);
- if (ret)
- goto out_mmap;
- truncate_pagecache(inode, ioffset);
+ truncate_pagecache(inode, start);
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 0);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_mmap;
- }
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
/* Expand file to avoid data loss if there is error while shifting */
inode->i_size += len;
EXT4_I(inode)->i_disksize += len;
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ret = ext4_mark_inode_dirty(handle, inode);
if (ret)
- goto out_stop;
+ goto out_handle;
+
+ start_lblk = offset >> inode->i_blkbits;
+ len_lblk = len >> inode->i_blkbits;
+
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- path = ext4_find_extent(inode, offset_lblk, NULL, 0);
+ path = ext4_find_extent(inode, start_lblk, NULL, 0);
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
depth = ext_depth(inode);
@@ -5576,16 +5607,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ee_len = ext4_ext_get_actual_len(extent);
/*
- * If offset_lblk is not the starting block of extent, split
- * the extent @offset_lblk
+ * If start_lblk is not the starting block of extent, split
+ * the extent @start_lblk
*/
- if ((offset_lblk > ee_start_lblk) &&
- (offset_lblk < (ee_start_lblk + ee_len))) {
+ if ((start_lblk > ee_start_lblk) &&
+ (start_lblk < (ee_start_lblk + ee_len))) {
if (ext4_ext_is_unwritten(extent))
split_flag = EXT4_EXT_MARK_UNWRIT1 |
EXT4_EXT_MARK_UNWRIT2;
path = ext4_split_extent_at(handle, inode, path,
- offset_lblk, split_flag,
+ start_lblk, split_flag,
EXT4_EX_NOCACHE |
EXT4_GET_BLOCKS_PRE_IO |
EXT4_GET_BLOCKS_METADATA_NOFAIL);
@@ -5594,32 +5625,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
if (IS_ERR(path)) {
up_write(&EXT4_I(inode)->i_data_sem);
ret = PTR_ERR(path);
- goto out_stop;
+ goto out_handle;
}
}
ext4_free_ext_path(path);
- ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
+ ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
/*
- * if offset_lblk lies in a hole which is at start of file, use
+ * if start_lblk lies in a hole which is at start of file, use
* ee_start_lblk to shift extents
*/
ret = ext4_ext_shift_extents(inode, handle,
- max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT);
-
+ max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT);
up_write(&EXT4_I(inode)->i_data_sem);
+ if (ret)
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_mmap:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index ae29832aab1e..31dc0496f8d0 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trusts that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adheres to the following rules when createing,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified are large enough to exceed the range that a single
+ * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only holds a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
@@ -1551,7 +1582,6 @@ retry:
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da4263a14a20..42bee1d4f9f9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
* -----------------
@@ -49,19 +50,27 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Prepare all the inodes to write out their data by setting
+ * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the exsiting handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
- * All the inode updates must call ext4_fc_start_update() before starting an
- * update. If such an ongoing update is present, fast commit waits for it to
- * complete. The completion of such an update is marked by
- * ext4_fc_stop_update().
+ * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
* -------------------------
@@ -142,6 +151,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
@@ -156,13 +172,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop. Once we do that we can drop file
- * system locking during commit path.
+ * 1) Handle more ineligible cases.
*
- * 2) Handle more ineligible cases.
+ * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
+ * status tree. This would get rid of the need to call ext4_fc_track_inode()
+ * before acquiring i_data_sem. To do that we would need to ensure that
+ * modified extents from the extent status tree are not evicted from memory.
*/
#include <trace/events/ext4.h>
@@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
- atomic_set(&ei->i_fc_updates, 0);
-}
-
-/* This function must be called with sbi->s_fc_lock held. */
-static void ext4_fc_wait_committing_inode(struct inode *inode)
-__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
-{
- wait_queue_head_t *wq;
- struct ext4_inode_info *ei = EXT4_I(inode);
-
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
}
static bool ext4_fc_disabled(struct super_block *sb)
@@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb)
}
/*
- * Inform Ext4's fast about start of an inode update
- *
- * This function is called by the high level call VFS callbacks before
- * performing any inode update. This function blocks if there's an ongoing
- * fast commit on the inode in question.
- */
-void ext4_fc_start_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
-restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list))
- goto out;
-
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-out:
- atomic_inc(&ei->i_fc_updates);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
-}
-
-/*
- * Stop inode update and wake up waiting fast commits if any.
- */
-void ext4_fc_stop_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
- if (atomic_dec_and_test(&ei->i_fc_updates))
- wake_up_all(&ei->i_fc_wait);
-}
-
-/*
* Remove inode from fast commit list. If the inode is being committed
* we wait until inode commit is done.
*/
@@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
+ wait_queue_head_t *wq;
if (ext4_fc_disabled(inode->i_sb))
return;
-restart:
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ schedule();
+ mutex_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(wq, &wait.wq_entry);
}
-
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ list_del_init(&ei->i_fc_list);
/*
* Since this inode is getting removed, let's also remove all FC
* dentry create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
@@ -320,12 +298,10 @@ restart:
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
- return;
}
/*
@@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -385,7 +361,7 @@ static int ext4_fc_track_template(
int ret;
tid = handle->h_transaction->t_tid;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
update = true;
} else {
@@ -393,19 +369,18 @@ static int ext4_fc_track_template(
ei->i_sync_tid = tid;
}
ret = __fc_track_fn(handle, inode, args, update);
- mutex_unlock(&ei->i_fc_lock);
-
+ spin_unlock(&ei->i_fc_lock);
if (!enqueue)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return ret;
}
@@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
if (IS_ENCRYPTED(dir)) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -EOPNOTSUPP;
}
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
node->fcd_ino = inode->i_ino;
take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
- spin_lock(&sbi->s_fc_lock);
+ INIT_LIST_HEAD(&node->fcd_list);
+ mutex_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
- spin_unlock(&sbi->s_fc_lock);
- mutex_lock(&ei->i_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return 0;
}
@@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
int ret;
if (S_ISDIR(inode->i_mode))
@@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ /*
+ * If we come here, we may sleep while waiting for the inode to
+ * commit. We shouldn't be holding i_data_sem when we go to sleep since
+ * the commit path needs to grab the lock while committing the inode.
+ */
+ lockdep_assert_not_held(&ei->i_data_sem);
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
+ /*
+ * From this point on, this inode will not be committed either
+ * by fast or full commit as long as the handle is open.
+ */
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(handle, inode, ret);
}
@@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
tl.fc_len = cpu_to_le16(remaining);
memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
- *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+ *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
ext4_fc_submit_bh(sb, false);
@@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
- crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
@@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
struct ext4_extent *ex;
int ret;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (ei->i_fc_lblk_len == 0) {
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
return 0;
}
old_blk_size = ei->i_fc_lblk_start;
new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
ei->i_fc_lblk_len = 0;
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
ext4_debug("will try writing %d to %d for inode %ld\n",
@@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
while (cur_lblk_off <= new_blk_size) {
map.m_lblk = cur_lblk_off;
map.m_len = new_blk_size - cur_lblk_off + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ ret = ext4_map_blocks(NULL, inode, &map,
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE);
if (ret < 0)
return -ECANCELED;
@@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
}
-/* Submit data for all the fast commit inodes */
-static int ext4_fc_submit_inode_data_all(journal_t *journal)
+/* Flushes data of all the inodes in the commit queue. */
+static int ext4_fc_flush_data(journal_t *journal)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
int ret = 0;
- spin_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
- while (atomic_read(&ei->i_fc_updates)) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&ei->i_fc_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&ei->i_fc_updates)) {
- spin_unlock(&sbi->s_fc_lock);
- schedule();
- spin_lock(&sbi->s_fc_lock);
- }
- finish_wait(&ei->i_fc_wait, &wait);
- }
- spin_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
- return ret;
-}
-
-/* Wait for completion of data for all the fast commit inodes */
-static int ext4_fc_wait_inode_data_all(journal_t *journal)
-{
- struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *pos, *n;
- int ret = 0;
-
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- if (!ext4_test_inode_state(&pos->vfs_inode,
- EXT4_STATE_FC_COMMITTING))
- continue;
- spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_wait_inode_data(journal, pos->jinode);
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ret = jbd2_wait_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
return 0;
}
/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
-__acquires(&sbi->s_fc_lock)
-__releases(&sbi->s_fc_lock)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock)
list_for_each_entry_safe(fc_dentry, fc_dentry_n,
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
- spin_unlock(&sbi->s_fc_lock);
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
- spin_lock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
continue;
}
/*
* With fcd_dilist we need not loop in sbi->s_fc_q to get the
- * corresponding inode pointer
+ * corresponding inode. Also, the corresponding inode could have been
+ * deleted, in which case, we don't need to do anything.
*/
- WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
- spin_unlock(&sbi->s_fc_lock);
-
/*
* We first write the inode and then the create dirent. This
* allows the recovery code to create an unnamed inode first
@@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock)
*/
ret = ext4_fc_write_inode(inode, crc);
if (ret)
- goto lock_and_exit;
-
+ return ret;
ret = ext4_fc_write_inode_data(inode, crc);
if (ret)
- goto lock_and_exit;
-
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
-
- spin_lock(&sbi->s_fc_lock);
+ return ret;
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
}
return 0;
-lock_and_exit:
- spin_lock(&sbi->s_fc_lock);
- return ret;
}
static int ext4_fc_perform_commit(journal_t *journal)
@@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;
- ret = ext4_fc_submit_inode_data_all(journal);
- if (ret)
- return ret;
+ /*
+ * Step 1: Mark all inodes on s_fc_q[MAIN] with
+ * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
+ * freed until the data flush is over.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /* Step 2: Flush data for all the eligible inodes. */
+ ret = ext4_fc_flush_data(journal);
- ret = ext4_fc_wait_inode_data_all(journal);
+ /*
+ * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
+ * any error from step 2. This ensures that waiters waiting on
+ * EXT4_STATE_FC_FLUSHING_DATA can resume.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ }
+
+ /*
+ * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
+ * the waiter checks the bit. Pairs with implicit barrier in
+ * prepare_to_wait() in ext4_fc_del().
+ */
+ smp_mb();
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /*
+ * If we encountered error in Step 2, return it now after clearing
+ * EXT4_STATE_FC_FLUSHING_DATA bit.
+ */
if (ret)
return ret;
+
+ /* Step 4: Mark all inodes as being committed. */
+ jbd2_journal_lock_updates(journal);
/*
- * If file system device is different from journal device, issue a cache
- * flush before we start writing fast commit blocks.
+ * The journal is now locked. No more handles can start and all the
+ * previous handles are now drained. We now mark the inodes on the
+ * commit queue as being committed.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
+ /*
+ * Step 5: If file system device is different from journal device,
+ * issue a cache flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
+ /* Step 6: Write fast commit blocks to disk. */
if (sbi->s_fc_bytes == 0) {
/*
- * Add a head tag only if this is the first fast commit
- * in this TID.
+ * Step 6.1: Add a head tag only if this is the first fast
+ * commit in this TID.
*/
head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
head.fc_tid = cpu_to_le32(
@@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
}
- spin_lock(&sbi->s_fc_lock);
+ /* Step 6.2: Now write all the dentry updates. */
+ mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
- if (ret) {
- spin_unlock(&sbi->s_fc_lock);
+ if (ret)
goto out;
- }
+ /* Step 6.3: Now write all the changed inodes to disk. */
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode;
if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
continue;
- spin_unlock(&sbi->s_fc_lock);
ret = ext4_fc_write_inode_data(inode, &crc);
if (ret)
goto out;
ret = ext4_fc_write_inode(inode, &crc);
if (ret)
goto out;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
+ /* Step 6.4: Finally write tail tag to conclude this fast commit. */
ret = ext4_fc_write_tail(sb, crc);
out:
+ mutex_unlock(&sbi->s_fc_lock);
blk_finish_plug(&plug);
return ret;
}
@@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
int subtid = atomic_read(&sbi->s_fc_subtid);
int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
+ int old_ioprio, journal_ioprio;
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return jbd2_complete_transaction(journal, commit_tid);
@@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
trace_ext4_fc_commit_start(sb, commit_tid);
start_time = ktime_get();
+ old_ioprio = get_current_ioprio();
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
@@ -1228,6 +1241,15 @@ restart_fc:
goto fallback;
}
+ /*
+ * Now that we know that this thread is going to do a fast commit,
+ * elevate the priority to match that of the journal thread.
+ */
+ if (journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ else
+ journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
+ set_task_ioprio(current, journal_ioprio);
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
ret = ext4_fc_perform_commit(journal);
if (ret < 0) {
@@ -1242,6 +1264,7 @@ restart_fc:
}
atomic_inc(&sbi->s_fc_subtid);
ret = jbd2_fc_end_commit(journal);
+ set_task_ioprio(current, old_ioprio);
/*
* weight the commit time higher than the average time so we
* don't react too strongly to vast changes in the commit time
@@ -1251,6 +1274,7 @@ restart_fc:
return ret;
fallback:
+ set_task_ioprio(current, old_ioprio);
ret = jbd2_fc_end_commit_fallback(journal);
ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
return ret;
@@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *iter, *iter_n;
+ struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
if (full && sbi->s_fc_bh)
@@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- list_del_init(&iter->i_fc_list);
- ext4_clear_inode_state(&iter->vfs_inode,
+ mutex_lock(&sbi->s_fc_lock);
+ while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+ ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
+ struct ext4_inode_info,
+ i_fc_list);
+ list_del_init(&ei->i_fc_list);
+ ext4_clear_inode_state(&ei->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (tid_geq(tid, iter->i_sync_tid)) {
- ext4_fc_reset_inode(&iter->vfs_inode);
+ if (tid_geq(tid, ei->i_sync_tid)) {
+ ext4_fc_reset_inode(&ei->vfs_inode);
} else if (full) {
/*
* We are called after a full commit, inode has been
@@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
* time in that case (and tid doesn't increase so
* tid check above isn't reliable).
*/
- list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
+ list_add_tail(&ei->i_fc_list,
&sbi->s_fc_q[FC_Q_STAGING]);
}
- /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
smp_mb();
#if (BITS_PER_LONG < 64)
- wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
- wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
}
@@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
fcd_list);
list_del_init(&fc_dentry->fcd_list);
list_del_init(&fc_dentry->fcd_dilist);
- spin_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
- spin_lock(&sbi->s_fc_lock);
}
list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
@@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
trace_ext4_fc_stats(sb);
}
@@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_INODE:
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
@@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
break;
}
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a5205149adba..93240e35ee36 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
- if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+
+ if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
+ (iocb->ki_flags & IOCB_ATOMIC))
+ error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
+ size);
+ else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
@@ -688,10 +693,12 @@ out:
static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
+ int ret;
struct inode *inode = file_inode(iocb->ki_filp);
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
@@ -700,7 +707,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_ATOMIC) {
size_t len = iov_iter_count(from);
- int ret;
if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
len > EXT4_SB(inode->i_sb)->s_awu_max)
@@ -741,7 +747,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- pfn_t pfn;
+ unsigned long pfn;
if (write) {
sb_start_pagefault(sb);
@@ -756,9 +762,6 @@ retry:
return VM_FAULT_SIGBUS;
}
} else {
- result = filemap_fsnotify_fault(vmf);
- if (unlikely(result))
- return result;
filemap_invalidate_lock_shared(mapping);
}
result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops);
@@ -801,27 +804,33 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
.page_mkwrite = ext4_page_mkwrite,
};
-static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ext4_file_mmap_prepare(struct vm_area_desc *desc)
{
+ int ret;
+ struct file *file = desc->file;
struct inode *inode = file->f_mapping->host;
struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ if (file->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
/*
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, dax_dev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
- vma->vm_ops = &ext4_dax_vm_ops;
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_ops = &ext4_dax_vm_ops;
+ desc->vm_flags |= VM_HUGEPAGE;
} else {
- vma->vm_ops = &ext4_file_vm_ops;
+ desc->vm_ops = &ext4_file_vm_ops;
}
return 0;
}
@@ -838,7 +847,8 @@ static int ext4_sample_last_mounted(struct super_block *sb,
if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED)))
return 0;
- if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !sb_start_intwrite_trylock(sb))
return 0;
ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED);
@@ -881,8 +891,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
{
int ret;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ if (filp->f_mode & FMODE_WRITE)
+ ret = ext4_emergency_state(inode->i_sb);
+ else
+ ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0;
+ if (unlikely(ret))
+ return ret;
ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt);
if (ret)
@@ -921,12 +935,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
@@ -960,7 +969,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = ext4_file_mmap,
+ .mmap_prepare = ext4_file_mmap_prepare,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -969,7 +978,8 @@ const struct file_operations ext4_file_operations = {
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_DIO_PARALLEL_WRITE,
+ FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct inode_operations ext4_file_inode_operations = {
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 383c6edea6dd..91185c40f755 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -393,6 +393,14 @@ static unsigned int ext4_getfsmap_find_sb(struct super_block *sb,
/* Reserved GDT blocks */
if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) {
len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
+
+ /*
+ * mkfs.ext4 can set s_reserved_gdt_blocks as 0 in some cases,
+ * check for that.
+ */
+ if (!len)
+ return 0;
+
error = ext4_getfsmap_fill(meta_list, fsb, len,
EXT4_FMR_OWN_RESV_GDT);
if (error)
@@ -526,6 +534,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
ext4_group_t end_ag;
ext4_grpblk_t first_cluster;
ext4_grpblk_t last_cluster;
+ struct ext4_fsmap irec;
int error = 0;
bofs = le32_to_cpu(sbi->s_es->s_first_data_block);
@@ -609,10 +618,18 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
goto err;
}
- /* Report any gaps at the end of the bg */
+ /*
+ * The dummy record below will cause ext4_getfsmap_helper() to report
+ * any allocated blocks at the end of the range.
+ */
+ irec.fmr_device = 0;
+ irec.fmr_physical = end_fsb + 1;
+ irec.fmr_length = 0;
+ irec.fmr_owner = EXT4_FMR_OWN_FREE;
+ irec.fmr_flags = 0;
+
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
- 0, info);
+ error = ext4_getfsmap_helper(sb, info, &irec);
if (error)
goto err;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index b40d3b29f7e5..e476c6de3074 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
bool needs_barrier = false;
struct inode *inode = file->f_mapping->host;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
- if (sb_rdonly(inode->i_sb)) {
- /* Make sure that we read updated s_ext4_flags value */
- smp_rmb();
- if (ext4_forced_shutdown(inode->i_sb))
- ret = -EROFS;
+ if (sb_rdonly(inode->i_sb))
goto out;
- }
if (!EXT4_SB(inode->i_sb)->s_journal) {
ret = ext4_fsync_nojournal(file, start, end, datasync,
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index deabe29da7fb..33cd5b6b02d5 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
if (len && IS_CASEFOLDED(dir) &&
(!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) {
- buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
+ buff = kzalloc(PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 21d228073d79..df4051613b29 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
if (!bh || !buffer_uptodate(bh))
/*
* If the block is not in the buffer cache, then it
- * must have been written out.
+ * must have been written out, or, most unlikely, is
+ * being migrated - false failure should be OK here.
*/
goto out;
@@ -951,8 +952,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
sb = dir->i_sb;
sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sb)))
- return ERR_PTR(-EIO);
+ ret2 = ext4_emergency_state(sb);
+ if (unlikely(ret2))
+ return ERR_PTR(ret2);
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
@@ -1282,14 +1284,13 @@ got:
inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -1298,7 +1299,7 @@ got:
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
if (ext4_has_feature_inline_data(sb) &&
- (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
+ (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1334,6 +1335,8 @@ got:
}
}
+ ext4_set_inode_mapping_order(inode);
+
ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7de327fa7b1c..d45124318200 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -539,7 +539,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int indirect_blks;
int blocks_to_boundary = 0;
int depth;
- int count = 0;
+ u64 count = 0;
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
@@ -588,7 +588,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
count++;
/* Fill in size of a hole we found */
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, count);
+ map->m_len = umin(map->m_len, count);
goto cleanup;
}
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3536ca7e4fcc..1b094a4f3866 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -20,6 +20,11 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
+
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ void **fsdata);
+
static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
@@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
struct ext4_inode *raw_inode;
int cp_len = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
BUG_ON(!EXT4_I(inode)->i_inline_off);
@@ -298,7 +303,11 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
goto out;
- BUG_ON(!is.s.not_found);
+ if (!is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "unexpected inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
@@ -349,7 +358,11 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error)
goto out;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode, "missing inline data xattr");
+ error = -EFSCORRUPTED;
+ goto out;
+ }
len -= EXT4_MIN_INLINE_DATA_SIZE;
value = kzalloc(len, GFP_NOFS);
@@ -392,7 +405,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -557,7 +570,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
@@ -596,6 +609,7 @@ retry:
goto out;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_destroy_inline_data_nolock(handle, inode);
if (ret)
goto out;
@@ -606,6 +620,7 @@ retry:
} else
ret = ext4_block_write_begin(handle, folio, from, to,
ext4_get_block);
+ clear_buffer_new(folio_buffers(folio));
if (!ret && ext4_should_journal_data(inode)) {
ret = ext4_walk_page_buffers(handle, inode,
@@ -637,7 +652,7 @@ retry:
goto retry;
if (folio)
- block_commit_write(&folio->page, from, to);
+ block_commit_write(folio, from, to);
out:
if (folio) {
folio_unlock(folio);
@@ -653,91 +668,109 @@ out_nofolio:
}
/*
- * Try to write data in the inode.
- * If the inode has inline data, check whether the new write can be
- * in the inode also. If not, create the page the handle, move the data
- * to the page make it update and let the later codes create extent for it.
+ * Prepare the write for the inline data.
+ * If the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
*/
-int ext4_try_to_write_inline_data(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop)
+int ext4_generic_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop,
+ void **fsdata, bool da)
{
int ret;
handle_t *handle;
struct folio *folio;
struct ext4_iloc iloc;
-
- if (pos + len > ext4_get_max_inline_size(inode))
- goto convert;
+ int retries = 0;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
- /*
- * The possible write could happen in the inode,
- * so try to reserve the space in inode first.
- */
+retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- handle = NULL;
- goto out;
+ goto out_release_bh;
}
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
- goto out;
+ goto out_stop_journal;
- /* We don't have space in inline inode, so convert it to extent. */
if (ret == -ENOSPC) {
ext4_journal_stop(handle);
- brelse(iloc.bh);
- goto convert;
- }
+ if (!da) {
+ brelse(iloc.bh);
+ /* Retry inside */
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ }
- ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
- EXT4_JTR_NONE);
- if (ret)
- goto out;
+ ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
+ goto out_release_bh;
+ }
folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
ret = PTR_ERR(folio);
- goto out;
+ goto out_stop_journal;
}
- *foliop = folio;
down_read(&EXT4_I(inode)->xattr_sem);
+ /* Someone else had converted it to extent */
if (!ext4_has_inline_data(inode)) {
ret = 0;
- folio_unlock(folio);
- folio_put(folio);
- goto out_up_read;
+ goto out_release_folio;
}
if (!folio_test_uptodate(folio)) {
ret = ext4_read_inline_folio(inode, folio);
- if (ret < 0) {
- folio_unlock(folio);
- folio_put(folio);
- goto out_up_read;
- }
+ if (ret < 0)
+ goto out_release_folio;
}
- ret = 1;
- handle = NULL;
-out_up_read:
+ ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE);
+ if (ret)
+ goto out_release_folio;
+ *foliop = folio;
up_read(&EXT4_I(inode)->xattr_sem);
-out:
- if (handle && (ret != 1))
- ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return 1;
+
+out_release_folio:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ folio_unlock(folio);
+ folio_put(folio);
+out_stop_journal:
+ ext4_journal_stop(handle);
+out_release_bh:
brelse(iloc.bh);
return ret;
-convert:
- return ext4_convert_inline_data_to_extent(mapping, inode);
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ struct folio **foliop)
+{
+ if (pos + len > ext4_get_max_inline_size(inode))
+ return ext4_convert_inline_data_to_extent(mapping, inode);
+ return ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, NULL, false);
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
@@ -867,6 +900,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
return ret;
}
+ clear_buffer_new(folio_buffers(folio));
folio_mark_dirty(folio);
folio_mark_uptodate(folio);
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -881,94 +915,6 @@ out:
return ret;
}
-/*
- * Prepare the write for the inline data.
- * If the data can be written into the inode, we just read
- * the page and make it uptodate, and start the journal.
- * Otherwise read the page, makes it dirty so that it can be
- * handle in writepages(the i_disksize update is left to the
- * normal ext4_da_write_end).
- */
-int ext4_da_write_inline_data_begin(struct address_space *mapping,
- struct inode *inode,
- loff_t pos, unsigned len,
- struct folio **foliop,
- void **fsdata)
-{
- int ret;
- handle_t *handle;
- struct folio *folio;
- struct ext4_iloc iloc;
- int retries = 0;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret)
- return ret;
-
-retry_journal:
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out;
- }
-
- ret = ext4_prepare_inline_data(handle, inode, pos + len);
- if (ret && ret != -ENOSPC)
- goto out_journal;
-
- if (ret == -ENOSPC) {
- ext4_journal_stop(handle);
- ret = ext4_da_convert_inline_data_to_extent(mapping,
- inode,
- fsdata);
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry_journal;
- goto out;
- }
-
- /*
- * We cannot recurse into the filesystem as the transaction
- * is already started.
- */
- folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio)) {
- ret = PTR_ERR(folio);
- goto out_journal;
- }
-
- down_read(&EXT4_I(inode)->xattr_sem);
- if (!ext4_has_inline_data(inode)) {
- ret = 0;
- goto out_release_page;
- }
-
- if (!folio_test_uptodate(folio)) {
- ret = ext4_read_inline_folio(inode, folio);
- if (ret < 0)
- goto out_release_page;
- }
- ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh,
- EXT4_JTR_NONE);
- if (ret)
- goto out_release_page;
-
- up_read(&EXT4_I(inode)->xattr_sem);
- *foliop = folio;
- brelse(iloc.bh);
- return 1;
-out_release_page:
- up_read(&EXT4_I(inode)->xattr_sem);
- folio_unlock(folio);
- folio_put(folio);
-out_journal:
- ext4_journal_stop(handle);
-out:
- brelse(iloc.bh);
- return ret;
-}
-
#ifdef INLINE_DIR_DEBUG
void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
void *inline_start, int inline_size)
@@ -1012,7 +958,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
int err;
struct ext4_dir_entry_2 *de;
- err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+ err = ext4_find_dest_de(dir, iloc->bh, inline_start,
inline_size, fname, &de);
if (err)
return err;
@@ -1059,7 +1005,7 @@ static void *ext4_get_inline_xattr_pos(struct inode *inode,
}
/* Set the final de to cover the whole block. */
-static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+void ext4_update_final_de(void *de_buf, int old_size, int new_size)
{
struct ext4_dir_entry_2 *de, *prev_de;
void *limit;
@@ -1123,51 +1069,6 @@ static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
-static int ext4_finish_convert_inline_dir(handle_t *handle,
- struct inode *inode,
- struct buffer_head *dir_block,
- void *buf,
- int inline_size)
-{
- int err, csum_size = 0, header_size = 0;
- struct ext4_dir_entry_2 *de;
- void *target = dir_block->b_data;
-
- /*
- * First create "." and ".." and then copy the dir information
- * back to the block.
- */
- de = target;
- de = ext4_init_dot_dotdot(inode, de,
- inode->i_sb->s_blocksize, csum_size,
- le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
- header_size = (void *)de - target;
-
- memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
- inline_size - EXT4_INLINE_DOTDOT_SIZE);
-
- if (ext4_has_metadata_csum(inode->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
- inode->i_size = inode->i_sb->s_blocksize;
- i_size_write(inode, inode->i_sb->s_blocksize);
- EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- ext4_update_final_de(dir_block->b_data,
- inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
- inode->i_sb->s_blocksize - csum_size);
-
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block,
- inode->i_sb->s_blocksize);
- set_buffer_uptodate(dir_block);
- unlock_buffer(dir_block);
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- return err;
- set_buffer_verified(dir_block);
- return ext4_mark_inode_dirty(handle, inode);
-}
-
static int ext4_convert_inline_data_nolock(handle_t *handle,
struct inode *inode,
struct ext4_iloc *iloc)
@@ -1239,8 +1140,17 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
- error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
- buf, inline_size);
+ unlock_buffer(data_bh);
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+
+ error = ext4_init_dirblock(handle, inode, data_bh,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode),
+ buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+ if (!error)
+ error = ext4_mark_inode_dirty(handle, inode);
}
out_restore:
@@ -1379,7 +1289,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
- strcpy(fake.name, ".");
+ memcpy(fake.name, ".", 2);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1389,7 +1299,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
- strcpy(fake.name, "..");
+ memcpy(fake.name, "..", 3);
fake.rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(fake.name_len, NULL),
inline_size);
@@ -1928,7 +1838,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
};
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
return PTR_ERR(handle);
@@ -1967,7 +1877,12 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
- BUG_ON(is.s.not_found);
+ if (is.s.not_found) {
+ EXT4_ERROR_INODE(inode,
+ "missing inline data xattr");
+ err = -EFSCORRUPTED;
+ goto out_error;
+ }
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
@@ -2043,7 +1958,7 @@ int ext4_convert_inline_data(struct inode *inode)
return 0;
}
- needed_blocks = ext4_writepage_trans_blocks(inode);
+ needed_blocks = ext4_chunk_trans_extent(inode, 1);
iloc.bh = NULL;
error = ext4_get_inode_loc(inode, &iloc);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7c54ae5fcbd4..5b7a15db4953 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -31,6 +31,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
@@ -57,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__u16 dummy_csum = 0;
int offset = offsetof(struct ext4_inode, i_checksum_lo);
unsigned int csum_size = sizeof(dummy_csum);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_GOOD_OLD_INODE_SIZE - offset);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
offset = offsetof(struct ext4_inode, i_checksum_hi);
- csum = ext4_chksum(sbi, csum, (__u8 *)raw +
- EXT4_GOOD_OLD_INODE_SIZE,
+ csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
offset - EXT4_GOOD_OLD_INODE_SIZE);
if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
}
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_INODE_SIZE(inode->i_sb) - offset);
}
@@ -93,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
provided = le16_to_cpu(raw->i_checksum_lo);
@@ -114,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
cpu_to_le32(EXT4_OS_LINUX) ||
- !ext4_has_metadata_csum(inode->i_sb))
+ !ext4_has_feature_metadata_csum(inode->i_sb))
return;
csum = ext4_inode_csum(inode, raw, ei);
@@ -141,16 +140,13 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents);
-
/*
* Test whether an inode is a fast symlink.
* A fast symlink has its symlink data stored in ext4_inode_info->i_data.
*/
int ext4_inode_is_fast_symlink(struct inode *inode)
{
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ if (!ext4_has_feature_ea_inode(inode->i_sb)) {
int ea_blocks = EXT4_I(inode)->i_file_acl ?
EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -181,6 +177,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ dax_break_layout_final(inode);
+
if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
@@ -383,10 +381,11 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+ if (journal && inode == journal->j_inode)
return 0;
+
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
@@ -412,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
return ret;
}
+/*
+ * For generic regular files, when updating the extent tree, Ext4 should
+ * hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against concurrent page faults, as well as reads and writes.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+void ext4_check_map_extents_env(struct inode *inode)
+{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ if (!S_ISREG(inode->i_mode) ||
+ IS_NOQUOTA(inode) || IS_VERITY(inode) ||
+ is_special_ino(inode->i_sb, inode->i_ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ ext4_verity_in_progress(inode))
+ return;
+
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+}
+#else
+void ext4_check_map_extents_env(struct inode *inode) {}
+#endif
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -458,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int orig_mlen)
+{
+ struct ext4_map_blocks map2;
+ unsigned int status, status2;
+ int retval;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
+ WARN_ON_ONCE(orig_mlen <= map->m_len);
+
+ /* Prepare map2 for lookup in next leaf block */
+ map2.m_lblk = map->m_lblk + map->m_len;
+ map2.m_len = orig_mlen - map->m_len;
+ map2.m_flags = 0;
+ retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+ if (retval <= 0) {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return map->m_len;
+ }
+
+ if (unlikely(retval != map2.m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map2.m_len);
+ WARN_ON(1);
+ }
+
+ status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ /*
+ * If map2 is contiguous with map, then let's insert it as a single
+ * extent in es cache and return the combined length of both the maps.
+ */
+ if (map->m_pblk + map->m_len == map2.m_pblk &&
+ status == status2) {
+ ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status, false);
+ map->m_len += map2.m_len;
+ } else {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ }
+
+ return map->m_len;
+}
+
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map)
+ struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
+ unsigned int orig_mlen = map->m_len;
+ flags &= EXT4_EX_QUERY_FILTER;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
else
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
if (retval <= 0)
return retval;
@@ -480,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
WARN_ON(1);
}
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
- return retval;
+ /*
+ * No need to query next in leaf:
+ * - if returned extent is not last in leaf or
+ * - if the last in leaf is the full requested range
+ */
+ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
+ map->m_len == orig_mlen) {
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return retval;
+ }
+
+ return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -598,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -618,9 +712,18 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
return -EFSCORRUPTED;
+ /*
+ * Callers from the context of data submission are the only exceptions
+ * for regular files that do not hold the i_rwsem or invalidate_lock.
+ * However, caching unrelated ranges is not permitted.
+ */
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+ else
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
- if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
- ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
+ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
map->m_pblk = ext4_es_pblock(&es) +
map->m_lblk - es.es_lblk;
@@ -649,7 +752,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
- goto found;
+ if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
+ orig_mlen == map->m_len)
+ goto found;
+
+ map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -663,7 +770,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- retval = ext4_map_query_blocks(handle, inode, map);
+ retval = ext4_map_query_blocks(handle, inode, map, flags);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -692,6 +799,8 @@ found:
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
+
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -751,7 +860,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
flags &= EXT4_MAP_FLAGS;
/* Dummy buffer_head? Set non-atomically. */
- if (!bh->b_page) {
+ if (!bh->b_folio) {
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
return;
}
@@ -766,6 +875,26 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
} while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state)));
}
+/*
+ * Make sure that the current journal transaction has enough credits to map
+ * one extent. Return -EAGAIN if it cannot extend the current running
+ * transaction.
+ */
+static inline int ext4_journal_ensure_extent_credits(handle_t *handle,
+ struct inode *inode)
+{
+ int credits;
+ int ret;
+
+ /* Called from ext4_da_write_begin() which has no handle started? */
+ if (!handle)
+ return 0;
+
+ credits = ext4_chunk_trans_blocks(inode, 1);
+ ret = __ext4_journal_ensure_credits(handle, credits, credits, 0);
+ return ret <= 0 ? ret : -EAGAIN;
+}
+
static int _ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int flags)
{
@@ -1005,7 +1134,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
*/
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
@@ -1023,7 +1157,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
+ unsigned int from = offset_in_folio(folio, pos);
unsigned to = from + len;
struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
@@ -1037,8 +1171,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_buffers(folio);
@@ -1056,11 +1189,13 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
}
continue;
}
- if (buffer_new(bh))
+ if (WARN_ON_ONCE(buffer_new(bh)))
clear_buffer_new(bh);
if (!buffer_mapped(bh)) {
WARN_ON(bh->b_size != blocksize);
- err = get_block(inode, block, bh, 1);
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (!err)
+ err = get_block(inode, block, bh, 1);
if (err)
break;
if (buffer_new(bh)) {
@@ -1137,7 +1272,8 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
* and the ext4_write_end(). So doing the jbd2_journal_start at the start of
* ext4_write_begin() is the right place.
*/
-static int ext4_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -1149,18 +1285,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
unsigned from, to;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_write_begin(inode, pos, len);
/*
* Reserve one block more for addition to orphan list in case
* we allocate blocks but write fails for some reason
*/
- needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
+ needed_blocks = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode)) + 1;
index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
@@ -1172,17 +1308,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
}
/*
- * __filemap_get_folio() can take a long time if the
+ * write_begin_get_folio() can take a long time if the
* system is thrashing due to memory pressure, or if the folio
* is being written back. So grab it first before we start
* the transaction handle. This also allows us to allocate
* the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
+
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
+ from = offset_in_folio(folio, pos);
+ to = from + len;
+
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -1251,8 +1393,9 @@ retry_journal:
ext4_orphan_del(NULL, inode);
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (ret == -EAGAIN ||
+ (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_journal;
folio_put(folio);
return ret;
@@ -1272,17 +1415,18 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
ret = ext4_dirty_journalled_data(handle, bh);
clear_buffer_meta(bh);
clear_buffer_prio(bh);
+ clear_buffer_new(bh);
return ret;
}
/*
* We need to pick up the new inode size which generic_commit_write gave us
- * `file' can be NULL - eg, when called from page_symlink().
+ * `iocb` can be NULL - eg, when called from page_symlink().
*
* ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
-static int ext4_write_end(struct file *file,
+static int ext4_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -1301,7 +1445,7 @@ static int ext4_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
- copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ copied = block_write_end(pos, len, copied, folio);
/*
* it's important to update i_size while still holding folio lock:
* page writeout could otherwise come in and zero beyond i_size.
@@ -1387,7 +1531,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
} while (bh != head);
}
-static int ext4_journalled_write_end(struct file *file,
+static int ext4_journalled_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -1544,11 +1688,12 @@ struct mpage_da_data {
unsigned int can_map:1; /* Can writepages call map blocks? */
/* These are internal state of ext4_do_writepages() */
- pgoff_t first_page; /* The first page to write */
- pgoff_t next_page; /* Current page to examine */
- pgoff_t last_page; /* Last page to examine */
+ loff_t start_pos; /* The start pos to write */
+ loff_t next_pos; /* Current pos to examine */
+ loff_t end_pos; /* Last pos to examine */
+
/*
- * Extent to map - this can be after first_page because that can be
+ * Extent to map - this can be after start_pos because that can be
* fully mapped. We somewhat abuse m_flags to store whether the extent
* is delalloc or unwritten.
*/
@@ -1568,38 +1713,38 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
- /* This is necessary when next_page == 0. */
- if (mpd->first_page >= mpd->next_page)
+ /* This is necessary when next_pos == 0. */
+ if (mpd->start_pos >= mpd->next_pos)
return;
mpd->scanned_until_end = 0;
- index = mpd->first_page;
- end = mpd->next_page - 1;
if (invalidate) {
ext4_lblk_t start, last;
- start = index << (PAGE_SHIFT - inode->i_blkbits);
- last = end << (PAGE_SHIFT - inode->i_blkbits);
+ start = EXT4_B_TO_LBLK(inode, mpd->start_pos);
+ last = mpd->next_pos >> inode->i_blkbits;
/*
* avoid racing with extent status tree scans made by
* ext4_insert_delayed_block()
*/
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_es_remove_extent(inode, start, last - start + 1);
+ ext4_es_remove_extent(inode, start, last - start);
up_write(&EXT4_I(inode)->i_data_sem);
}
folio_batch_init(&fbatch);
- while (index <= end) {
- nr = filemap_get_folios(mapping, &index, end, &fbatch);
+ index = mpd->start_pos >> PAGE_SHIFT;
+ end = mpd->next_pos >> PAGE_SHIFT;
+ while (index < end) {
+ nr = filemap_get_folios(mapping, &index, end - 1, &fbatch);
if (nr == 0)
break;
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
- if (folio->index < mpd->first_page)
+ if (folio_pos(folio) < mpd->start_pos)
continue;
- if (folio_next_index(folio) - 1 > end)
+ if (folio_next_index(folio) > end)
continue;
BUG_ON(!folio_test_locked(folio));
BUG_ON(folio_test_writeback(folio));
@@ -1760,6 +1905,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
map->m_len = min_t(unsigned int, map->m_len,
@@ -1800,7 +1947,7 @@ found:
if (ext4_has_inline_data(inode))
retval = 0;
else
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
return retval < 0 ? retval : 0;
@@ -1823,7 +1970,7 @@ add_delayed:
goto found;
}
} else if (!ext4_has_inline_data(inode)) {
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
return retval < 0 ? retval : 0;
@@ -1899,7 +2046,8 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
- mpd->first_page += folio_nr_pages(folio);
+ mpd->start_pos += folio_size(folio);
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -1909,7 +2057,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
loff_t size;
int err;
- BUG_ON(folio->index != mpd->first_page);
+ WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos);
folio_clear_dirty_for_io(folio);
/*
* We have to be very careful here! Nothing protects writeback path
@@ -1930,8 +2078,6 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
!ext4_verity_in_progress(mpd->inode))
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
- if (!err)
- mpd->wbc->nr_to_write--;
return err;
}
@@ -2154,7 +2300,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
- lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
folio_batch_init(&fbatch);
@@ -2165,6 +2310,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
+ lblk = folio->index << bpp_bits;
err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
@@ -2198,6 +2344,11 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
int get_blocks_flags;
int err, dioread_nolock;
+ /* Make sure transaction has enough credits for this extent */
+ err = ext4_journal_ensure_extent_credits(handle, inode);
+ if (err < 0)
+ return err;
+
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
@@ -2207,11 +2358,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* previously reserved. However we must not fail because we're in
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
- * possible.
+ * possible. In addition, do not cache any unrelated extents, as it
+ * only holds the folio lock but does not hold the i_rwsem or
+ * invalidate_lock, which could corrupt the extent status tree.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2225,7 +2380,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
handle->h_rsv_handle = NULL;
}
- ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
+ ext4_set_io_unwritten_flag(mpd->io_submit.io_end);
}
BUG_ON(map->m_len == 0);
@@ -2233,6 +2388,47 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
}
/*
+ * This is used to submit mapped buffers in a single folio that is not fully
+ * mapped for various reasons, such as insufficient space or journal credits.
+ */
+static int mpage_submit_partial_folio(struct mpage_da_data *mpd)
+{
+ struct inode *inode = mpd->inode;
+ struct folio *folio;
+ loff_t pos;
+ int ret;
+
+ folio = filemap_get_folio(inode->i_mapping,
+ mpd->start_pos >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ /*
+ * The mapped position should be within the current processing folio
+ * but must not be the folio start position.
+ */
+ pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits;
+ if (WARN_ON_ONCE((folio_pos(folio) == pos) ||
+ !folio_contains(folio, pos >> PAGE_SHIFT)))
+ return -EINVAL;
+
+ ret = mpage_submit_folio(mpd, folio);
+ if (ret)
+ goto out;
+ /*
+ * Update start_pos to prevent this folio from being released in
+ * mpage_release_unused_pages(), it will be reset to the aligned folio
+ * pos when this folio is written again in the next round. Additionally,
+ * do not update wbc->nr_to_write here, as it will be updated once the
+ * entire folio has finished processing.
+ */
+ mpd->start_pos = pos;
+out:
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+}
+
+/*
* mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
* mpd->len and submit pages underlying it for IO
*
@@ -2273,17 +2469,25 @@ static int mpage_map_and_submit_extent(handle_t *handle,
if (err < 0) {
struct super_block *sb = inode->i_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
goto invalidate_dirty_pages;
/*
* Let the uper layers retry transient errors.
* In the case of ENOSPC, if ext4_count_free_blocks()
* is non-zero, a commit should free up blocks.
*/
- if ((err == -ENOMEM) ||
+ if ((err == -ENOMEM) || (err == -EAGAIN) ||
(err == -ENOSPC && ext4_count_free_clusters(sb))) {
- if (progress)
+ /*
+ * We may have already allocated extents for
+ * some bhs inside the folio, issue the
+ * corresponding data to prevent stale data.
+ */
+ if (progress) {
+ if (mpage_submit_partial_folio(mpd))
+ goto invalidate_dirty_pages;
goto update_disksize;
+ }
return err;
}
ext4_msg(sb, KERN_CRIT,
@@ -2317,7 +2521,7 @@ update_disksize:
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
*/
- disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
+ disksize = mpd->start_pos;
if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2341,21 +2545,6 @@ update_disksize:
return err;
}
-/*
- * Calculate the total number of credits to reserve for one writepages
- * iteration. This is called from ext4_writepages(). We map an extent of
- * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
- * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
- * bpp - 1 blocks in bpp different extents.
- */
-static int ext4_da_writepages_trans_blocks(struct inode *inode)
-{
- int bpp = ext4_journal_blocks_per_page(inode);
-
- return ext4_meta_trans_blocks(inode,
- MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
-}
-
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
size_t len)
{
@@ -2386,7 +2575,7 @@ static int mpage_journal_page_buffers(handle_t *handle,
size_t len = folio_size(folio);
folio_clear_checked(folio);
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
@@ -2420,15 +2609,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct address_space *mapping = mpd->inode->i_mapping;
struct folio_batch fbatch;
unsigned int nr_folios;
- pgoff_t index = mpd->first_page;
- pgoff_t end = mpd->last_page;
+ pgoff_t index = mpd->start_pos >> PAGE_SHIFT;
+ pgoff_t end = mpd->end_pos >> PAGE_SHIFT;
xa_mark_t tag;
int i, err = 0;
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
- int bpp = ext4_journal_blocks_per_page(mpd->inode);
+ int bpp = ext4_journal_blocks_per_folio(mpd->inode);
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
@@ -2436,7 +2625,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
tag = PAGECACHE_TAG_DIRTY;
mpd->map.m_len = 0;
- mpd->next_page = index;
+ mpd->next_pos = mpd->start_pos;
if (ext4_should_journal_data(mpd->inode)) {
handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
bpp);
@@ -2467,7 +2656,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
goto out;
/* If we can't merge this page, we are done. */
- if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
+ if (mpd->map.m_len > 0 &&
+ mpd->next_pos != folio_pos(folio))
goto out;
if (handle) {
@@ -2513,8 +2703,8 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
}
if (mpd->map.m_len == 0)
- mpd->first_page = folio->index;
- mpd->next_page = folio_next_index(folio);
+ mpd->start_pos = folio_pos(folio);
+ mpd->next_pos = folio_pos(folio) + folio_size(folio);
/*
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
@@ -2599,10 +2789,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) {
- ret = -EROFS;
+ ret = ext4_emergency_state(mapping->host->i_sb);
+ if (unlikely(ret))
goto out_writepages;
- }
/*
* If we have inline data and arrive here, it means that
@@ -2643,12 +2832,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
mpd->journalled_more_data = 0;
if (ext4_should_dioread_nolock(inode)) {
+ int bpf = ext4_journal_blocks_per_folio(inode);
/*
* We may need to convert up to one extent per block in
- * the page and we may dirty the inode.
+ * the folio and we may dirty the inode.
*/
- rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
- PAGE_SIZE >> inode->i_blkbits);
+ rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf);
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
@@ -2658,18 +2847,18 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
writeback_index = mapping->writeback_index;
if (writeback_index)
cycled = 0;
- mpd->first_page = writeback_index;
- mpd->last_page = -1;
+ mpd->start_pos = writeback_index << PAGE_SHIFT;
+ mpd->end_pos = LLONG_MAX;
} else {
- mpd->first_page = wbc->range_start >> PAGE_SHIFT;
- mpd->last_page = wbc->range_end >> PAGE_SHIFT;
+ mpd->start_pos = wbc->range_start;
+ mpd->end_pos = wbc->range_end;
}
ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- tag_pages_for_writeback(mapping, mpd->first_page,
- mpd->last_page);
+ tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT,
+ mpd->end_pos >> PAGE_SHIFT);
blk_start_plug(&plug);
/*
@@ -2712,8 +2901,14 @@ retry:
* not supported by delalloc.
*/
BUG_ON(ext4_should_journal_data(inode));
- needed_blocks = ext4_da_writepages_trans_blocks(inode);
-
+ /*
+ * Calculate the number of credits needed to reserve for one
+ * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will
+ * attempt to extend the transaction or start a new iteration
+ * if the reserved credits are insufficient.
+ */
+ needed_blocks = ext4_chunk_trans_blocks(inode,
+ MAX_WRITEPAGES_EXTENT_LEN);
/* start a new transaction */
handle = ext4_journal_start_with_reserve(inode,
EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
@@ -2729,7 +2924,8 @@ retry:
}
mpd->do_map = 1;
- trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
+ trace_ext4_da_write_folios_start(inode, mpd->start_pos,
+ mpd->next_pos, wbc);
ret = mpage_prepare_extent_to_map(mpd);
if (!ret && mpd->map.m_len)
ret = mpage_map_and_submit_extent(handle, mpd,
@@ -2767,6 +2963,8 @@ retry:
} else
ext4_put_io_end(mpd->io_submit.io_end);
mpd->io_submit.io_end = NULL;
+ trace_ext4_da_write_folios_end(inode, mpd->start_pos,
+ mpd->next_pos, wbc, ret);
if (ret == -ENOSPC && sbi->s_journal) {
/*
@@ -2778,6 +2976,8 @@ retry:
ret = 0;
continue;
}
+ if (ret == -EAGAIN)
+ ret = 0;
/* Fatal error - ENOMEM, EIO... */
if (ret)
break;
@@ -2786,8 +2986,8 @@ unplug:
blk_finish_plug(&plug);
if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
- mpd->last_page = writeback_index - 1;
- mpd->first_page = 0;
+ mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1;
+ mpd->start_pos = 0;
goto retry;
}
@@ -2797,7 +2997,7 @@ unplug:
* Set the writeback_index so that range_cyclic
* mode will write it back later
*/
- mapping->writeback_index = mpd->first_page;
+ mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT;
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
@@ -2817,8 +3017,9 @@ static int ext4_writepages(struct address_space *mapping,
int ret;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
@@ -2858,8 +3059,9 @@ static int ext4_dax_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
int alloc_ctx;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
alloc_ctx = ext4_writepages_down_read(inode->i_sb);
trace_ext4_writepages(inode, wbc);
@@ -2906,7 +3108,8 @@ static int ext4_nonda_switch(struct super_block *sb)
return 0;
}
-static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+static int ext4_da_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -2915,22 +3118,23 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index;
struct inode *inode = mapping->host;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
index = pos >> PAGE_SHIFT;
if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) {
*fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
- return ext4_write_begin(file, mapping, pos,
+ return ext4_write_begin(iocb, mapping, pos,
len, foliop, fsdata);
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len);
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
- ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len,
- foliop, fsdata);
+ ret = ext4_generic_write_inline_data(mapping, inode, pos, len,
+ foliop, fsdata, true);
if (ret < 0)
return ret;
if (ret == 1)
@@ -2938,18 +3142,20 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ folio = write_begin_get_folio(iocb, mapping, index, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
if (ret < 0) {
folio_unlock(folio);
folio_put(folio);
/*
- * block_write_begin may have instantiated a few blocks
+ * ext4_block_write_begin may have instantiated a few blocks
* outside i_size. Trim these off again. Don't need
* i_size_read because we hold inode lock.
*/
@@ -3008,8 +3214,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which all that's needed to trigger page writeback.
*/
- copied = block_write_end(NULL, mapping, pos, len, copied,
- folio, NULL);
+ copied = block_write_end(pos, len, copied, folio);
new_i_size = pos + copied;
/*
@@ -3031,7 +3236,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
unsigned long end;
i_size_write(inode, new_i_size);
- end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ end = offset_in_folio(folio, new_i_size - 1);
if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
@@ -3060,7 +3265,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
return copied;
}
-static int ext4_da_write_end(struct file *file,
+static int ext4_da_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -3069,7 +3274,7 @@ static int ext4_da_write_end(struct file *file,
int write_mode = (int)(unsigned long)fsdata;
if (write_mode == FALL_BACK_TO_NONDELALLOC)
- return ext4_write_end(file, mapping, pos,
+ return ext4_write_end(iocb, mapping, pos,
len, copied, folio, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
@@ -3290,6 +3495,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ /* HW-offload atomics are always used */
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
if (flags & IOMAP_DAX)
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
@@ -3329,12 +3538,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
}
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is a slow path in case of mixed mapping. We use
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+ * contiguous mapped mapping. This will ensure any unwritten or hole
+ * regions within the requested range is zeroed out and we return
+ * a single contiguous mapped extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in above loop, so we need to query the
+ * start of the physical extent, based on the origin m_lblk and m_len.
+ * Let's also ensure we were able to allocate the required range for
+ * mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only with bigalloc enabled filesystem where the underlying
+ * cluster is already allocated. This avoids allocating discontiguous extents
+ * in the slow path due to multiple calls to ext4_map_blocks().
+ * The slow path is mostly non-performance critical path, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * slow path means we have mixed mapping, that means we will need
+ * to force txn commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
/*
* Trim the mapping request to the maximum value that we can map at
@@ -3342,7 +3688,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
*/
if (map->m_len > DIO_MAX_BLOCKS)
map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * journal credits estimation for atomic writes. We call
+ * ext4_map_blocks(), to find if there could be a mixed mapping. If yes,
+ * then let's assume the no. of pextents required can be m_len i.e.
+ * every alternate block can be unwritten and hole.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ unsigned int orig_mlen = map->m_len;
+
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_len < orig_mlen) {
+ map->m_len = orig_mlen;
+ dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+ map->m_len);
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ map->m_len);
+ }
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+ }
retry:
/*
@@ -3373,7 +3742,11 @@ retry:
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC)
+ ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+ &force_commit);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3387,6 +3760,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+ /*
+ * Force commit the current transaction if the allocation spans a mixed
+ * mapping range. This ensures any pending metadata updates (like
+ * unwritten to written extents conversion) in this range are in
+ * consistent state with the file data blocks, before performing the
+ * actual write I/O. If the commit fails, the whole I/O must be aborted
+ * to prevent any possible torn writes.
+ */
+ if (ret > 0 && force_commit) {
+ int ret2;
+
+ ret2 = ext4_force_commit(inode->i_sb);
+ if (ret2)
+ return ret2;
+ }
+
return ret;
}
@@ -3397,6 +3786,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
int ret;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ unsigned int orig_mlen;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3410,6 +3800,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ orig_mlen = map.m_len;
if (flags & IOMAP_WRITE) {
/*
@@ -3420,11 +3811,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
- goto out;
+ /*
+ * For atomic writes the entire requested length should
+ * be mapped.
+ */
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+ (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ goto out;
+ }
+ map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
+ /*
+ * This can be called for overwrites path from
+ * ext4_iomap_overwrite_begin().
+ */
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3438,6 +3841,16 @@ out:
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+ /*
+ * Before returning to iomap, let's ensure the allocated mapping
+ * covers the entire requested length for atomic writes.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ if (map.m_len < (length >> blkbits)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3679,9 +4092,7 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize, pos;
+ unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3696,13 +4107,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
if (!bh)
bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
+ offset = offset_in_folio(folio, from);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
@@ -3902,6 +4314,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return ret;
}
+static inline void ext4_truncate_folio(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ struct folio *folio;
+
+ /* Nothing to be done if no complete block needs to be truncated. */
+ if (round_up(start, blocksize) >= round_down(end, blocksize))
+ return;
+
+ folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
+ if (IS_ERR(folio))
+ return;
+
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+}
+
+int ext4_truncate_page_cache_block_range(struct inode *inode,
+ loff_t start, loff_t end)
+{
+ unsigned long blocksize = i_blocksize(inode);
+ int ret;
+
+ /*
+ * For journalled data we need to write (and checkpoint) pages
+ * before discarding page cache to avoid inconsitent data on disk
+ * in case of crash before freeing or unwritten converting trans
+ * is committed.
+ */
+ if (ext4_should_journal_data(inode)) {
+ ret = filemap_write_and_wait_range(inode->i_mapping, start,
+ end - 1);
+ if (ret)
+ return ret;
+ goto truncate_pagecache;
+ }
+
+ /*
+ * If the block size is less than the page size, the file's mapped
+ * blocks within one page could be freed or converted to unwritten.
+ * So it's necessary to remove writable userspace mappings, and then
+ * ext4_page_mkwrite() can be called during subsequent write access
+ * to these partial folios.
+ */
+ if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
+ blocksize < PAGE_SIZE && start < inode->i_size) {
+ loff_t page_boundary = round_up(start, PAGE_SIZE);
+
+ ext4_truncate_folio(inode, start, min(page_boundary, end));
+ if (end > page_boundary)
+ ext4_truncate_folio(inode,
+ round_down(end, PAGE_SIZE), end);
+ }
+
+truncate_pagecache:
+ truncate_pagecache_range(inode, start, end - 1);
+ return 0;
+}
+
static void ext4_wait_dax_page(struct inode *inode)
{
filemap_invalidate_unlock(inode->i_mapping);
@@ -3911,24 +4385,10 @@ static void ext4_wait_dax_page(struct inode *inode)
int ext4_break_layouts(struct inode *inode)
{
- struct page *page;
- int error;
-
if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
return -EINVAL;
- do {
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
-
- error = ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1,
- TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(inode));
- } while (error == 0);
-
- return error;
+ return dax_break_layout_inode(inode, ext4_wait_dax_page);
}
/*
@@ -3946,148 +4406,112 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
- ext4_lblk_t first_block, stop_block;
- struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset, max_length;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ ext4_lblk_t start_lblk, end_lblk;
+ loff_t max_end = sb->s_maxbytes;
+ loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
- int ret = 0, ret2 = 0;
+ int ret;
trace_ext4_punch_hole(inode, offset, length, 0);
+ WARN_ON_ONCE(!inode_is_locked(inode));
/*
- * Write out all dirty pages to avoid race conditions
- * Then release them.
+ * For indirect-block based inodes, make sure that the hole within
+ * one block before last range.
*/
- if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
- ret = filemap_write_and_wait_range(mapping, offset,
- offset + length - 1);
- if (ret)
- return ret;
- }
-
- inode_lock(inode);
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
- goto out_mutex;
+ if (offset >= inode->i_size || offset >= max_end)
+ return 0;
/*
- * If the hole extends beyond i_size, set the hole
- * to end after the page that contains i_size
+ * If the hole extends beyond i_size, set the hole to end after
+ * the page that contains i_size.
*/
- if (offset + length > inode->i_size) {
- length = inode->i_size +
- PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) -
- offset;
- }
+ if (end > inode->i_size)
+ end = round_up(inode->i_size, PAGE_SIZE);
+ if (end > max_end)
+ end = max_end;
+ length = end - offset;
/*
- * For punch hole the length + offset needs to be within one block
- * before last range. Adjust the length if it goes beyond that limit.
+ * Attach jinode to inode for jbd2 if we do any zeroing of partial
+ * block.
*/
- max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
- if (offset + length > max_length)
- length = max_length - offset;
-
- if (offset & (sb->s_blocksize - 1) ||
- (offset + length) & (sb->s_blocksize - 1)) {
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of
- * partial block
- */
+ if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
ret = ext4_inode_attach_jinode(inode);
if (ret < 0)
- goto out_mutex;
-
+ return ret;
}
- /* Wait all existing dio workers, newcomers will block on i_rwsem */
- inode_dio_wait(inode);
- ret = file_modified(file);
+ ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
- goto out_mutex;
-
- /*
- * Prevent page faults from reinstantiating pages we have released from
- * page cache.
- */
- filemap_invalidate_lock(mapping);
-
- ret = ext4_break_layouts(inode);
- if (ret)
- goto out_dio;
-
- first_block_offset = round_up(offset, sb->s_blocksize);
- last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
+ return ret;
/* Now release the pages and zero block aligned part of pages*/
- if (last_block_offset > first_block_offset) {
- ret = ext4_update_disksize_before_punch(inode, offset, length);
- if (ret)
- goto out_dio;
- truncate_pagecache_range(inode, first_block_offset,
- last_block_offset);
- }
+ ret = ext4_truncate_page_cache_block_range(inode, offset, end);
+ if (ret)
+ return ret;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 2);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(sb, ret);
- goto out_dio;
+ return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset,
- length);
+ ret = ext4_zero_partial_blocks(handle, inode, offset, length);
if (ret)
- goto out_stop;
-
- first_block = (offset + sb->s_blocksize - 1) >>
- EXT4_BLOCK_SIZE_BITS(sb);
- stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+ goto out_handle;
/* If there are blocks to remove, do it */
- if (stop_block > first_block) {
- ext4_lblk_t hole_len = stop_block - first_block;
+ start_lblk = EXT4_B_TO_LBLK(inode, offset);
+ end_lblk = end >> inode->i_blkbits;
+
+ if (end_lblk > start_lblk) {
+ ext4_lblk_t hole_len = end_lblk - start_lblk;
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block, hole_len);
+ ext4_es_remove_extent(inode, start_lblk, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_remove_space(inode, first_block,
- stop_block - 1);
+ ret = ext4_ext_remove_space(inode, start_lblk,
+ end_lblk - 1);
else
- ret = ext4_ind_remove_space(handle, inode, first_block,
- stop_block);
+ ret = ext4_ind_remove_space(handle, inode, start_lblk,
+ end_lblk);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_handle;
+ }
- ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
EXTENT_STATUS_HOLE, 0);
up_write(&EXT4_I(inode)->i_data_sem);
}
- ext4_fc_track_range(handle, inode, first_block, stop_block);
+ ext4_fc_track_range(handle, inode, start_lblk, end_lblk);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ if (unlikely(ret))
+ goto out_handle;
+
+ ext4_update_inode_fsync_trans(handle, inode, 1);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- ret2 = ext4_mark_inode_dirty(handle, inode);
- if (unlikely(ret2))
- ret = ret2;
- if (ret >= 0)
- ext4_update_inode_fsync_trans(handle, inode, 1);
-out_stop:
+out_handle:
ext4_journal_stop(handle);
-out_dio:
- filemap_invalidate_unlock(mapping);
-out_mutex:
- inode_unlock(inode);
return ret;
}
@@ -4183,7 +4607,7 @@ int ext4_truncate(struct inode *inode)
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_writepage_trans_blocks(inode);
+ credits = ext4_chunk_trans_extent(inode, 1);
else
credits = ext4_blocks_for_truncate(inode);
@@ -4209,8 +4633,10 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
- down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4674,6 +5100,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
int err;
+ err = xattr_check_inode(inode, IHDR(inode, raw_inode),
+ ITAIL(inode, raw_inode));
+ if (err)
+ return err;
+
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
err = ext4_find_inline_data_nolock(inode);
if (!err && ext4_has_inline_data(inode))
@@ -4705,22 +5136,76 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
inode_set_iversion_queried(inode, val);
}
-static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
-
+static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
+ const char *function, unsigned int line)
{
+ const char *err_str;
+
if (flags & EXT4_IGET_EA_INODE) {
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "missing EA_INODE flag";
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ err_str = "missing EA_INODE flag";
+ goto error;
+ }
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
- EXT4_I(inode)->i_file_acl)
- return "ea_inode with extended attributes";
+ EXT4_I(inode)->i_file_acl) {
+ err_str = "ea_inode with extended attributes";
+ goto error;
+ }
} else {
- if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
- return "unexpected EA_INODE flag";
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
+ /*
+ * open_by_handle_at() could provide an old inode number
+ * that has since been reused for an ea_inode; this does
+ * not indicate filesystem corruption
+ */
+ if (flags & EXT4_IGET_HANDLE)
+ return -ESTALE;
+ err_str = "unexpected EA_INODE flag";
+ goto error;
+ }
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
+ err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
+ goto error;
}
- if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
- return "unexpected bad inode w/o EXT4_IGET_BAD";
- return NULL;
+ return 0;
+
+error:
+ ext4_error_inode(inode, function, line, 0, "%s", err_str);
+ return -EFSCORRUPTED;
+}
+
+static bool ext4_should_enable_large_folio(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_encrypt(sb))
+ return false;
+
+ return true;
+}
+
+/*
+ * Limit the maximum folio order to 2048 blocks to prevent overestimation
+ * of reserve handle credits during the folio writeback in environments
+ * where the PAGE_SIZE exceeds 4KB.
+ */
+#define EXT4_MAX_PAGECACHE_ORDER(i) \
+ umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
+void ext4_set_inode_mapping_order(struct inode *inode)
+{
+ if (!ext4_should_enable_large_folio(inode))
+ return;
+
+ mapping_set_folio_order_range(inode->i_mapping, 0,
+ EXT4_MAX_PAGECACHE_ORDER(inode));
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -4732,7 +5217,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode_info *ei;
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
struct inode *inode;
- const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4741,12 +5225,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
gid_t i_gid;
projid_t i_projid;
- if ((!(flags & EXT4_IGET_SPECIAL) &&
- ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
- ino == le32_to_cpu(es->s_usr_quota_inum) ||
- ino == le32_to_cpu(es->s_grp_quota_inum) ||
- ino == le32_to_cpu(es->s_prj_quota_inum) ||
- ino == le32_to_cpu(es->s_orphan_file_inum))) ||
+ if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
(ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
@@ -4761,10 +5240,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (!inode)
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW)) {
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
+ ret = check_igot_inode(inode, flags, function, line);
+ if (ret) {
iput(inode);
- return ERR_PTR(-EFSCORRUPTED);
+ return ERR_PTR(ret);
}
return inode;
}
@@ -4800,15 +5279,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_extra_isize = 0;
/* Precompute checksum seed for inode metadata */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = raw_inode->i_generation;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
@@ -4876,7 +5354,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -4887,7 +5366,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* we'd normally treat htree data as empty space. But with metadata
* checksumming that corrupts checksums so forbid that.
*/
- if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ if (!ext4_has_feature_dir_index(sb) &&
+ ext4_has_feature_metadata_csum(sb) &&
ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
ext4_error_inode(inode, function, line, 0,
"iget: Dir with htree data on filesystem without dir_index feature.");
@@ -5007,8 +5487,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode->i_op = &ext4_encrypted_symlink_inode_operations;
} else if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- nd_terminate_link(ei->i_data, inode->i_size,
- sizeof(ei->i_data) - 1);
+ if (inode->i_size == 0 ||
+ inode->i_size >= sizeof(ei->i_data) ||
+ strnlen((char *)ei->i_data, inode->i_size + 1) !=
+ inode->i_size) {
+ ext4_error_inode(inode, function, line, 0,
+ "invalid fast symlink length %llu",
+ (unsigned long long)inode->i_size);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
inode_set_cached_link(inode, (char *)ei->i_data,
inode->i_size);
} else {
@@ -5037,13 +5525,24 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
- if ((err_str = check_igot_inode(inode, flags)) != NULL) {
- ext4_error_inode(inode, function, line, 0, err_str);
- ret = -EFSCORRUPTED;
- goto bad_inode;
- }
+ ext4_set_inode_mapping_order(inode);
+
+ ret = check_igot_inode(inode, flags, function, line);
+ /*
+ * -ESTALE here means there is nothing inherently wrong with the inode,
+ * it's just not an inode we can return for an fhandle lookup.
+ */
+ if (ret == -ESTALE) {
+ brelse(iloc.bh);
+ unlock_new_inode(inode);
+ iput(inode);
+ return ERR_PTR(-ESTALE);
+ }
+ if (ret)
+ goto bad_inode;
brelse(iloc.bh);
+
unlock_new_inode(inode);
return inode;
@@ -5228,8 +5727,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
if (EXT4_SB(inode->i_sb)->s_journal) {
if (ext4_journal_current_handle()) {
@@ -5351,8 +5851,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ error = ext4_emergency_state(inode->i_sb);
+ if (unlikely(error))
+ return error;
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -5464,7 +5965,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
oldsize & (inode->i_sb->s_blocksize - 1)) {
error = ext4_inode_attach_jinode(inode);
if (error)
- goto err_out;
+ goto out_mmap_sem;
}
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
@@ -5505,9 +6006,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
down_write(&EXT4_I(inode)->i_data_sem);
old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
@@ -5518,6 +6017,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
else
EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
@@ -5633,7 +6135,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
awu_max = sbi->s_awu_max;
}
- generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
@@ -5714,8 +6216,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents)
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5723,13 +6224,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int ret;
/*
- * How many index blocks need to touch to map @lblocks logical blocks
- * to @pextents physical extents?
+ * How many index and leaf blocks need to touch to map @lblocks
+ * logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
- ret = idxblocks;
-
/*
* Now let's see how many group bitmaps and group descriptors need
* to account
@@ -5742,7 +6241,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
/* bitmaps and block group descriptor blocks */
- ret += groups + gdpblocks;
+ ret = idxblocks + groups + gdpblocks;
/* Blocks for super block, inode, quota and xattr blocks */
ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
@@ -5751,25 +6250,19 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
}
/*
- * Calculate the total number of credits to reserve to fit
- * the modification of a single pages into a single transaction,
- * which may include multiple chunks of block allocations.
- *
- * This could be called via ext4_write_begin()
- *
- * We need to consider the worse case, when
- * one new block per extent.
+ * Calculate the journal credits for modifying the number of blocks
+ * in a single extent within one transaction. 'nrblocks' is used only
+ * for non-extent inodes. For extent type inodes, 'nrblocks' can be
+ * zero if the exact number of blocks is unknown.
*/
-int ext4_writepage_trans_blocks(struct inode *inode)
+int ext4_chunk_trans_extent(struct inode *inode, int nrblocks)
{
- int bpp = ext4_journal_blocks_per_page(inode);
int ret;
- ret = ext4_meta_trans_blocks(inode, bpp, bpp);
-
+ ret = ext4_meta_trans_blocks(inode, nrblocks, 1);
/* Account for data blocks for journalled mode */
if (ext4_should_journal_data(inode))
- ret += bpp;
+ ret += nrblocks;
return ret;
}
@@ -5796,9 +6289,10 @@ int ext4_mark_iloc_dirty(handle_t *handle,
{
int err = 0;
- if (unlikely(ext4_forced_shutdown(inode->i_sb))) {
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err)) {
put_bh(iloc->bh);
- return -EIO;
+ return err;
}
ext4_fc_track_inode(handle, inode);
@@ -5822,8 +6316,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
{
int err;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(inode->i_sb);
+ if (unlikely(err))
+ return err;
err = ext4_get_inode_loc(inode, iloc);
if (!err) {
@@ -5834,6 +6329,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
@@ -6138,6 +6634,55 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
return !buffer_mapped(bh);
}
+static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
+ get_block_t get_block)
+{
+ handle_t *handle;
+ loff_t size;
+ unsigned long len;
+ int credits;
+ int ret;
+
+ credits = ext4_chunk_trans_extent(inode,
+ ext4_journal_blocks_per_folio(inode));
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ folio_lock(folio);
+ size = i_size_read(inode);
+ /* Page got truncated from under us? */
+ if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) {
+ ret = -EFAULT;
+ goto out_error;
+ }
+
+ len = folio_size(folio);
+ if (folio_pos(folio) + len > size)
+ len = size - folio_pos(folio);
+
+ ret = ext4_block_write_begin(handle, folio, 0, len, get_block);
+ if (ret)
+ goto out_error;
+
+ if (!ext4_should_journal_data(inode)) {
+ block_commit_write(folio, 0, len);
+ folio_mark_dirty(folio);
+ } else {
+ ret = ext4_journal_folio_buffers(handle, folio, len);
+ if (ret)
+ goto out_error;
+ }
+ ext4_journal_stop(handle);
+ folio_wait_stable(folio);
+ return ret;
+
+out_error:
+ folio_unlock(folio);
+ ext4_journal_stop(handle);
+ return ret;
+}
+
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6149,8 +6694,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
struct file *file = vma->vm_file;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
- handle_t *handle;
- get_block_t *get_block;
+ get_block_t *get_block = ext4_get_block;
int retries = 0;
if (unlikely(IS_IMMUTABLE(inode)))
@@ -6218,47 +6762,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
/* OK, we need to fill the hole... */
if (ext4_should_dioread_nolock(inode))
get_block = ext4_get_block_unwritten;
- else
- get_block = ext4_get_block;
retry_alloc:
- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
- ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = VM_FAULT_SIGBUS;
- goto out;
- }
- /*
- * Data journalling can't use block_page_mkwrite() because it
- * will set_buffer_dirty() before do_journal_get_write_access()
- * thus might hit warning messages for dirty metadata buffers.
- */
- if (!ext4_should_journal_data(inode)) {
- err = block_page_mkwrite(vma, vmf, get_block);
- } else {
- folio_lock(folio);
- size = i_size_read(inode);
- /* Page got truncated from under us? */
- if (folio->mapping != mapping || folio_pos(folio) > size) {
- ret = VM_FAULT_NOPAGE;
- goto out_error;
- }
-
- len = folio_size(folio);
- if (folio_pos(folio) + len > size)
- len = size - folio_pos(folio);
-
- err = ext4_block_write_begin(handle, folio, 0, len,
- ext4_get_block);
- if (!err) {
- ret = VM_FAULT_SIGBUS;
- if (ext4_journal_folio_buffers(handle, folio, len))
- goto out_error;
- } else {
- folio_unlock(folio);
- }
- }
- ext4_journal_stop(handle);
- if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ /* Start journal and allocate blocks */
+ err = ext4_block_page_mkwrite(inode, folio, get_block);
+ if (err == -EAGAIN ||
+ (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)))
goto retry_alloc;
out_ret:
ret = vmf_fs_error(err);
@@ -6266,8 +6774,4 @@ out:
filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb);
return ret;
-out_error:
- folio_unlock(folio);
- ext4_journal_stop(handle);
- goto out;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7b9ce71c1c81..84e3c73952d7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -142,16 +142,16 @@ static int ext4_update_backup_sb(struct super_block *sb,
es = (struct ext4_super_block *) (bh->b_data + offset);
lock_buffer(bh);
- if (ext4_has_metadata_csum(sb) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ if (ext4_has_feature_metadata_csum(sb) &&
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
"superblock %llu", sb_block);
unlock_buffer(bh);
goto out_bh;
}
func(es, arg);
- if (ext4_has_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -351,11 +351,11 @@ void ext4_reset_inode_seed(struct inode *inode)
__le32 gen = cpu_to_le32(inode->i_generation);
__u32 csum;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
/*
@@ -980,7 +980,7 @@ group_add_out:
return err;
}
-int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -997,7 +997,7 @@ int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ext4_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 flags = fa->flags;
@@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp,
* If any checksums (group descriptors or metadata) are being used
* then the checksum seed feature is required to change the UUID.
*/
- if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb))
+ if (((ext4_has_feature_gdt_csum(sb) ||
+ ext4_has_feature_metadata_csum(sb))
&& !ext4_has_feature_csum_seed(sb))
|| ext4_has_feature_stable_inodes(sb))
return -EOPNOTSUPP;
@@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!inode_owner_or_capable(idmap, inode))
return -EPERM;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
ext4_warning(sb, "Setting inode version is not "
"supported with metadata_csum enabled.");
return -ENOTTY;
@@ -1504,8 +1505,14 @@ resizefs_out:
return 0;
}
case EXT4_IOC_PRECACHE_EXTENTS:
- return ext4_ext_precache(inode);
+ {
+ int ret;
+ inode_lock_shared(inode);
+ ret = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
+ return ret;
+ }
case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
@@ -1705,7 +1712,7 @@ int ext4_update_overhead(struct super_block *sb, bool force)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (sb_rdonly(sb))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb))
return 0;
if (!force &&
(sbi->s_overhead == 0 ||
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index bb2a223b207c..a9416b20ff64 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -155,6 +155,7 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
bgl_lock_init(sbi->s_blockgroup_lock);
sbi->s_es = &fsb->es;
+ sbi->s_sb = sb;
sb->s_fs_info = sbi;
up_write(&sb->s_umount);
@@ -796,11 +797,14 @@ static void test_mb_mark_used(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
for (i = 0; i < TEST_RANGE_COUNT; i++)
test_mb_mark_used_range(test, &e4b, ranges[i].start,
@@ -860,6 +864,7 @@ static void test_mb_free_blocks(struct kunit *test)
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp);
ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
KUNIT_ASSERT_EQ(test, ret, 0);
@@ -873,6 +878,8 @@ static void test_mb_free_blocks(struct kunit *test)
ext4_unlock_group(sb, TEST_GOAL_GROUP);
grp->bb_free = 0;
+ grp->bb_largest_free_order = -1;
+ grp->bb_avg_fragment_size_order = -1;
memset(bitmap, 0xff, sb->s_blocksize);
mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b25a27c86696..5898d92ba19f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -132,25 +132,30 @@
* If "mb_optimize_scan" mount option is set, we maintain in memory group info
* structures in two data structures:
*
- * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders)
+ * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders)
*
- * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks)
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
*
- * This is an array of lists where the index in the array represents the
+ * This is an array of xarrays where the index in the array represents the
* largest free order in the buddy bitmap of the participating group infos of
- * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total
- * number of buddy bitmap orders possible) number of lists. Group-infos are
- * placed in appropriate lists.
+ * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total
+ * number of buddy bitmap orders possible) number of xarrays. Group-infos are
+ * placed in appropriate xarrays.
*
- * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size)
+ * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size)
*
- * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks)
+ * Locking: Writers use xa_lock, readers use rcu_read_lock.
*
- * This is an array of lists where in the i-th list there are groups with
+ * This is an array of xarrays where in the i-th xarray there are groups with
* average fragment size >= 2^i and < 2^(i+1). The average fragment size
* is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments.
- * Note that we don't bother with a special list for completely empty groups
- * so we only have MB_NUM_ORDERS(sb) lists.
+ * Note that we don't bother with a special xarray for completely empty
+ * groups so we only have MB_NUM_ORDERS(sb) xarrays. Group-infos are placed
+ * in appropriate xarrays.
+ *
+ * In xarray, the index is the block group number, the value is the block group
+ * information, and a non-empty value indicates the block group is present in
+ * the current xarray.
*
* When "mb_optimize_scan" mount option is set, mballoc consults the above data
* structures to decide the order in which groups are to be traversed for
@@ -187,7 +192,7 @@
* /sys/fs/ext4/<partition>/mb_min_to_scan
* /sys/fs/ext4/<partition>/mb_max_to_scan
* /sys/fs/ext4/<partition>/mb_order2_req
- * /sys/fs/ext4/<partition>/mb_linear_limit
+ * /sys/fs/ext4/<partition>/mb_max_linear_groups
*
* The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
@@ -209,7 +214,7 @@
* get traversed linearly. That may result in subsequent allocations being not
* close to each other. And so, the underlying device may get filled up in a
* non-linear fashion. While that may not matter on non-rotational devices, for
- * rotational devices that may result in higher seek times. "mb_linear_limit"
+ * rotational devices that may result in higher seek times. "mb_max_linear_groups"
* tells mballoc how many groups mballoc should search linearly before
* performing consulting above data structures for more efficient lookups. For
* non rotational devices, this value defaults to 0 and for rotational devices
@@ -420,8 +425,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
-static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, enum criteria cr);
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group);
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
@@ -841,132 +846,161 @@ static void
mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int new_order;
+ int new, old;
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN))
return;
- new_order = mb_avg_fragment_size_order(sb,
- grp->bb_free / grp->bb_fragments);
- if (new_order == grp->bb_avg_fragment_size_order)
+ old = grp->bb_avg_fragment_size_order;
+ new = grp->bb_fragments == 0 ? -1 :
+ mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments);
+ if (new == old)
return;
- if (grp->bb_avg_fragment_size_order != -1) {
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_del(&grp->bb_avg_fragment_size_node);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+ if (old >= 0)
+ xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group);
+
+ grp->bb_avg_fragment_size_order = new;
+ if (new >= 0) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fails, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_avg_fragment_size[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d",
+ grp->bb_group, new, err);
+ }
+}
+
+static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
+ struct xarray *xa,
+ ext4_group_t start, ext4_group_t end)
+{
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ enum criteria cr = ac->ac_criteria;
+ ext4_group_t ngroups = ext4_get_groups_count(sb);
+ unsigned long group = start;
+ struct ext4_group_info *grp;
+
+ if (WARN_ON_ONCE(end > ngroups || start >= end))
+ return 0;
+
+ xa_for_each_range(xa, group, grp, start, end - 1) {
+ int err;
+
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
+
+ err = ext4_mb_scan_group(ac, grp->bb_group);
+ if (err || ac->ac_status != AC_STATUS_CONTINUE)
+ return err;
+
+ cond_resched();
}
- grp->bb_avg_fragment_size_order = new_order;
- write_lock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
- list_add_tail(&grp->bb_avg_fragment_size_node,
- &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]);
- write_unlock(&sbi->s_mb_avg_fragment_size_locks[
- grp->bb_avg_fragment_size_order]);
+
+ return 0;
+}
+
+/*
+ * Find a suitable group of given order from the largest free orders xarray.
+ */
+static inline int
+ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
+{
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
+
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
}
/*
* Choose next group by traversing largest_free_order lists. Updates *new_cr if
* cr level needs an update.
*/
-static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *iter;
int i;
+ int ret = 0;
+ ext4_group_t start, end;
- if (ac->ac_status == AC_STATUS_FOUND)
- return;
-
- if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
- atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
-
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- if (list_empty(&sbi->s_mb_largest_free_orders[i]))
- continue;
- read_lock(&sbi->s_mb_largest_free_orders_locks[i]);
- if (list_empty(&sbi->s_mb_largest_free_orders[i])) {
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- continue;
- }
- list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
- bb_largest_free_order_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
- *group = iter->bb_group;
- ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
- return;
- }
- }
- read_unlock(&sbi->s_mb_largest_free_orders_locks[i]);
+ ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+
/* Increment cr and search again if no group is found */
- *new_cr = CR_GOAL_LEN_FAST;
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
+ return ret;
}
/*
- * Find a suitable group of given order from the average fragments list.
+ * Find a suitable group of given order from the average fragments xarray.
*/
-static struct ext4_group_info *
-ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
+static int
+ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac,
+ int order, ext4_group_t start,
+ ext4_group_t end)
{
- struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
- rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
- struct ext4_group_info *grp = NULL, *iter;
- enum criteria cr = ac->ac_criteria;
+ struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
- if (list_empty(frag_list))
- return NULL;
- read_lock(frag_list_lock);
- if (list_empty(frag_list)) {
- read_unlock(frag_list_lock);
- return NULL;
- }
- list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
- grp = iter;
- break;
- }
- }
- read_unlock(frag_list_lock);
- return grp;
+ if (xa_empty(xa))
+ return 0;
+
+ return ext4_mb_scan_groups_xa_range(ac, xa, start, end);
}
/*
* Choose next group by traversing average fragment size list of suitable
* order. Updates *new_cr if cr level needs an update.
*/
-static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL;
- int i;
-
- if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
- if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
+ int i, ret = 0;
+ ext4_group_t start, end;
+
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
+ i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
+ for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
}
-
- for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
- i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
- return;
- }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
/*
* CR_BEST_AVAIL_LEN works based on the concept that we have
* a larger normalized goal len request which can be trimmed to
@@ -976,9 +1010,11 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA).
*/
if (ac->ac_flags & EXT4_MB_HINT_DATA)
- *new_cr = CR_BEST_AVAIL_LEN;
+ ac->ac_criteria = CR_BEST_AVAIL_LEN;
else
- *new_cr = CR_GOAL_LEN_SLOW;
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
}
/*
@@ -990,18 +1026,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* preallocations. However, we make sure that we don't trim the request too
* much and fall to CR_GOAL_LEN_SLOW in that case.
*/
-static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group)
+static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
+ ext4_group_t group)
{
+ int ret = 0;
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL;
int i, order, min_order;
unsigned long num_stripe_clusters = 0;
-
- if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
- if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
- }
+ ext4_group_t start, end;
/*
* mb_avg_fragment_size_order() returns order in a way that makes
@@ -1033,6 +1065,9 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
if (1 << min_order < ac->ac_o_ex.fe_len)
min_order = fls(ac->ac_o_ex.fe_len);
+ start = group;
+ end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
for (i = order; i >= min_order; i--) {
int frag_order;
/*
@@ -1055,17 +1090,24 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context
frag_order = mb_avg_fragment_size_order(ac->ac_sb,
ac->ac_g_ex.fe_len);
- grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
- if (grp) {
- *group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
- return;
- }
+ ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
+ start, end);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ }
+ if (start) {
+ end = start;
+ start = 0;
+ goto wrap_around;
}
/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
- *new_cr = CR_GOAL_LEN_SLOW;
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
+ ac->ac_criteria = CR_GOAL_LEN_SLOW;
+
+ return ret;
}
static inline int should_optimize_scan(struct ext4_allocation_context *ac)
@@ -1080,59 +1122,82 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
}
/*
- * Return next linear group for allocation.
+ * next linear group for allocation.
*/
-static ext4_group_t
-next_linear_group(ext4_group_t group, ext4_group_t ngroups)
+static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups)
{
/*
* Artificially restricted ngroups for non-extent
* files makes group > ngroups possible on first loop.
*/
- return group + 1 >= ngroups ? 0 : group + 1;
+ *group = *group + 1 >= ngroups ? 0 : *group + 1;
}
-/*
- * ext4_mb_choose_next_group: choose next group for allocation.
- *
- * @ac Allocation Context
- * @new_cr This is an output parameter. If the there is no good group
- * available at current CR level, this field is updated to indicate
- * the new cr level that should be used.
- * @group This is an input / output parameter. As an input it indicates the
- * next group that the allocator intends to use for allocation. As
- * output, this field indicates the next group that should be used as
- * determined by the optimization functions.
- * @ngroups Total number of groups
- */
-static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
+ ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count)
{
- *new_cr = ac->ac_criteria;
+ int ret, i;
+ enum criteria cr = ac->ac_criteria;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group = *start;
- if (!should_optimize_scan(ac)) {
- *group = next_linear_group(*group, ngroups);
- return;
+ for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) {
+ ret = ext4_mb_scan_group(ac, group);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
+ cond_resched();
}
+ *start = group;
+ if (count == ngroups)
+ ac->ac_criteria++;
+
+ /* Processed all groups and haven't found blocks */
+ if (sbi->s_mb_stats && i == ngroups)
+ atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+ return 0;
+}
+
+static int ext4_mb_scan_groups(struct ext4_allocation_context *ac)
+{
+ int ret = 0;
+ ext4_group_t start;
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb);
+
+ /* non-extent files are limited to low blocks/groups */
+ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
+ ngroups = sbi->s_blockfile_groups;
+
+ /* searching for the right group start from the goal value specified */
+ start = ac->ac_g_ex.fe_group;
+ ac->ac_prefetch_grp = start;
+ ac->ac_prefetch_nr = 0;
+
+ if (!should_optimize_scan(ac))
+ return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups);
+
/*
* Optimized scanning can return non adjacent groups which can cause
* seek overhead for rotational disks. So try few linear groups before
* trying optimized scan.
*/
- if (ac->ac_groups_linear_remaining) {
- *group = next_linear_group(*group, ngroups);
- ac->ac_groups_linear_remaining--;
- return;
- }
+ if (sbi->s_mb_max_linear_groups)
+ ret = ext4_mb_scan_groups_linear(ac, ngroups, &start,
+ sbi->s_mb_max_linear_groups);
+ if (ret || ac->ac_status != AC_STATUS_CONTINUE)
+ return ret;
- if (*new_cr == CR_POWER2_ALIGNED) {
- ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
- } else if (*new_cr == CR_GOAL_LEN_FAST) {
- ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
- } else if (*new_cr == CR_BEST_AVAIL_LEN) {
- ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
- } else {
+ switch (ac->ac_criteria) {
+ case CR_POWER2_ALIGNED:
+ return ext4_mb_scan_groups_p2_aligned(ac, start);
+ case CR_GOAL_LEN_FAST:
+ return ext4_mb_scan_groups_goal_fast(ac, start);
+ case CR_BEST_AVAIL_LEN:
+ return ext4_mb_scan_groups_best_avail(ac, start);
+ default:
/*
* TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an
* rb tree sorted by bb_free. But until that happens, we should
@@ -1140,6 +1205,8 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
*/
WARN_ON(1);
}
+
+ return 0;
}
/*
@@ -1150,33 +1217,35 @@ static void
mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int i;
+ int new, old = grp->bb_largest_free_order;
- for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--)
- if (grp->bb_counters[i] > 0)
+ for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--)
+ if (grp->bb_counters[new] > 0)
break;
+
/* No need to move between order lists? */
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) ||
- i == grp->bb_largest_free_order) {
- grp->bb_largest_free_order = i;
+ if (new == old)
return;
- }
- if (grp->bb_largest_free_order >= 0) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
- list_del_init(&grp->bb_largest_free_order_node);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+ if (old >= 0) {
+ struct xarray *xa = &sbi->s_mb_largest_free_orders[old];
+
+ if (!xa_empty(xa) && xa_load(xa, grp->bb_group))
+ xa_erase(xa, grp->bb_group);
}
- grp->bb_largest_free_order = i;
- if (grp->bb_largest_free_order >= 0 && grp->bb_free) {
- write_lock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
- list_add_tail(&grp->bb_largest_free_order_node,
- &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]);
- write_unlock(&sbi->s_mb_largest_free_orders_locks[
- grp->bb_largest_free_order]);
+
+ grp->bb_largest_free_order = new;
+ if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) {
+ /*
+ * Cannot use __GFP_NOFAIL because we hold the group lock.
+ * Although allocation for insertion may fails, it's not fatal
+ * as we have linear traversal to fall back on.
+ */
+ int err = xa_insert(&sbi->s_mb_largest_free_orders[new],
+ grp->bb_group, grp, GFP_ATOMIC);
+ if (err)
+ mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d",
+ grp->bb_group, new, err);
}
}
@@ -2167,11 +2236,11 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- spin_lock(&sbi->s_md_lock);
- sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
- sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
}
+
/*
* As we've just preallocated more space than
* user requested originally, we store allocated
@@ -2571,6 +2640,30 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
}
}
+static void __ext4_mb_scan_group(struct ext4_allocation_context *ac)
+{
+ bool is_stripe_aligned;
+ struct ext4_sb_info *sbi;
+ enum criteria cr = ac->ac_criteria;
+
+ ac->ac_groups_scanned++;
+ if (cr == CR_POWER2_ALIGNED)
+ return ext4_mb_simple_scan_group(ac, ac->ac_e4b);
+
+ sbi = EXT4_SB(ac->ac_sb);
+ is_stripe_aligned = false;
+ if ((sbi->s_stripe >= sbi->s_cluster_ratio) &&
+ !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)))
+ is_stripe_aligned = true;
+
+ if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) &&
+ is_stripe_aligned)
+ ext4_mb_scan_aligned(ac, ac->ac_e4b);
+
+ if (ac->ac_status == AC_STATUS_CONTINUE)
+ ext4_mb_complex_scan_group(ac, ac->ac_e4b);
+}
+
/*
* This is also called BEFORE we load the buddy bitmap.
* Returns either 1 or 0 indicating that the group is either suitable
@@ -2761,6 +2854,37 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
}
/*
+ * Batch reads of the block allocation bitmaps to get
+ * multiple READs in flight; limit prefetching at inexpensive
+ * CR, otherwise mballoc can spend a lot of time loading
+ * imperfect groups
+ */
+static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ struct ext4_sb_info *sbi;
+
+ if (ac->ac_prefetch_grp != group)
+ return;
+
+ sbi = EXT4_SB(ac->ac_sb);
+ if (ext4_mb_cr_expensive(ac->ac_criteria) ||
+ ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) {
+ unsigned int nr = sbi->s_mb_prefetch;
+
+ if (ext4_has_feature_flex_bg(ac->ac_sb)) {
+ nr = 1 << sbi->s_log_groups_per_flex;
+ nr -= group & (nr - 1);
+ nr = umin(nr, sbi->s_mb_prefetch);
+ }
+
+ ac->ac_prefetch_nr = nr;
+ ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr,
+ &ac->ac_prefetch_ios);
+ }
+}
+
+/*
* Prefetching reads the block bitmap into the buffer cache; but we
* need to make sure that the buddy bitmap in the page cache has been
* initialized. Note that ext4_mb_init_group() will block if the I/O
@@ -2793,24 +2917,58 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
}
}
+static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
+ ext4_group_t group)
+{
+ int ret;
+ struct super_block *sb = ac->ac_sb;
+ enum criteria cr = ac->ac_criteria;
+
+ ext4_mb_might_prefetch(ac, group);
+
+ /* prevent unnecessary buddy loading. */
+ if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group)))
+ return 0;
+
+ /* This now checks without needing the buddy page */
+ ret = ext4_mb_good_group_nolock(ac, group, cr);
+ if (ret <= 0) {
+ if (!ac->ac_first_err)
+ ac->ac_first_err = ret;
+ return 0;
+ }
+
+ ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b);
+ if (ret)
+ return ret;
+
+ /* skip busy group */
+ if (cr >= CR_ANY_FREE)
+ ext4_lock_group(sb, group);
+ else if (!ext4_try_lock_group(sb, group))
+ goto out_unload;
+
+ /* We need to check again after locking the block group. */
+ if (unlikely(!ext4_mb_good_group(ac, group, cr)))
+ goto out_unlock;
+
+ __ext4_mb_scan_group(ac);
+
+out_unlock:
+ ext4_unlock_group(sb, group);
+out_unload:
+ ext4_mb_unload_buddy(ac->ac_e4b);
+ return ret;
+}
+
static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
- ext4_group_t prefetch_grp = 0, ngroups, group, i;
- enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
- int err = 0, first_err = 0;
- unsigned int nr = 0, prefetch_ios = 0;
- struct ext4_sb_info *sbi;
- struct super_block *sb;
+ ext4_group_t i;
+ int err = 0;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_buddy e4b;
- int lost;
-
- sb = ac->ac_sb;
- sbi = EXT4_SB(sb);
- ngroups = ext4_get_groups_count(sb);
- /* non-extent files are limited to low blocks/groups */
- if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
- ngroups = sbi->s_blockfile_groups;
BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2844,11 +3002,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/* if stream allocation is enabled, use global goal */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- /* TBD: may be hot point */
- spin_lock(&sbi->s_md_lock);
- ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
- ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
- spin_unlock(&sbi->s_md_lock);
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]);
+ ac->ac_g_ex.fe_start = -1;
+ ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL;
}
/*
@@ -2856,107 +3014,21 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
* start with CR_GOAL_LEN_FAST, unless it is power of 2
* aligned, in which case let's do that faster approach first.
*/
+ ac->ac_criteria = CR_GOAL_LEN_FAST;
if (ac->ac_2order)
- cr = CR_POWER2_ALIGNED;
-repeat:
- for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
- ac->ac_criteria = cr;
- /*
- * searching for the right group start
- * from the goal value specified
- */
- group = ac->ac_g_ex.fe_group;
- ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups;
- prefetch_grp = group;
- nr = 0;
-
- for (i = 0, new_cr = cr; i < ngroups; i++,
- ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) {
- int ret = 0;
-
- cond_resched();
- if (new_cr != cr) {
- cr = new_cr;
- goto repeat;
- }
-
- /*
- * Batch reads of the block allocation bitmaps
- * to get multiple READs in flight; limit
- * prefetching at inexpensive CR, otherwise mballoc
- * can spend a lot of time loading imperfect groups
- */
- if ((prefetch_grp == group) &&
- (ext4_mb_cr_expensive(cr) ||
- prefetch_ios < sbi->s_mb_prefetch_limit)) {
- nr = sbi->s_mb_prefetch;
- if (ext4_has_feature_flex_bg(sb)) {
- nr = 1 << sbi->s_log_groups_per_flex;
- nr -= group & (nr - 1);
- nr = min(nr, sbi->s_mb_prefetch);
- }
- prefetch_grp = ext4_mb_prefetch(sb, group,
- nr, &prefetch_ios);
- }
-
- /* This now checks without needing the buddy page */
- ret = ext4_mb_good_group_nolock(ac, group, cr);
- if (ret <= 0) {
- if (!first_err)
- first_err = ret;
- continue;
- }
-
- err = ext4_mb_load_buddy(sb, group, &e4b);
- if (err)
- goto out;
-
- ext4_lock_group(sb, group);
-
- /*
- * We need to check again after locking the
- * block group
- */
- ret = ext4_mb_good_group(ac, group, cr);
- if (ret == 0) {
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
- continue;
- }
+ ac->ac_criteria = CR_POWER2_ALIGNED;
- ac->ac_groups_scanned++;
- if (cr == CR_POWER2_ALIGNED)
- ext4_mb_simple_scan_group(ac, &e4b);
- else {
- bool is_stripe_aligned =
- (sbi->s_stripe >=
- sbi->s_cluster_ratio) &&
- !(ac->ac_g_ex.fe_len %
- EXT4_NUM_B2C(sbi, sbi->s_stripe));
-
- if ((cr == CR_GOAL_LEN_FAST ||
- cr == CR_BEST_AVAIL_LEN) &&
- is_stripe_aligned)
- ext4_mb_scan_aligned(ac, &e4b);
-
- if (ac->ac_status == AC_STATUS_CONTINUE)
- ext4_mb_complex_scan_group(ac, &e4b);
- }
-
- ext4_unlock_group(sb, group);
- ext4_mb_unload_buddy(&e4b);
-
- if (ac->ac_status != AC_STATUS_CONTINUE)
- break;
- }
- /* Processed all groups and haven't found blocks */
- if (sbi->s_mb_stats && i == ngroups)
- atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+ ac->ac_e4b = &e4b;
+ ac->ac_prefetch_ios = 0;
+ ac->ac_first_err = 0;
+repeat:
+ while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
+ err = ext4_mb_scan_groups(ac);
+ if (err)
+ goto out;
- if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
- /* Reset goal length to original goal length before
- * falling into CR_GOAL_LEN_SLOW */
- ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2967,6 +3039,8 @@ repeat:
*/
ext4_mb_try_best_found(ac, &e4b);
if (ac->ac_status != AC_STATUS_FOUND) {
+ int lost;
+
/*
* Someone more lucky has already allocated it.
* The only thing we can do is just take first
@@ -2982,23 +3056,27 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
- cr = CR_ANY_FREE;
+ ac->ac_criteria = CR_ANY_FREE;
goto repeat;
}
}
- if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND)
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
+ ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
+ atomic_inc(&sbi->s_bal_stream_goals);
+ }
out:
- if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
- err = first_err;
+ if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
+ err = ac->ac_first_err;
mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
- ac->ac_flags, cr, err);
+ ac->ac_flags, ac->ac_criteria, err);
- if (nr)
- ext4_mb_prefetch_fini(sb, prefetch_grp, nr);
+ if (ac->ac_prefetch_nr)
+ ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
return err;
}
@@ -3037,10 +3115,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
unsigned char blocksize_bits = min_t(unsigned char,
sb->s_blocksize_bits,
EXT4_MAX_BLOCK_LOG_SIZE);
- struct sg {
- struct ext4_group_info info;
- ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
- } sg;
+ DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters,
+ EXT4_MAX_BLOCK_LOG_SIZE + 2);
group--;
if (group == 0)
@@ -3048,7 +3124,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
" 2^0 2^1 2^2 2^3 2^4 2^5 2^6 "
" 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n");
- i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
+ i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) +
sizeof(struct ext4_group_info);
grinfo = ext4_get_group_info(sb, group);
@@ -3068,14 +3144,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
* We care only about free space counters in the group info and
* these are safe to access even after the buddy has been unloaded
*/
- memcpy(&sg, grinfo, i);
- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
- sg.info.bb_fragments, sg.info.bb_first_free);
+ memcpy(sg, grinfo, i);
+ seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free,
+ sg->bb_fragments, sg->bb_first_free);
for (i = 0; i <= 13; i++)
seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
- sg.info.bb_counters[i] : 0);
+ sg->bb_counters[i] : 0);
seq_puts(seq, " ]");
- if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg))
seq_puts(seq, " Block bitmap corrupted!");
seq_putc(seq, '\n');
return 0;
@@ -3123,8 +3199,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
/* CR_GOAL_LEN_FAST stats */
seq_puts(seq, "\tcr_goal_fast_stats:\n");
@@ -3137,8 +3211,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
/* CR_BEST_AVAIL_LEN stats */
seq_puts(seq, "\tcr_best_avail_stats:\n");
@@ -3152,8 +3224,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
- seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
/* CR_GOAL_LEN_SLOW stats */
seq_puts(seq, "\tcr_goal_slow_stats:\n");
@@ -3183,6 +3253,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_printf(seq, "\textents_scanned: %u\n",
atomic_read(&sbi->s_bal_ex_scanned));
seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\tstream_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_stream_goals));
seq_printf(seq, "\t\tlen_goal_hits: %u\n",
atomic_read(&sbi->s_bal_len_goals));
seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
@@ -3229,6 +3301,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
unsigned long position = ((unsigned long) v);
struct ext4_group_info *grp;
unsigned int count;
+ unsigned long idx;
position--;
if (position >= MB_NUM_ORDERS(sb)) {
@@ -3237,11 +3310,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "avg_fragment_size_lists:\n");
count = 0;
- read_lock(&sbi->s_mb_avg_fragment_size_locks[position]);
- list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position],
- bb_avg_fragment_size_node)
+ xa_for_each(&sbi->s_mb_avg_fragment_size[position], idx, grp)
count++;
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
return 0;
@@ -3253,11 +3323,8 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v)
seq_puts(seq, "max_free_order_lists:\n");
}
count = 0;
- read_lock(&sbi->s_mb_largest_free_orders_locks[position]);
- list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position],
- bb_largest_free_order_node)
+ xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp)
count++;
- read_unlock(&sbi->s_mb_largest_free_orders_locks[position]);
seq_printf(seq, "\tlist_order_%u_groups: %u\n",
(unsigned int)position, count);
@@ -3377,8 +3444,6 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
init_rwsem(&meta_group_info[i]->alloc_sem);
meta_group_info[i]->bb_free_root = RB_ROOT;
- INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node);
- INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node);
meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */
meta_group_info[i]->bb_group = group;
@@ -3588,6 +3653,20 @@ static void ext4_discard_work(struct work_struct *work)
ext4_mb_unload_buddy(&e4b);
}
+static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi)
+{
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_avg_fragment_size[i]);
+ kfree(sbi->s_mb_avg_fragment_size);
+}
+
+static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi)
+{
+ for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++)
+ xa_destroy(&sbi->s_mb_largest_free_orders[i]);
+ kfree(sbi->s_mb_largest_free_orders);
+}
+
int ext4_mb_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3633,44 +3712,27 @@ int ext4_mb_init(struct super_block *sb)
} while (i < MB_NUM_ORDERS(sb));
sbi->s_mb_avg_fragment_size =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
GFP_KERNEL);
if (!sbi->s_mb_avg_fragment_size) {
ret = -ENOMEM;
goto out;
}
- sbi->s_mb_avg_fragment_size_locks =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
- GFP_KERNEL);
- if (!sbi->s_mb_avg_fragment_size_locks) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
- INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]);
- rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]);
- }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_avg_fragment_size[i]);
+
sbi->s_mb_largest_free_orders =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head),
+ kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray),
GFP_KERNEL);
if (!sbi->s_mb_largest_free_orders) {
ret = -ENOMEM;
goto out;
}
- sbi->s_mb_largest_free_orders_locks =
- kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t),
- GFP_KERNEL);
- if (!sbi->s_mb_largest_free_orders_locks) {
- ret = -ENOMEM;
- goto out;
- }
- for (i = 0; i < MB_NUM_ORDERS(sb); i++) {
- INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]);
- rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]);
- }
+ for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+ xa_init(&sbi->s_mb_largest_free_orders[i]);
spin_lock_init(&sbi->s_md_lock);
- sbi->s_mb_free_pending = 0;
+ atomic_set(&sbi->s_mb_free_pending, 0);
INIT_LIST_HEAD(&sbi->s_freed_data_list[0]);
INIT_LIST_HEAD(&sbi->s_freed_data_list[1]);
INIT_LIST_HEAD(&sbi->s_discard_list);
@@ -3711,10 +3773,19 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe));
}
+ sbi->s_mb_nr_global_goals = umin(num_possible_cpus(),
+ DIV_ROUND_UP(sbi->s_groups_count, 4));
+ sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals,
+ sizeof(ext4_group_t), GFP_KERNEL);
+ if (sbi->s_mb_last_groups == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out;
+ goto out_free_last_groups;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -3739,11 +3810,12 @@ int ext4_mb_init(struct super_block *sb)
out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
+out_free_last_groups:
+ kfree(sbi->s_mb_last_groups);
+ sbi->s_mb_last_groups = NULL;
out:
- kfree(sbi->s_mb_avg_fragment_size);
- kfree(sbi->s_mb_avg_fragment_size_locks);
- kfree(sbi->s_mb_largest_free_orders);
- kfree(sbi->s_mb_largest_free_orders_locks);
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
kfree(sbi->s_mb_maxs);
@@ -3810,10 +3882,8 @@ void ext4_mb_release(struct super_block *sb)
kvfree(group_info);
rcu_read_unlock();
}
- kfree(sbi->s_mb_avg_fragment_size);
- kfree(sbi->s_mb_avg_fragment_size_locks);
- kfree(sbi->s_mb_largest_free_orders);
- kfree(sbi->s_mb_largest_free_orders_locks);
+ ext4_mb_avg_fragment_size_destroy(sbi);
+ ext4_mb_largest_free_orders_destroy(sbi);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
@@ -3843,6 +3913,7 @@ void ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
+ kfree(sbi->s_mb_last_groups);
}
static inline int ext4_issue_discard(struct super_block *sb,
@@ -3873,10 +3944,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
- spin_lock(&EXT4_SB(sb)->s_md_lock);
- EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
- spin_unlock(&EXT4_SB(sb)->s_md_lock);
-
+ atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending);
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
count += entry->efd_count;
@@ -5653,7 +5721,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb)
{
ext4_group_t i, ngroups;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return;
ngroups = ext4_get_groups_count(sb);
@@ -5687,7 +5755,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
{
struct super_block *sb = ac->ac_sb;
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return;
mb_debug(sb, "Can't allocate:"
@@ -6280,28 +6348,63 @@ out:
* are contiguous, AND the extents were freed by the same transaction,
* AND the blocks are associated with the same group.
*/
-static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
- struct ext4_free_data *entry,
- struct ext4_free_data *new_entry,
- struct rb_root *entry_rb_root)
+static inline bool
+ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
{
- if ((entry->efd_tid != new_entry->efd_tid) ||
- (entry->efd_group != new_entry->efd_group))
- return;
- if (entry->efd_start_cluster + entry->efd_count ==
- new_entry->efd_start_cluster) {
- new_entry->efd_start_cluster = entry->efd_start_cluster;
- new_entry->efd_count += entry->efd_count;
- } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
- entry->efd_start_cluster) {
- new_entry->efd_count += entry->efd_count;
- } else
- return;
+ if (entry1->efd_tid != entry2->efd_tid)
+ return false;
+ if (entry1->efd_start_cluster + entry1->efd_count !=
+ entry2->efd_start_cluster)
+ return false;
+ if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group))
+ return false;
+ return true;
+}
+
+static inline void
+ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry1,
+ struct ext4_free_data *entry2)
+{
+ entry1->efd_count += entry2->efd_count;
spin_lock(&sbi->s_md_lock);
- list_del(&entry->efd_list);
+ list_del(&entry2->efd_list);
spin_unlock(&sbi->s_md_lock);
- rb_erase(&entry->efd_node, entry_rb_root);
- kmem_cache_free(ext4_free_data_cachep, entry);
+ rb_erase(&entry2->efd_node, root);
+ kmem_cache_free(ext4_free_data_cachep, entry2);
+}
+
+static inline void
+ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *prev;
+ struct rb_node *node;
+
+ node = rb_prev(&entry->efd_node);
+ if (!node)
+ return;
+
+ prev = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(prev, entry))
+ ext4_merge_freed_extents(sbi, root, prev, entry);
+}
+
+static inline void
+ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root,
+ struct ext4_free_data *entry)
+{
+ struct ext4_free_data *next;
+ struct rb_node *node;
+
+ node = rb_next(&entry->efd_node);
+ if (!node)
+ return;
+
+ next = rb_entry(node, struct ext4_free_data, efd_node);
+ if (ext4_freed_extents_can_be_merged(entry, next))
+ ext4_merge_freed_extents(sbi, root, entry, next);
}
static noinline_for_stack void
@@ -6311,11 +6414,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_t group = e4b->bd_group;
ext4_grpblk_t cluster;
ext4_grpblk_t clusters = new_entry->efd_count;
- struct ext4_free_data *entry;
+ struct ext4_free_data *entry = NULL;
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct rb_node **n = &db->bb_free_root.rb_node, *node;
+ struct rb_root *root = &db->bb_free_root;
+ struct rb_node **n = &root->rb_node;
struct rb_node *parent = NULL, *new_node;
BUG_ON(!ext4_handle_valid(handle));
@@ -6351,27 +6455,30 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
}
}
- rb_link_node(new_node, parent, n);
- rb_insert_color(new_node, &db->bb_free_root);
+ atomic_add(clusters, &sbi->s_mb_free_pending);
+ if (!entry)
+ goto insert;
- /* Now try to see the extent can be merged to left and right */
- node = rb_prev(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ /* Now try to see the extent can be merged to prev and next */
+ if (ext4_freed_extents_can_be_merged(new_entry, entry)) {
+ entry->efd_start_cluster = cluster;
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_prev(sbi, root, entry);
+ return;
}
-
- node = rb_next(new_node);
- if (node) {
- entry = rb_entry(node, struct ext4_free_data, efd_node);
- ext4_try_merge_freed_extent(sbi, entry, new_entry,
- &(db->bb_free_root));
+ if (ext4_freed_extents_can_be_merged(entry, new_entry)) {
+ entry->efd_count += new_entry->efd_count;
+ kmem_cache_free(ext4_free_data_cachep, new_entry);
+ ext4_try_merge_freed_extent_next(sbi, root, entry);
+ return;
}
+insert:
+ rb_link_node(new_node, parent, n);
+ rb_insert_color(new_node, root);
spin_lock(&sbi->s_md_lock);
list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]);
- sbi->s_mb_free_pending += clusters;
spin_unlock(&sbi->s_md_lock);
}
@@ -6644,7 +6751,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
for (i = 0; i < count; i++) {
cond_resched();
if (is_metadata)
- bh = sb_find_get_block(inode->i_sb, block + i);
+ bh = sb_find_get_block_nonatomic(inode->i_sb,
+ block + i);
ext4_forget(handle, is_metadata, inode, bh, block + i);
}
}
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index f8280de3e882..15a049f05d04 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -192,8 +192,13 @@ struct ext4_allocation_context {
*/
ext4_grpblk_t ac_orig_goal_len;
+ ext4_group_t ac_prefetch_grp;
+ unsigned int ac_prefetch_ios;
+ unsigned int ac_prefetch_nr;
+
+ int ac_first_err;
+
__u32 ac_flags; /* allocation hints */
- __u32 ac_groups_linear_remaining;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_cX_found[EXT4_MB_NUM_CRS];
@@ -204,6 +209,8 @@ struct ext4_allocation_context {
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
+
+ struct ext4_buddy *ac_e4b;
struct folio *ac_bitmap_folio;
struct folio *ac_buddy_folio;
struct ext4_prealloc_space *ac_pa;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index d64c04ed061a..51661570cf3b 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -14,14 +14,14 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
int offset = offsetof(struct mmp_struct, mmp_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+ csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
return cpu_to_le32(csum);
}
static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
@@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
@@ -162,7 +162,7 @@ static int kmmpd(void *data)
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
sizeof(mmp->mmp_nodename));
- while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) {
+ while (!kthread_should_stop() && !ext4_emergency_state(sb)) {
if (!ext4_has_feature_mmp(sb)) {
ext4_warning(sb, "kmmpd being stopped since MMP feature"
" has been disabled.");
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 898443e98efc..adae3caf175a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
unsigned int tmp_data_size, data_size, replaced_size;
int i, err2, jblocks, retries = 0;
int replaced_count = 0;
- int from = data_offset_in_page << orig_inode->i_blkbits;
+ int from;
int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
struct buffer_head *bh = NULL;
@@ -280,7 +280,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
*/
again:
*err = 0;
- jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
+ jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page,
+ block_len_in_page) * 2;
handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
if (IS_ERR(handle)) {
*err = PTR_ERR(handle);
@@ -323,11 +324,6 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
-
- VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
- VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
- VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
-
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
@@ -360,6 +356,8 @@ again:
goto unlock_folios;
}
data_copy:
+ from = offset_in_folio(folio[0],
+ orig_blk_offset << orig_inode->i_blkbits);
*err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
if (*err)
goto unlock_folios;
@@ -390,7 +388,7 @@ data_copy:
if (!bh)
bh = create_empty_buffers(folio[0],
1 << orig_inode->i_blkbits, 0);
- for (i = 0; i < data_offset_in_page; i++)
+ for (i = 0; i < from >> orig_inode->i_blkbits; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
@@ -399,7 +397,7 @@ data_copy:
bh = bh->b_this_page;
}
- block_commit_write(&folio[0]->page, from, from + replaced_size);
+ block_commit_write(folio[0], from, from + replaced_size);
/* Even in case of data=writeback it is reasonable to pin
* inode to transaction, to prevent unexpected data loss */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 536d56d15072..2cd36f59c9e3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
brelse(bh);
return ERR_PTR(-EFSCORRUPTED);
}
- if (!ext4_has_metadata_csum(inode->i_sb) ||
+ if (!ext4_has_feature_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
return bh;
@@ -291,36 +291,6 @@ struct dx_tail {
__le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */
};
-static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
-static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
-static inline unsigned dx_get_hash(struct dx_entry *entry);
-static void dx_set_hash(struct dx_entry *entry, unsigned value);
-static unsigned dx_get_count(struct dx_entry *entries);
-static unsigned dx_get_limit(struct dx_entry *entries);
-static void dx_set_count(struct dx_entry *entries, unsigned value);
-static void dx_set_limit(struct dx_entry *entries, unsigned value);
-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
-static unsigned dx_node_limit(struct inode *dir);
-static struct dx_frame *dx_probe(struct ext4_filename *fname,
- struct inode *dir,
- struct dx_hash_info *hinfo,
- struct dx_frame *frame);
-static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct buffer_head *bh,
- struct dx_hash_info *hinfo,
- struct dx_map_entry *map_tail);
-static void dx_sort_map(struct dx_map_entry *map, unsigned count);
-static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from,
- char *to, struct dx_map_entry *offsets,
- int count, unsigned int blocksize);
-static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
- unsigned int blocksize);
-static void dx_insert_block(struct dx_frame *frame,
- u32 hash, ext4_lblk_t block);
-static int ext4_htree_next_block(struct inode *dir, __u32 hash,
- struct dx_frame *frame,
- struct dx_frame *frames,
- __u32 *start_hash);
static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **res_dir);
@@ -376,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
return cpu_to_le32(csum);
}
@@ -398,7 +367,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
t = get_dirent_tail(inode, bh);
@@ -419,7 +388,7 @@ static void ext4_dirblock_csum_set(struct inode *inode,
{
struct ext4_dir_entry_tail *t;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
t = get_dirent_tail(inode, bh);
@@ -472,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int count_offset, int count, struct dx_tail *t)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
int size;
@@ -480,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(csum, (__u8 *)t, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
@@ -494,7 +462,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return 1;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -523,7 +491,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
struct dx_tail *t;
int count_offset, limit, count;
- if (!ext4_has_metadata_csum(inode->i_sb))
+ if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
c = get_dx_countlimit(inode, dirent, &count_offset);
@@ -612,7 +580,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
ext4_dir_rec_len(1, NULL) -
ext4_dir_rec_len(2, NULL) - infosize;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -622,7 +590,7 @@ static inline unsigned dx_node_limit(struct inode *dir)
unsigned int entry_space = dir->i_sb->s_blocksize -
ext4_dir_rec_len(0, dir);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
return entry_space / sizeof(struct dx_entry);
}
@@ -1076,7 +1044,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
struct ext4_dir_entry_2 *de, *top;
int err = 0, count = 0;
struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str;
- int csum = ext4_has_metadata_csum(dir->i_sb);
+ int csum = ext4_has_feature_metadata_csum(dir->i_sb);
dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
(unsigned long)block));
@@ -1320,7 +1288,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
struct dx_hash_info h = *hinfo;
int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
while ((char *) de < base + buflen) {
@@ -1462,7 +1430,8 @@ static bool ext4_match(struct inode *parent,
* sure cf_name was properly initialized before
* considering the calculated hash.
*/
- if (IS_ENCRYPTED(parent) && fname->cf_name.name &&
+ if (sb_no_casefold_compat_fallback(parent->i_sb) &&
+ IS_ENCRYPTED(parent) && fname->cf_name.name &&
(fname->hinfo.hash != EXT4_DIRENT_HASH(de) ||
fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de)))
return false;
@@ -1595,10 +1564,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
- if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR)
+ if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR)
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+ "falling back\n"));
+ else if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ *res_dir == NULL && IS_CASEFOLDED(dir))
+ dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold "
+ "failed, falling back\n"));
+ else
goto cleanup_and_exit;
- dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
- "falling back\n"));
ret = NULL;
}
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -1945,7 +1919,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int csum_size = 0;
int err = 0, i;
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
bh2 = ext4_append(handle, dir, &newblock);
@@ -1995,7 +1969,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
* split it in half by count; each resulting block will have at least
* half the space free.
*/
- if (i > 0)
+ if (i >= 0)
split = count - move;
else
split = count/2;
@@ -2060,8 +2034,7 @@ out:
return ERR_PTR(err);
}
-int ext4_find_dest_de(struct inode *dir, struct inode *inode,
- struct buffer_head *bh,
+int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de)
@@ -2143,11 +2116,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
int csum_size = 0;
int err, err2;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+ err = ext4_find_dest_de(dir, bh, bh->b_data,
blocksize - csum_size, fname, &de);
if (err)
return err;
@@ -2252,7 +2225,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct fake_dirent *fde;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
blocksize = dir->i_sb->s_blocksize;
@@ -2396,7 +2369,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
ext4_lblk_t block, blocks;
int csum_size = 0;
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
sb = dir->i_sb;
@@ -2427,7 +2400,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
/* Can we just ignore htree data? */
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
EXT4_ERROR_INODE(dir,
"Directory has corrupted htree index.");
retval = -EFSCORRUPTED;
@@ -2577,8 +2550,10 @@ again:
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, sb, frame->bh,
EXT4_JTR_NONE);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1);
@@ -2589,8 +2564,10 @@ again:
err = ext4_journal_get_write_access(handle, sb,
(frame - 1)->bh,
EXT4_JTR_NONE);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
memcpy((char *) entries2, (char *) (entries + icount1),
icount2 * sizeof(struct dx_entry));
@@ -2609,8 +2586,10 @@ again:
dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
brelse (bh2);
err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);
@@ -2635,8 +2614,10 @@ again:
"Creating %d level index...\n",
dxroot->info.indirect_levels));
err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
- if (err)
+ if (err) {
+ brelse(bh2);
goto journal_error;
+ }
err = ext4_handle_dirty_dx_node(handle, dir, bh2);
brelse(bh2);
restart = 1;
@@ -2733,7 +2714,7 @@ static int ext4_delete_entry(handle_t *handle,
return err;
}
- if (ext4_has_metadata_csum(dir->i_sb))
+ if (ext4_has_feature_metadata_csum(dir->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
BUFFER_TRACE(bh, "get_write_access");
@@ -2934,48 +2915,59 @@ err_unlock_inode:
return err;
}
-struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int blocksize, int csum_size,
- unsigned int parent_ino, int dotdot_real_len)
+int ext4_init_dirblock(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, unsigned int parent_ino,
+ void *inline_buf, int inline_size)
{
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data;
+ size_t blocksize = bh->b_size;
+ int csum_size = 0, header_size;
+
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, ".");
+ memcpy(de->name, ".", 2);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(parent_ino);
de->name_len = 2;
- if (!dotdot_real_len)
- de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + ext4_dir_rec_len(1, NULL)),
- blocksize);
- else
+ memcpy(de->name, "..", 3);
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ if (inline_buf) {
de->rec_len = ext4_rec_len_to_disk(
ext4_dir_rec_len(de->name_len, NULL),
blocksize);
- strcpy(de->name, "..");
- ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+ de = ext4_next_entry(de, blocksize);
+ header_size = (char *)de - bh->b_data;
+ memcpy((void *)de, inline_buf, inline_size);
+ ext4_update_final_de(bh->b_data, inline_size + header_size,
+ blocksize - csum_size);
+ } else {
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + ext4_dir_rec_len(1, NULL)),
+ blocksize);
+ }
- return ext4_next_entry(de, blocksize);
+ if (csum_size)
+ ext4_initialize_dirent_tail(bh, blocksize);
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ set_buffer_uptodate(bh);
+ set_buffer_verified(bh);
+ return ext4_handle_dirty_dirblock(handle, inode, bh);
}
int ext4_init_new_dir(handle_t *handle, struct inode *dir,
struct inode *inode)
{
struct buffer_head *dir_block = NULL;
- struct ext4_dir_entry_2 *de;
ext4_lblk_t block = 0;
- unsigned int blocksize = dir->i_sb->s_blocksize;
- int csum_size = 0;
int err;
- if (ext4_has_metadata_csum(dir->i_sb))
- csum_size = sizeof(struct ext4_dir_entry_tail);
-
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
err = ext4_try_create_inline_dir(handle, dir, inode);
if (err < 0 && err != -ENOSPC)
@@ -2984,39 +2976,30 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir,
goto out;
}
+ set_nlink(inode, 2);
inode->i_size = 0;
dir_block = ext4_append(handle, inode, &block);
if (IS_ERR(dir_block))
return PTR_ERR(dir_block);
- de = (struct ext4_dir_entry_2 *)dir_block->b_data;
- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
- set_nlink(inode, 2);
- if (csum_size)
- ext4_initialize_dirent_tail(dir_block, blocksize);
-
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
- if (err)
- goto out;
- set_buffer_verified(dir_block);
+ err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0);
out:
brelse(dir_block);
return err;
}
-static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
handle_t *handle;
struct inode *inode;
int err, err2 = 0, credits, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
- return -EMLINK;
+ return ERR_PTR(-EMLINK);
err = dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
@@ -3066,7 +3049,7 @@ out_stop:
out_retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
- return err;
+ return ERR_PTR(err);
}
/*
@@ -3101,7 +3084,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
0) ||
- le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ le32_to_cpu(de->inode) != inode->i_ino || de->name_len != 1 ||
+ de->name[0] != '.') {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
return false;
@@ -3110,7 +3094,8 @@ bool ext4_empty_dir(struct inode *inode)
de = ext4_next_entry(de, sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
return false;
@@ -3151,8 +3136,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
struct ext4_dir_entry_2 *de;
handle_t *handle = NULL;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
/* Initialize quotas before so that eventual writes go in
* separate transaction */
@@ -3309,8 +3295,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
{
int retval;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ retval = ext4_emergency_state(dir->i_sb);
+ if (unlikely(retval))
+ return retval;
trace_ext4_unlink_enter(dir, dentry);
/*
@@ -3376,8 +3363,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct fscrypt_str disk_link;
int retries = 0;
- if (unlikely(ext4_forced_shutdown(dir->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(dir->i_sb);
+ if (unlikely(err))
+ return err;
err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize,
&disk_link);
@@ -3548,7 +3536,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, 0) ||
le32_to_cpu(de->inode) != inode->i_ino ||
- strcmp(".", de->name)) {
+ de->name_len != 1 || de->name[0] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '.'");
brelse(bh);
*retval = -EFSCORRUPTED;
@@ -3559,7 +3547,8 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
de = ext4_next_entry(de, inode->i_sb->s_blocksize);
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
bh->b_size, offset) ||
- le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ le32_to_cpu(de->inode) == 0 || de->name_len != 2 ||
+ de->name[0] != '.' || de->name[1] != '.') {
EXT4_ERROR_INODE(inode, "directory missing '..'");
brelse(bh);
*retval = -EFSCORRUPTED;
@@ -4199,8 +4188,9 @@ static int ext4_rename2(struct mnt_idmap *idmap,
{
int err;
- if (unlikely(ext4_forced_shutdown(old_dir->i_sb)))
- return -EIO;
+ err = ext4_emergency_state(old_dir->i_sb);
+ if (unlikely(err))
+ return err;
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index e5b47dda3317..524d4658fa40 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -537,13 +537,13 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
ot = ext4_orphan_block_tail(sb, bh);
- calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
+ calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ calculated = ext4_chksum(calculated, (__u8 *)bh->b_data,
inodes_per_ob * sizeof(__u32));
return le32_to_cpu(ot->ob_checksum) == calculated;
}
@@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
- inodes_per_ob * sizeof(__u32));
+ csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32));
ot = ext4_orphan_block_tail(sb, bh);
ot->ob_checksum = cpu_to_le32(csum);
}
@@ -590,8 +589,9 @@ int ext4_init_orphan_info(struct super_block *sb)
}
oi->of_blocks = inode->i_size >> sb->s_blocksize_bits;
oi->of_csum_seed = EXT4_I(inode)->i_csum_seed;
- oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block),
- GFP_KERNEL);
+ oi->of_binfo = kmalloc_array(oi->of_blocks,
+ sizeof(struct ext4_orphan_block),
+ GFP_KERNEL);
if (!oi->of_binfo) {
ret = -ENOMEM;
goto out_put;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 69b8a7221a2b..39abfeec5f36 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
}
/*
- * Check a range of space and convert unwritten extents to written. Note that
+ * On successful IO, check a range of space and convert unwritten extents to
+ * written. On IO failure, check if journal abort is needed. Note that
* we are protected from truncate touching same part of extent tree by the
* fact that truncate code waits for all DIO to finish (thus exclusion from
* direct IO is achieved) and also waits for PageWriteback bits. Thus we
@@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end)
{
struct inode *inode = io_end->inode;
handle_t *handle = io_end->handle;
+ struct super_block *sb = inode->i_sb;
int ret = 0;
ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p,"
"list->prev 0x%p\n",
io_end, inode->i_ino, io_end->list.next, io_end->list.prev);
- io_end->handle = NULL; /* Following call will use up the handle */
- ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
- if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) {
- ext4_msg(inode->i_sb, KERN_EMERG,
+ /*
+ * Do not convert the unwritten extents if data writeback fails,
+ * or stale data may be exposed.
+ */
+ io_end->handle = NULL; /* Following call will use up the handle */
+ if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) {
+ ret = -EIO;
+ if (handle)
+ jbd2_journal_free_reserved(handle);
+
+ if (test_opt(sb, DATA_ERR_ABORT))
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+ } else {
+ ret = ext4_convert_unwritten_io_end_vec(handle, io_end);
+ }
+ if (ret < 0 && !ext4_emergency_state(sb) &&
+ io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ ext4_msg(sb, KERN_EMERG,
"failed to convert unwritten extents to written "
"extents -- potential data loss! "
"(inode %lu, error %d)", inode->i_ino, ret);
}
+
ext4_clear_io_unwritten_flag(io_end);
ext4_release_io_end(io_end);
return ret;
@@ -217,6 +234,18 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
#endif
}
+static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end)
+{
+ if (io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !list_empty(&io_end->list_vec))
+ return true;
+ if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) &&
+ io_end->flag & EXT4_IO_END_FAILED &&
+ !ext4_emergency_state(io_end->inode->i_sb))
+ return true;
+ return false;
+}
+
/* Add the io_end to per-inode completed end_io list. */
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
@@ -225,9 +254,12 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end)
struct workqueue_struct *wq;
unsigned long flags;
- /* Only reserved conversions from writeback should enter here */
- WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- WARN_ON(!io_end->handle && sbi->s_journal);
+ /* Only reserved conversions or pending IO errors will enter here. */
+ WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
+ WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN &&
+ !io_end->handle && sbi->s_journal);
+ WARN_ON(!io_end->bio);
+
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
@@ -252,7 +284,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
while (!list_empty(&unwritten)) {
io_end = list_entry(unwritten.next, ext4_io_end_t, list);
- BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
+ BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION));
list_del_init(&io_end->list);
err = ext4_end_io_end(io_end);
@@ -263,7 +295,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode,
}
/*
- * work on completed IO, to convert unwritten extents to extents
+ * Used to convert unwritten extents to written extents upon IO completion,
+ * or used to abort the journal upon IO errors.
*/
void ext4_end_io_rsv_work(struct work_struct *work)
{
@@ -288,29 +321,22 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
void ext4_put_io_end_defer(ext4_io_end_t *io_end)
{
if (refcount_dec_and_test(&io_end->count)) {
- if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) ||
- list_empty(&io_end->list_vec)) {
- ext4_release_io_end(io_end);
- return;
- }
- ext4_add_complete_io(io_end);
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_add_complete_io(io_end);
+
+ ext4_release_io_end(io_end);
}
}
int ext4_put_io_end(ext4_io_end_t *io_end)
{
- int err = 0;
-
if (refcount_dec_and_test(&io_end->count)) {
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
- err = ext4_convert_unwritten_io_end_vec(io_end->handle,
- io_end);
- io_end->handle = NULL;
- ext4_clear_io_unwritten_flag(io_end);
- }
+ if (ext4_io_end_defer_completion(io_end))
+ return ext4_end_io_end(io_end);
+
ext4_release_io_end(io_end);
}
- return err;
+ return 0;
}
ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
@@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio)
bio->bi_status, inode->i_ino,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ io_end->flag |= EXT4_IO_END_FAILED;
mapping_set_error(inode->i_mapping,
blk_status_to_errno(bio->bi_status));
}
- if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
+ if (ext4_io_end_defer_completion(io_end)) {
/*
* Link bio into list hanging from io_end. We have to do it
* atomically as bio completions can be racing against each
@@ -520,9 +547,9 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
* first page of the bio. Otherwise it can deadlock.
*/
if (io->io_bio)
- gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
+ gfp_flags = GFP_NOWAIT;
retry_encrypt:
- bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page,
+ bounce_page = fscrypt_encrypt_pagecache_blocks(folio,
enc_bytes, 0, gfp_flags);
if (IS_ERR(bounce_page)) {
ret = PTR_ERR(bounce_page);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5d3a9dc9a32d..f329daf6e5c7 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
- unsigned int nr_pages = rac ? readahead_count(rac) : 1;
+ unsigned int nr_pages, folio_pages;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
- for (; nr_pages; nr_pages--) {
+ nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio);
+ for (; nr_pages; nr_pages -= folio_pages) {
int fully_mapped = 1;
- unsigned first_hole = blocks_per_page;
+ unsigned int first_hole;
+ unsigned int blocks_per_folio;
if (rac)
folio = readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
prefetchw(&folio->flags);
if (folio_buffers(folio))
goto confused;
+ blocks_per_folio = folio_size(folio) >> blkbits;
+ first_hole = blocks_per_folio;
block_in_file = next_block =
(sector_t)folio->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
@@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode,
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode,
* Then do more ext4_map_blocks() calls until we are
* done with this folio.
*/
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
@@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode,
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode,
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
}
}
- if (first_hole != blocks_per_page) {
+ if (first_hole != blocks_per_folio) {
folio_zero_segment(folio, first_hole << blkbits,
folio_size(folio));
if (first_hole == 0) {
@@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
- (first_hole != blocks_per_page)) {
+ (first_hole != blocks_per_folio)) {
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = first_block + blocks_per_page - 1;
+ last_block_in_bio = first_block + blocks_per_folio - 1;
continue;
confused:
if (bio) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 72f77f78ae8d..050f26168d97 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1118,8 +1118,8 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
struct ext4_super_block *es = (struct ext4_super_block *) data;
es->s_block_group_nr = cpu_to_le16(group);
- if (ext4_has_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ if (ext4_has_feature_metadata_csum(sb))
+ es->s_checksum = ext4_superblock_csum(es);
}
/*
@@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
{
struct buffer_head *bh;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 0;
bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a50e5c31b937..699c15db28a8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb);
static int ext4_freeze(struct super_block *sb);
static inline int ext2_feature_set_ok(struct super_block *sb);
static inline int ext3_feature_set_ok(struct super_block *sb);
-static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static struct inode *ext4_get_journal_inode(struct super_block *sb,
@@ -269,7 +268,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
{
struct buffer_head *bh = bdev_getblk(sb->s_bdev, block,
- sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN);
+ sb->s_blocksize, GFP_NOWAIT);
if (likely(bh)) {
if (trylock_buffer(bh))
@@ -287,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-__le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct ext4_super_block *es)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+ csum = ext4_chksum(~0, (char *)es, offset);
return cpu_to_le32(csum);
}
@@ -302,20 +299,20 @@ __le32 ext4_superblock_csum(struct super_block *sb,
static int ext4_superblock_csum_verify(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
- return es->s_checksum == ext4_superblock_csum(sb, es);
+ return es->s_checksum == ext4_superblock_csum(es);
}
void ext4_superblock_csum_set(struct super_block *sb)
{
struct ext4_super_block *es = EXT4_SB(sb)->s_es;
- if (!ext4_has_metadata_csum(sb))
+ if (!ext4_has_feature_metadata_csum(sb))
return;
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -448,9 +445,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
#define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
-#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */
-#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */
-
/*
* The ext4_maybe_update_superblock() function checks and updates the
* superblock if needed.
@@ -458,8 +452,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
* This function is designed to update the on-disk superblock only under
* certain conditions to prevent excessive disk writes and unnecessary
* waking of the disk from sleep. The superblock will be updated if:
- * 1. More than an hour has passed since the last superblock update, and
- * 2. More than 16MB have been written since the last superblock update.
+ * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last
+ * superblock update
+ * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the
+ * last superblock update.
*
* @sb: The superblock
*/
@@ -473,14 +469,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
__u64 lifetime_write_kbytes;
__u64 diff_size;
- if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) ||
- !journal || (journal->j_flags & JBD2_UNMOUNT))
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
+ !(sb->s_flags & SB_ACTIVE) || !journal ||
+ journal->j_flags & JBD2_UNMOUNT)
return;
now = ktime_get_real_seconds();
last_update = ext4_get_tstamp(es, s_wtime);
- if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC))
+ if (likely(now - last_update < sbi->s_sb_update_sec))
return;
lifetime_write_kbytes = sbi->s_kbytes_written +
@@ -495,49 +492,23 @@ static void ext4_maybe_update_superblock(struct super_block *sb)
*/
diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written);
- if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB)
+ if (diff_size > sbi->s_sb_update_kb)
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
}
static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
{
struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- int error = is_journal_aborted(journal);
- struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
ext4_process_freed_data(sb, txn->t_tid);
ext4_maybe_update_superblock(sb);
-
- spin_lock(&sbi->s_md_lock);
- while (!list_empty(&txn->t_private_list)) {
- jce = list_entry(txn->t_private_list.next,
- struct ext4_journal_cb_entry, jce_list);
- list_del_init(&jce->jce_list);
- spin_unlock(&sbi->s_md_lock);
- jce->jce_func(sb, jce, error);
- spin_lock(&sbi->s_md_lock);
- }
- spin_unlock(&sbi->s_md_lock);
}
-/*
- * This writepage callback for write_cache_pages()
- * takes care of a few cases after page cleaning.
- *
- * write_cache_pages() already checks for dirty pages
- * and calls clear_page_dirty_for_io(), which we want,
- * to write protect the pages.
- *
- * However, we may have to redirty a page (see below.)
- */
-static int ext4_journalled_writepage_callback(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
+static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode,
+ struct folio *folio)
{
- transaction_t *transaction = (transaction_t *) data;
struct buffer_head *bh, *head;
struct journal_head *jh;
@@ -558,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio,
*/
jh = bh2jh(bh);
if (buffer_dirty(bh) ||
- (jh && (jh->b_transaction != transaction ||
- jh->b_next_transaction))) {
- folio_redirty_for_writepage(wbc, folio);
- goto out;
- }
+ (jh && (jh->b_transaction != jinode->i_transaction ||
+ jh->b_next_transaction)))
+ return true;
} while ((bh = bh->b_this_page) != head);
-out:
- return AOP_WRITEPAGE_ACTIVATE;
+ return false;
}
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -578,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
.range_start = jinode->i_dirty_start,
.range_end = jinode->i_dirty_end,
};
+ struct folio *folio = NULL;
+ int error;
- return write_cache_pages(mapping, &wbc,
- ext4_journalled_writepage_callback,
- jinode->i_transaction);
+ /*
+ * writeback_iter() already checks for dirty pages and calls
+ * folio_clear_dirty_for_io(), which we want to write protect the
+ * folios.
+ *
+ * However, we may have to redirty a folio sometimes.
+ */
+ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
+ if (ext4_journalled_writepage_needs_redirty(jinode, folio))
+ folio_redirty_for_writepage(&wbc, folio);
+ folio_unlock(folio);
+ }
+
+ return error;
}
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -707,11 +688,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
if (test_opt(sb, WARN_ON_ERROR))
WARN_ON_ONCE(1);
- if (!continue_fs && !sb_rdonly(sb)) {
- set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
- if (journal)
- jbd2_journal_abort(journal, -EIO);
- }
+ if (!continue_fs && !ext4_emergency_ro(sb) && journal)
+ jbd2_journal_abort(journal, -EIO);
if (!bdev_read_only(sb->s_bdev)) {
save_error_info(sb, error, ino, block, func, line);
@@ -719,9 +697,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
* In case the fs should keep running, we need to writeout
* superblock through the journal. Due to lock ordering
* constraints, it may not be safe to do it right here so we
- * defer superblock flushing to a workqueue.
+ * defer superblock flushing to a workqueue. We just need to be
+ * careful when the journal is already shutting down. If we get
+ * here in that case, just update the sb directly as the last
+ * transaction won't commit anyway.
*/
- if (continue_fs && journal)
+ if (continue_fs && journal &&
+ !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY))
schedule_work(&EXT4_SB(sb)->s_sb_upd_work);
else
ext4_commit_super(sb);
@@ -737,17 +719,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error,
sb->s_id);
}
- if (sb_rdonly(sb) || continue_fs)
+ if (ext4_emergency_ro(sb) || continue_fs)
return;
ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
/*
- * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem
- * modifications. We don't set SB_RDONLY because that requires
- * sb->s_umount semaphore and setting it without proper remount
- * procedure is confusing code such as freeze_super() leading to
- * deadlocks and other problems.
+ * We don't set SB_RDONLY because that requires sb->s_umount
+ * semaphore and setting it without proper remount procedure is
+ * confusing code such as freeze_super() leading to deadlocks
+ * and other problems.
*/
+ set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
}
static void update_super_work(struct work_struct *work)
@@ -765,7 +747,8 @@ static void update_super_work(struct work_struct *work)
* We use directly jbd2 functions here to avoid recursing back into
* ext4 error handling code during handling of previous errors.
*/
- if (!sb_rdonly(sbi->s_sb) && journal) {
+ if (!ext4_emergency_state(sbi->s_sb) &&
+ !sb_rdonly(sbi->s_sb) && journal) {
struct buffer_head *sbh = sbi->s_sbh;
bool call_notify_err = false;
@@ -819,7 +802,7 @@ void __ext4_error(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -844,7 +827,7 @@ void __ext4_error_inode(struct inode *inode, const char *function,
va_list args;
struct va_format vaf;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -879,7 +862,7 @@ void __ext4_error_file(struct file *file, const char *function,
struct inode *inode = file_inode(file);
char pathname[80], *path;
- if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ if (unlikely(ext4_emergency_state(inode->i_sb)))
return;
trace_ext4_error(inode->i_sb, function, line);
@@ -959,7 +942,7 @@ void __ext4_std_error(struct super_block *sb, const char *function,
char nbuf[16];
const char *errstr;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
/* Special case: if the error is EROFS, and we're not already
@@ -1053,7 +1036,7 @@ __acquires(bitlock)
struct va_format vaf;
va_list args;
- if (unlikely(ext4_forced_shutdown(sb)))
+ if (unlikely(ext4_emergency_state(sb)))
return;
trace_ext4_error(sb, function, line);
@@ -1306,18 +1289,17 @@ static void ext4_put_super(struct super_block *sb)
ext4_unregister_li_request(sb);
ext4_quotas_off(sb, EXT4_MAXQUOTAS);
- flush_work(&sbi->s_sb_upd_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
- err = jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ err = ext4_journal_destroy(sbi, sbi->s_journal);
if ((err < 0) && !aborted) {
ext4_abort(sb, -err, "Couldn't clean up the journal");
}
- }
+ } else
+ flush_work(&sbi->s_sb_upd_work);
ext4_es_unregister_shrinker(sbi);
timer_shutdown_sync(&sbi->s_err_report);
@@ -1325,13 +1307,14 @@ static void ext4_put_super(struct super_block *sb)
ext4_mb_release(sb);
ext4_ext_release(sb);
- if (!sb_rdonly(sb) && !aborted) {
- ext4_clear_feature_journal_needs_recovery(sb);
- ext4_clear_feature_orphan_present(sb);
- es->s_state = cpu_to_le16(sbi->s_mount_state);
- }
- if (!sb_rdonly(sb))
+ if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) {
+ if (!aborted) {
+ ext4_clear_feature_journal_needs_recovery(sb);
+ ext4_clear_feature_orphan_present(sb);
+ es->s_state = cpu_to_le16(sbi->s_mount_state);
+ }
ext4_commit_super(sb);
+ }
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
@@ -1426,10 +1409,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&ei->i_completed_io_lock);
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
- atomic_set(&ei->i_unwritten, 0);
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
- mutex_init(&ei->i_fc_lock);
+ spin_lock_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1823,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
{}
};
-#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
@@ -2017,6 +1998,9 @@ int ext4_init_fs_context(struct fs_context *fc)
fc->fs_private = ctx;
fc->ops = &ext4_context_ops;
+ /* i_version is always enabled now */
+ fc->sb_flags |= SB_I_VERSION;
+
return 0;
}
@@ -2785,6 +2769,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
}
if (is_remount) {
+ if (!sbi->s_journal &&
+ ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) {
+ ext4_msg(NULL, KERN_WARNING,
+ "Remounting fs w/o journal so ignoring data_err option");
+ ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT);
+ }
+
if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) &&
(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) {
ext4_msg(NULL, KERN_ERR, "can't mount with "
@@ -2987,6 +2978,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+ if (nodefs && sb->s_flags & SB_I_VERSION)
+ SEQ_OPTS_PUTS("i_version");
if (nodefs || sbi->s_stripe)
SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
if (nodefs || EXT4_MOUNT_DATA_FLAGS &
@@ -3038,6 +3031,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+ if (ext4_emergency_ro(sb))
+ SEQ_OPTS_PUTS("emergency_ro");
+
+ if (ext4_forced_shutdown(sb))
+ SEQ_OPTS_PUTS("shutdown");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3205,19 +3204,19 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__le32 le_group = cpu_to_le32(block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sbi->s_sb)) {
+ if (ext4_has_feature_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
__u32 csum32;
__u16 dummy_csum = 0;
- csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum,
sizeof(dummy_csum));
offset += sizeof(dummy_csum);
if (offset < sbi->s_desc_size)
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset,
sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
@@ -3633,7 +3632,7 @@ int ext4_feature_set_ok(struct super_block *sb, int readonly)
*/
static void print_daily_error_info(struct timer_list *t)
{
- struct ext4_sb_info *sbi = from_timer(sbi, t, s_err_report);
+ struct ext4_sb_info *sbi = timer_container_of(sbi, t, s_err_report);
struct super_block *sb = sbi->s_sb;
struct ext4_super_block *es = sbi->s_es;
@@ -3693,7 +3692,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
if (group >= elr->lr_next_group) {
ret = 1;
if (elr->lr_first_not_zeroed != ngroups &&
- !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) {
+ !ext4_emergency_state(sb) && !sb_rdonly(sb) &&
+ test_opt(sb, INIT_INODE_TABLE)) {
elr->lr_next_group = elr->lr_first_not_zeroed;
elr->lr_mode = EXT4_LI_MODE_ITABLE;
ret = 0;
@@ -3998,7 +3998,7 @@ int ext4_register_li_request(struct super_block *sb,
goto out;
}
- if (sb_rdonly(sb) ||
+ if (ext4_emergency_state(sb) || sb_rdonly(sb) ||
(test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) &&
(first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE))))
goto out;
@@ -4061,7 +4061,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
int compat, incompat;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (ext4_has_metadata_csum(sb)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
@@ -4349,7 +4349,7 @@ static void ext4_set_def_opts(struct super_block *sb,
if (ext4_has_feature_fast_commit(sb))
set_opt2(sb, JOURNAL_FAST_COMMIT);
/* don't forget to enable journal_csum when metadata_csum is enabled. */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_metadata_csum(sb))
set_opt(sb, JOURNAL_CHECKSUM);
if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
@@ -4441,13 +4441,16 @@ static int ext4_handle_clustersize(struct super_block *sb)
/*
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * With non-bigalloc filesystem awu will be based upon filesystem blocksize
+ * & bdev awu units.
+ * With bigalloc it will be based upon bigalloc cluster size & bdev awu units.
* @sb: super block
- * TODO: Later add support for bigalloc
*/
static void ext4_atomic_write_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct block_device *bdev = sb->s_bdev;
+ unsigned int clustersize = EXT4_CLUSTER_SIZE(sb);
if (!bdev_can_atomic_write(bdev))
return;
@@ -4457,7 +4460,7 @@ static void ext4_atomic_write_init(struct super_block *sb)
sbi->s_awu_min = max(sb->s_blocksize,
bdev_atomic_write_unit_min_bytes(bdev));
- sbi->s_awu_max = min(sb->s_blocksize,
+ sbi->s_awu_max = min(clustersize,
bdev_atomic_write_unit_max_bytes(bdev));
if (sbi->s_awu_min && sbi->s_awu_max &&
sbi->s_awu_min <= sbi->s_awu_max) {
@@ -4482,7 +4485,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
+ mutex_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
@@ -4642,8 +4645,9 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo
/* Precompute checksum seed for all metadata */
if (ext4_has_feature_csum_seed(sb))
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
- else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ else if (ext4_has_feature_metadata_csum(sb) ||
+ ext4_has_feature_ea_inode(sb))
+ sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid,
sizeof(es->s_uuid));
return 0;
}
@@ -4973,10 +4977,7 @@ static int ext4_load_and_init_journal(struct super_block *sb,
return 0;
out:
- /* flush s_sb_upd_work before destroying the journal. */
- flush_work(&sbi->s_sb_upd_work);
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
return -EINVAL;
}
@@ -5013,6 +5014,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb)
return 0;
}
+static const char *ext4_has_journal_option(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ return "journal_async_commit";
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM))
+ return "journal_checksum";
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+ return "commit=";
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt))
+ return "data=";
+ if (test_opt(sb, DATA_ERR_ABORT))
+ return "data_err=abort";
+ return NULL;
+}
+
static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb,
int silent)
{
@@ -5239,7 +5258,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* Set defaults for the variables that will be set during parsing */
if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -5263,6 +5282,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+ sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB;
+ sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC;
/*
* set default s_li_wait_mult for lazyinit, for the case there is
@@ -5298,9 +5319,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
- /* i_version is always enabled now */
- sb->s_flags |= SB_I_VERSION;
-
/* HSM events are allowed by default. */
sb->s_iflags |= SB_I_ALLOW_HSM;
@@ -5398,36 +5416,25 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
err = ext4_load_and_init_journal(sb, es, ctx);
if (err)
goto failed_mount3a;
+ if (bdev_read_only(sb->s_bdev))
+ needs_recovery = 0;
} else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) &&
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
goto failed_mount3a;
} else {
+ const char *journal_option;
+
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_async_commit, fs mounted w/o journal");
+ journal_option = ext4_has_journal_option(sb);
+ if (journal_option != NULL) {
+ ext4_msg(sb, KERN_ERR,
+ "can't mount with %s, fs mounted w/o journal",
+ journal_option);
goto failed_mount3a;
}
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
- goto failed_mount3a;
- }
- if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "commit=%lu, fs mounted w/o journal",
- sbi->s_commit_interval / HZ);
- goto failed_mount3a;
- }
- if (EXT4_MOUNT_DATA_FLAGS &
- (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "data=, fs mounted w/o journal");
- goto failed_mount3a;
- }
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
@@ -5616,9 +5623,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount9;
}
- if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
+ if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) {
ext4_msg(sb, KERN_WARNING,
"mounting with \"discard\" option, but the device does not support discard");
+ clear_opt(sb, DISCARD);
+ }
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
@@ -5665,10 +5674,7 @@ failed_mount_wq:
sbi->s_ea_block_cache = NULL;
if (sbi->s_journal) {
- /* flush s_sb_upd_work before journal destroy. */
- flush_work(&sbi->s_sb_upd_work);
- jbd2_journal_destroy(sbi->s_journal);
- sbi->s_journal = NULL;
+ ext4_journal_destroy(sbi, sbi->s_journal);
}
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
@@ -5676,7 +5682,7 @@ failed_mount3:
/* flush s_sb_upd_work before sbi destroy */
flush_work(&sbi->s_sb_upd_work);
ext4_stop_mmpd(sbi);
- del_timer_sync(&sbi->s_err_report);
+ timer_delete_sync(&sbi->s_err_report);
ext4_group_desc_free(sbi);
failed_mount:
#if IS_ENABLED(CONFIG_UNICODE)
@@ -5773,10 +5779,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_BARRIER;
else
journal->j_flags &= ~JBD2_BARRIER;
- if (test_opt(sb, DATA_ERR_ABORT))
- journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
- else
- journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
/*
* Always enable journal cycle record option, letting the journal
* records log transactions continuously between each mount.
@@ -5916,7 +5918,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
errno = -EFSCORRUPTED;
goto out_bh;
@@ -5973,7 +5975,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
return journal;
out_journal:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
out_bdev:
bdev_fput(bdev_file);
return ERR_PTR(errno);
@@ -6090,8 +6092,7 @@ static int ext4_load_journal(struct super_block *sb,
EXT4_SB(sb)->s_journal = journal;
err = ext4_clear_journal_err(sb, es);
if (err) {
- EXT4_SB(sb)->s_journal = NULL;
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@@ -6109,7 +6110,7 @@ static int ext4_load_journal(struct super_block *sb,
return 0;
err_out:
- jbd2_journal_destroy(journal);
+ ext4_journal_destroy(EXT4_SB(sb), journal);
return err;
}
@@ -6336,8 +6337,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
bool needs_barrier = false;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (unlikely(ext4_forced_shutdown(sb)))
- return -EIO;
+ ret = ext4_emergency_state(sb);
+ if (unlikely(ret))
+ return ret;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -6419,7 +6421,7 @@ out:
*/
static int ext4_unfreeze(struct super_block *sb)
{
- if (ext4_forced_shutdown(sb))
+ if (ext4_emergency_state(sb))
return 0;
if (EXT4_SB(sb)->s_journal) {
@@ -6495,7 +6497,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
ctx->journal_ioprio =
sbi->s_journal->j_task->io_context->ioprio;
else
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
}
@@ -6575,7 +6577,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
flush_work(&sbi->s_sb_upd_work);
if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) {
- if (ext4_forced_shutdown(sb)) {
+ if (ext4_emergency_state(sb)) {
err = -EROFS;
goto restore_opts;
}
@@ -6780,6 +6782,7 @@ static int ext4_reconfigure(struct fs_context *fc)
{
struct super_block *sb = fc->root->d_sb;
int ret;
+ bool old_ro = sb_rdonly(sb);
fc->s_fs_info = EXT4_SB(sb);
@@ -6791,9 +6794,9 @@ static int ext4_reconfigure(struct fs_context *fc)
if (ret < 0)
return ret;
- ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.",
- &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w",
- ext4_quota_mode(sb));
+ ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.",
+ &sb->s_uuid,
+ (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : "");
return 0;
}
@@ -6817,22 +6820,29 @@ static int ext4_statfs_project(struct super_block *sb,
dquot->dq_dqb.dqb_bhardlimit);
limit >>= sb->s_blocksize_bits;
- if (limit && buf->f_blocks > limit) {
+ if (limit) {
+ uint64_t remaining = 0;
+
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
@@ -6935,12 +6945,25 @@ static int ext4_release_dquot(struct dquot *dquot)
{
int ret, err;
handle_t *handle;
+ bool freeze_protected = false;
+
+ /*
+ * Trying to sb_start_intwrite() in a running transaction
+ * can result in a deadlock. Further, running transactions
+ * are already protected from freezing.
+ */
+ if (!ext4_journal_current_handle()) {
+ sb_start_intwrite(dquot->dq_sb);
+ freeze_protected = true;
+ }
handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA,
EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (IS_ERR(handle)) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
return PTR_ERR(handle);
}
ret = dquot_release(dquot);
@@ -6951,6 +6974,10 @@ static int ext4_release_dquot(struct dquot *dquot)
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
+
+ if (freeze_protected)
+ sb_end_intwrite(dquot->dq_sb);
+
return ret;
}
@@ -7288,7 +7315,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, len);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
unlock_buffer(bh);
err = ext4_handle_dirty_metadata(handle, NULL, bh);
brelse(bh);
@@ -7381,12 +7408,9 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
-/* Shared across all ext4 file systems */
-wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
-
static int __init ext4_init_fs(void)
{
- int i, err;
+ int err;
ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64);
ext4_li_info = NULL;
@@ -7394,9 +7418,6 @@ static int __init ext4_init_fs(void)
/* Build-time check for flags consistency */
ext4_check_flag_values();
- for (i = 0; i < EXT4_WQ_HASH_SZ; i++)
- init_waitqueue_head(&ext4__ioend_wq[i]);
-
err = ext4_init_es();
if (err)
return err;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index ddb54608ca2e..987bd00f916a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task);
EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch);
EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
+EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
+EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
static unsigned int old_bump_val = 128;
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(mb_prefetch),
ATTR_LIST(mb_prefetch_limit),
ATTR_LIST(last_trim_minblks),
+ ATTR_LIST(sb_update_sec),
+ ATTR_LIST(sb_update_kb),
NULL,
};
ATTRIBUTE_GROUPS(ext4);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7647e9f6e190..5a6fe1513fd2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
__u32 dummy_csum = 0;
int offset = offsetof(struct ext4_xattr_header, h_checksum);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
offset += sizeof(dummy_csum);
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ csum = ext4_chksum(csum, (__u8 *)hdr + offset,
EXT4_BLOCK_SIZE(inode->i_sb) - offset);
return cpu_to_le32(csum);
@@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
struct ext4_xattr_header *hdr = BHDR(bh);
int ret = 1;
- if (ext4_has_metadata_csum(inode->i_sb)) {
+ if (ext4_has_feature_metadata_csum(inode->i_sb)) {
lock_buffer(bh);
ret = (hdr->h_checksum == ext4_xattr_block_csum(inode,
bh->b_blocknr, hdr));
@@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode,
static void ext4_xattr_block_csum_set(struct inode *inode,
struct buffer_head *bh)
{
- if (ext4_has_metadata_csum(inode->i_sb))
+ if (ext4_has_feature_metadata_csum(inode->i_sb))
BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode,
bh->b_blocknr, BHDR(bh));
}
@@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh,
__ext4_xattr_check_block((inode), (bh), __func__, __LINE__)
-static inline int
+int
__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
void *end, const char *function, unsigned int line)
{
@@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
function, line);
}
-#define xattr_check_inode(inode, header, end) \
- __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
-
static int
xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
void *end, int name_index, const char *name, int sorted)
@@ -341,7 +338,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
cmp = name_len - entry->e_name_len;
if (!cmp)
cmp = memcmp(name, entry->e_name, name_len);
- if (cmp <= 0 && (sorted || cmp == 0))
+ if (!cmp || (cmp < 0 && sorted))
break;
}
*pentry = entry;
@@ -351,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
- return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+ return ext4_chksum(sbi->s_csum_seed, buffer, size);
}
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
@@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
+ end = ITAIL(inode, raw_inode);
entry = IFIRST(header);
error = xattr_find_entry(inode, &entry, end, name_index, name, 0);
if (error)
@@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
- void *end;
int error;
if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
@@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
return error;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
error = ext4_xattr_list_entries(dentry, IFIRST(header),
buffer, buffer_size);
-cleanup:
brelse(iloc.bh);
return error;
}
@@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
qsize_t ea_inode_refs = 0;
- void *end;
int ret;
lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem);
@@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage)
goto out;
raw_inode = ext4_raw_inode(&iloc);
header = IHDR(inode, raw_inode);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
- ret = xattr_check_inode(inode, header, end);
- if (ret)
- goto out;
for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry))
@@ -979,7 +962,7 @@ int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
* so we need to reserve credits for this eventuality
*/
if (inode && ext4_has_inline_data(inode))
- credits += ext4_writepage_trans_blocks(inode) + 1;
+ credits += ext4_chunk_trans_extent(inode, 1) + 1;
/* We are done if ea_inode feature is not enabled. */
if (!ext4_has_feature_ea_inode(sb))
@@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
{
struct inode *ea_inode;
struct ext4_xattr_entry *entry;
+ struct ext4_iloc iloc;
bool dirty = false;
unsigned int ea_ino;
int err;
int credits;
+ void *end;
+
+ if (block_csum)
+ end = (void *)bh->b_data + bh->b_size;
+ else {
+ ext4_get_inode_loc(parent, &iloc);
+ end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size;
+ }
/* One credit for dec ref on ea_inode, one for orphan list addition, */
credits = 2 + extra_credits;
- for (entry = first; !IS_LAST_ENTRY(entry);
+ for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry);
entry = EXT4_XATTR_NEXT(entry)) {
if (!entry->e_value_inum)
continue;
@@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
is->s.here = is->s.first;
- is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ is->s.end = ITAIL(inode, raw_inode);
if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
- error = xattr_check_inode(inode, header, is->s.end);
- if (error)
- return error;
/* Find the named attribute. */
error = xattr_find_entry(inode, &is->s.here, is->s.end,
i->name_index, i->name, 0);
@@ -2786,14 +2775,10 @@ retry:
*/
base = IFIRST(header);
- end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
+ end = ITAIL(inode, raw_inode);
min_offs = end - base;
total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32);
- error = xattr_check_inode(inode, header, end);
- if (error)
- goto cleanup;
-
ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino);
if (ifree >= isize_diff)
goto shift;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index b25c2d7b5f99..1fedf44d4fb6 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -67,6 +67,9 @@ struct ext4_xattr_entry {
((void *)raw_inode + \
EXT4_GOOD_OLD_INODE_SIZE + \
EXT4_I(inode)->i_extra_isize))
+#define ITAIL(inode, raw_inode) \
+ ((void *)(raw_inode) + \
+ EXT4_SB((inode)->i_sb)->s_inode_size)
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
/*
@@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
+extern int
+__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header,
+ void *end, const char *function, unsigned int line);
+
+#define xattr_check_inode(inode, header, end) \
+ __xattr_check_inode((inode), (header), (end), __func__, __LINE__)
+
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1fbc0607363b..d4d7f329d23f 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -166,7 +166,7 @@ fail:
}
static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
- struct page *dpage)
+ struct folio *dfolio)
{
int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
void *value = NULL;
@@ -176,13 +176,13 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type,
if (type == ACL_TYPE_ACCESS)
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
- retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage);
+ retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dfolio);
if (retval > 0) {
value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
retval = f2fs_getxattr(inode, name_index, "", value,
- retval, dpage);
+ retval, dfolio);
}
if (retval > 0)
@@ -227,7 +227,7 @@ static int f2fs_acl_update_mode(struct mnt_idmap *idmap,
static int __f2fs_set_acl(struct mnt_idmap *idmap,
struct inode *inode, int type,
- struct posix_acl *acl, struct page *ipage)
+ struct posix_acl *acl, struct folio *ifolio)
{
int name_index;
void *value = NULL;
@@ -238,9 +238,8 @@ static int __f2fs_set_acl(struct mnt_idmap *idmap,
switch (type) {
case ACL_TYPE_ACCESS:
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
- if (acl && !ipage) {
- error = f2fs_acl_update_mode(idmap, inode,
- &mode, &acl);
+ if (acl && !ifolio) {
+ error = f2fs_acl_update_mode(idmap, inode, &mode, &acl);
if (error)
return error;
set_acl_inode(inode, mode);
@@ -265,7 +264,7 @@ static int __f2fs_set_acl(struct mnt_idmap *idmap,
}
}
- error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
+ error = f2fs_setxattr(inode, name_index, "", value, size, ifolio, 0);
kfree(value);
if (!error)
@@ -360,7 +359,7 @@ static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
static int f2fs_acl_create(struct inode *dir, umode_t *mode,
struct posix_acl **default_acl, struct posix_acl **acl,
- struct page *dpage)
+ struct folio *dfolio)
{
struct posix_acl *p;
struct posix_acl *clone;
@@ -372,7 +371,7 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
return 0;
- p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage);
+ p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dfolio);
if (!p || p == ERR_PTR(-EOPNOTSUPP)) {
*mode &= ~current_umask();
return 0;
@@ -409,29 +408,29 @@ release_acl:
return ret;
}
-int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
- struct page *dpage)
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct folio *ifolio,
+ struct folio *dfolio)
{
struct posix_acl *default_acl = NULL, *acl = NULL;
int error;
- error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage);
+ error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dfolio);
if (error)
return error;
f2fs_mark_inode_dirty_sync(inode, true);
if (default_acl) {
- error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl,
- ipage);
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT,
+ default_acl, ifolio);
posix_acl_release(default_acl);
} else {
inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
- error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl,
- ipage);
+ error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS,
+ acl, ifolio);
posix_acl_release(acl);
} else {
inode->i_acl = NULL;
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 94ebfbfbdc6f..20e87e63c089 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -33,17 +33,17 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
-extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool);
-extern int f2fs_set_acl(struct mnt_idmap *, struct dentry *,
+struct posix_acl *f2fs_get_acl(struct inode *, int, bool);
+int f2fs_set_acl(struct mnt_idmap *, struct dentry *,
struct posix_acl *, int);
-extern int f2fs_init_acl(struct inode *, struct inode *, struct page *,
- struct page *);
+int f2fs_init_acl(struct inode *, struct inode *, struct folio *ifolio,
+ struct folio *dfolio);
#else
#define f2fs_get_acl NULL
#define f2fs_set_acl NULL
static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
- struct page *ipage, struct page *dpage)
+ struct folio *ifolio, struct folio *dfolio)
{
return 0;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index efda9a022981..db3831f7f2f5 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -21,7 +21,7 @@
#include "iostat.h"
#include <trace/events/f2fs.h>
-#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
static struct kmem_cache *ino_entry_slab;
struct kmem_cache *f2fs_inode_entry_slab;
@@ -29,7 +29,7 @@ struct kmem_cache *f2fs_inode_entry_slab;
void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
unsigned char reason)
{
- f2fs_build_fault_attr(sbi, 0, 0);
+ f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL);
if (!end_io)
f2fs_flush_merged_writes(sbi);
f2fs_handle_critical_error(sbi, reason);
@@ -38,27 +38,27 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
/*
* We guarantee no failure on the returned page.
*/
-struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct address_space *mapping = META_MAPPING(sbi);
- struct page *page;
+ struct folio *folio;
repeat:
- page = f2fs_grab_cache_page(mapping, index, false);
- if (!page) {
+ folio = f2fs_grab_cache_folio(mapping, index, false);
+ if (IS_ERR(folio)) {
cond_resched();
goto repeat;
}
- f2fs_wait_on_page_writeback(page, META, true, true);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- return page;
+ f2fs_folio_wait_writeback(folio, META, true, true);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ return folio;
}
-static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
+static struct folio *__get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index,
bool is_meta)
{
struct address_space *mapping = META_MAPPING(sbi);
- struct page *page;
+ struct folio *folio;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
@@ -74,64 +74,64 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
if (unlikely(!is_meta))
fio.op_flags &= ~REQ_META;
repeat:
- page = f2fs_grab_cache_page(mapping, index, false);
- if (!page) {
+ folio = f2fs_grab_cache_folio(mapping, index, false);
+ if (IS_ERR(folio)) {
cond_resched();
goto repeat;
}
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
goto out;
- fio.page = page;
+ fio.folio = folio;
err = f2fs_submit_page_bio(&fio);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE);
- lock_page(page);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
+ folio_lock(folio);
+ if (unlikely(!is_meta_folio(folio))) {
+ f2fs_folio_put(folio, true);
goto repeat;
}
- if (unlikely(!PageUptodate(page))) {
- f2fs_handle_page_eio(sbi, page_folio(page), META);
- f2fs_put_page(page, 1);
+ if (unlikely(!folio_test_uptodate(folio))) {
+ f2fs_handle_page_eio(sbi, folio, META);
+ f2fs_folio_put(folio, true);
return ERR_PTR(-EIO);
}
out:
- return page;
+ return folio;
}
-struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index)
{
- return __get_meta_page(sbi, index, true);
+ return __get_meta_folio(sbi, index, true);
}
-struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index)
+struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index)
{
- struct page *page;
+ struct folio *folio;
int count = 0;
retry:
- page = __get_meta_page(sbi, index, true);
- if (IS_ERR(page)) {
- if (PTR_ERR(page) == -EIO &&
+ folio = __get_meta_folio(sbi, index, true);
+ if (IS_ERR(folio)) {
+ if (PTR_ERR(folio) == -EIO &&
++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE);
}
- return page;
+ return folio;
}
/* for POR only */
-struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index)
{
- return __get_meta_page(sbi, index, false);
+ return __get_meta_folio(sbi, index, false);
}
static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
@@ -252,7 +252,6 @@ bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
{
- struct page *page;
block_t blkno = start;
struct f2fs_io_info fio = {
.sbi = sbi,
@@ -271,6 +270,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
blk_start_plug(&plug);
for (; nrpages-- > 0; blkno++) {
+ struct folio *folio;
if (!f2fs_is_valid_blkaddr(sbi, blkno, type))
goto out;
@@ -300,18 +300,18 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
BUG();
}
- page = f2fs_grab_cache_page(META_MAPPING(sbi),
+ folio = f2fs_grab_cache_folio(META_MAPPING(sbi),
fio.new_blkaddr, false);
- if (!page)
+ if (IS_ERR(folio))
continue;
- if (PageUptodate(page)) {
- f2fs_put_page(page, 1);
+ if (folio_test_uptodate(folio)) {
+ f2fs_folio_put(folio, true);
continue;
}
- fio.page = page;
+ fio.folio = folio;
err = f2fs_submit_page_bio(&fio);
- f2fs_put_page(page, err ? 1 : 0);
+ f2fs_folio_put(folio, err ? true : false);
if (!err)
f2fs_update_iostat(sbi, NULL, FS_META_READ_IO,
@@ -325,27 +325,26 @@ out:
void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
unsigned int ra_blocks)
{
- struct page *page;
+ struct folio *folio;
bool readahead = false;
if (ra_blocks == RECOVERY_MIN_RA_BLOCKS)
return;
- page = find_get_page(META_MAPPING(sbi), index);
- if (!page || !PageUptodate(page))
+ folio = filemap_get_folio(META_MAPPING(sbi), index);
+ if (IS_ERR(folio) || !folio_test_uptodate(folio))
readahead = true;
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
if (readahead)
f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true);
}
-static int __f2fs_write_meta_page(struct page *page,
+static bool __f2fs_write_meta_folio(struct folio *folio,
struct writeback_control *wbc,
enum iostat_type io_type)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- struct folio *folio = page_folio(page);
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
trace_f2fs_writepage(folio, META);
@@ -354,37 +353,26 @@ static int __f2fs_write_meta_page(struct page *page,
folio_clear_uptodate(folio);
dec_page_count(sbi, F2FS_DIRTY_META);
folio_unlock(folio);
- return 0;
+ return true;
}
goto redirty_out;
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
- if (wbc->for_reclaim && folio->index < GET_SUM_BLOCK(sbi, 0))
- goto redirty_out;
f2fs_do_write_meta_page(sbi, folio, io_type);
dec_page_count(sbi, F2FS_DIRTY_META);
- if (wbc->for_reclaim)
- f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META);
-
folio_unlock(folio);
if (unlikely(f2fs_cp_error(sbi)))
f2fs_submit_merged_write(sbi, META);
- return 0;
+ return true;
redirty_out:
- redirty_page_for_writepage(wbc, page);
- return AOP_WRITEPAGE_ACTIVATE;
-}
-
-static int f2fs_write_meta_page(struct page *page,
- struct writeback_control *wbc)
-{
- return __f2fs_write_meta_page(page, wbc, FS_META_IO);
+ folio_redirty_for_writepage(wbc, folio);
+ return false;
}
static int f2fs_write_meta_pages(struct address_space *mapping,
@@ -427,9 +415,7 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
struct folio_batch fbatch;
long nwritten = 0;
int nr_folios;
- struct writeback_control wbc = {
- .for_reclaim = 0,
- };
+ struct writeback_control wbc = {};
struct blk_plug plug;
folio_batch_init(&fbatch);
@@ -453,7 +439,7 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
folio_lock(folio);
- if (unlikely(folio->mapping != mapping)) {
+ if (unlikely(!is_meta_folio(folio))) {
continue_unlock:
folio_unlock(folio);
continue;
@@ -463,13 +449,12 @@ continue_unlock:
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(&folio->page, META,
- true, true);
+ f2fs_folio_wait_writeback(folio, META, true, true);
if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
- if (__f2fs_write_meta_page(&folio->page, &wbc,
+ if (!__f2fs_write_meta_folio(folio, &wbc,
io_type)) {
folio_unlock(folio);
break;
@@ -500,14 +485,13 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping,
folio_mark_uptodate(folio);
if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META);
- set_page_private_reference(&folio->page);
+ folio_set_f2fs_reference(folio);
return true;
}
return false;
}
const struct address_space_operations f2fs_meta_aops = {
- .writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
.dirty_folio = f2fs_dirty_meta_folio,
.invalidate_folio = f2fs_invalidate_folio,
@@ -520,6 +504,7 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
{
struct inode_management *im = &sbi->im[type];
struct ino_entry *e = NULL, *new = NULL;
+ int ret;
if (type == FLUSH_INO) {
rcu_read_lock();
@@ -532,7 +517,8 @@ retry:
new = f2fs_kmem_cache_alloc(ino_entry_slab,
GFP_NOFS, true, NULL);
- radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
+ ret = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
+ f2fs_bug_on(sbi, ret);
spin_lock(&im->ino_lock);
e = radix_tree_lookup(&im->ino_root, ino);
@@ -757,26 +743,26 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi)
f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
for (i = 0; i < orphan_blocks; i++) {
- struct page *page;
+ struct folio *folio;
struct f2fs_orphan_block *orphan_blk;
- page = f2fs_get_meta_page(sbi, start_blk + i);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_get_meta_folio(sbi, start_blk + i);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto out;
}
- orphan_blk = (struct f2fs_orphan_block *)page_address(page);
+ orphan_blk = folio_address(folio);
for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
err = recover_orphan_inode(sbi, ino);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
goto out;
}
}
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
}
/* clear Orphan Flag */
clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG);
@@ -793,7 +779,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
unsigned int nentries = 0;
unsigned short index = 1;
unsigned short orphan_blocks;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct ino_entry *orphan = NULL;
struct inode_management *im = &sbi->im[ORPHAN_INO];
@@ -808,10 +794,9 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
/* loop for each orphan inode entry and write them in journal block */
list_for_each_entry(orphan, head, list) {
- if (!page) {
- page = f2fs_grab_meta_page(sbi, start_blk++);
- orphan_blk =
- (struct f2fs_orphan_block *)page_address(page);
+ if (!folio) {
+ folio = f2fs_grab_meta_folio(sbi, start_blk++);
+ orphan_blk = folio_address(folio);
memset(orphan_blk, 0, sizeof(*orphan_blk));
}
@@ -826,62 +811,61 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
orphan_blk->blk_addr = cpu_to_le16(index);
orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
orphan_blk->entry_count = cpu_to_le32(nentries);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
index++;
nentries = 0;
- page = NULL;
+ folio = NULL;
}
}
- if (page) {
+ if (folio) {
orphan_blk->blk_addr = cpu_to_le16(index);
orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
orphan_blk->entry_count = cpu_to_le32(nentries);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
}
}
-static __u32 f2fs_checkpoint_chksum(struct f2fs_sb_info *sbi,
- struct f2fs_checkpoint *ckpt)
+static __u32 f2fs_checkpoint_chksum(struct f2fs_checkpoint *ckpt)
{
unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset);
__u32 chksum;
- chksum = f2fs_crc32(sbi, ckpt, chksum_ofs);
+ chksum = f2fs_crc32(ckpt, chksum_ofs);
if (chksum_ofs < CP_CHKSUM_OFFSET) {
chksum_ofs += sizeof(chksum);
- chksum = f2fs_chksum(sbi, chksum, (__u8 *)ckpt + chksum_ofs,
- F2FS_BLKSIZE - chksum_ofs);
+ chksum = f2fs_chksum(chksum, (__u8 *)ckpt + chksum_ofs,
+ F2FS_BLKSIZE - chksum_ofs);
}
return chksum;
}
static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
- struct f2fs_checkpoint **cp_block, struct page **cp_page,
+ struct f2fs_checkpoint **cp_block, struct folio **cp_folio,
unsigned long long *version)
{
size_t crc_offset = 0;
__u32 crc;
- *cp_page = f2fs_get_meta_page(sbi, cp_addr);
- if (IS_ERR(*cp_page))
- return PTR_ERR(*cp_page);
+ *cp_folio = f2fs_get_meta_folio(sbi, cp_addr);
+ if (IS_ERR(*cp_folio))
+ return PTR_ERR(*cp_folio);
- *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page);
+ *cp_block = folio_address(*cp_folio);
crc_offset = le32_to_cpu((*cp_block)->checksum_offset);
if (crc_offset < CP_MIN_CHKSUM_OFFSET ||
crc_offset > CP_CHKSUM_OFFSET) {
- f2fs_put_page(*cp_page, 1);
+ f2fs_folio_put(*cp_folio, true);
f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset);
return -EINVAL;
}
- crc = f2fs_checkpoint_chksum(sbi, *cp_block);
+ crc = f2fs_checkpoint_chksum(*cp_block);
if (crc != cur_cp_crc(*cp_block)) {
- f2fs_put_page(*cp_page, 1);
+ f2fs_folio_put(*cp_folio, true);
f2fs_warn(sbi, "invalid crc value");
return -EINVAL;
}
@@ -890,17 +874,17 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr,
return 0;
}
-static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
+static struct folio *validate_checkpoint(struct f2fs_sb_info *sbi,
block_t cp_addr, unsigned long long *version)
{
- struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
+ struct folio *cp_folio_1 = NULL, *cp_folio_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
- &cp_page_1, version);
+ &cp_folio_1, version);
if (err)
return NULL;
@@ -915,19 +899,19 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
- &cp_page_2, version);
+ &cp_folio_2, version);
if (err)
goto invalid_cp;
cur_version = *version;
if (cur_version == pre_version) {
*version = cur_version;
- f2fs_put_page(cp_page_2, 1);
- return cp_page_1;
+ f2fs_folio_put(cp_folio_2, true);
+ return cp_folio_1;
}
- f2fs_put_page(cp_page_2, 1);
+ f2fs_folio_put(cp_folio_2, true);
invalid_cp:
- f2fs_put_page(cp_page_1, 1);
+ f2fs_folio_put(cp_folio_1, true);
return NULL;
}
@@ -935,7 +919,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *cp_block;
struct f2fs_super_block *fsb = sbi->raw_super;
- struct page *cp1, *cp2, *cur_page;
+ struct folio *cp1, *cp2, *cur_folio;
unsigned long blk_size = sbi->blocksize;
unsigned long long cp1_version = 0, cp2_version = 0;
unsigned long long cp_start_blk_no;
@@ -962,22 +946,22 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
if (cp1 && cp2) {
if (ver_after(cp2_version, cp1_version))
- cur_page = cp2;
+ cur_folio = cp2;
else
- cur_page = cp1;
+ cur_folio = cp1;
} else if (cp1) {
- cur_page = cp1;
+ cur_folio = cp1;
} else if (cp2) {
- cur_page = cp2;
+ cur_folio = cp2;
} else {
err = -EFSCORRUPTED;
goto fail_no_cp;
}
- cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
+ cp_block = folio_address(cur_folio);
memcpy(sbi->ckpt, cp_block, blk_size);
- if (cur_page == cp1)
+ if (cur_folio == cp1)
sbi->cur_cp_pack = 1;
else
sbi->cur_cp_pack = 2;
@@ -992,30 +976,30 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
goto done;
cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
- if (cur_page == cp2)
+ if (cur_folio == cp2)
cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg));
for (i = 1; i < cp_blks; i++) {
void *sit_bitmap_ptr;
unsigned char *ckpt = (unsigned char *)sbi->ckpt;
- cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i);
- if (IS_ERR(cur_page)) {
- err = PTR_ERR(cur_page);
+ cur_folio = f2fs_get_meta_folio(sbi, cp_blk_no + i);
+ if (IS_ERR(cur_folio)) {
+ err = PTR_ERR(cur_folio);
goto free_fail_no_cp;
}
- sit_bitmap_ptr = page_address(cur_page);
+ sit_bitmap_ptr = folio_address(cur_folio);
memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
- f2fs_put_page(cur_page, 1);
+ f2fs_folio_put(cur_folio, true);
}
done:
- f2fs_put_page(cp1, 1);
- f2fs_put_page(cp2, 1);
+ f2fs_folio_put(cp1, true);
+ f2fs_folio_put(cp2, true);
return 0;
free_fail_no_cp:
- f2fs_put_page(cp1, 1);
- f2fs_put_page(cp2, 1);
+ f2fs_folio_put(cp1, true);
+ f2fs_folio_put(cp2, true);
fail_no_cp:
kvfree(sbi->ckpt);
return err;
@@ -1061,7 +1045,7 @@ void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio)
inode_inc_dirty_pages(inode);
spin_unlock(&sbi->inode_lock[type]);
- set_page_private_reference(&folio->page);
+ folio_set_f2fs_reference(folio);
}
void f2fs_remove_dirty_inode(struct inode *inode)
@@ -1225,7 +1209,6 @@ static int block_operations(struct f2fs_sb_info *sbi)
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .for_reclaim = 0,
};
int err = 0, cnt = 0;
@@ -1237,7 +1220,7 @@ static int block_operations(struct f2fs_sb_info *sbi)
retry_flush_quotas:
f2fs_lock_all(sbi);
if (__need_flush_quota(sbi)) {
- int locked;
+ bool need_lock = sbi->umount_lock_holder != current;
if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) {
set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH);
@@ -1246,11 +1229,13 @@ retry_flush_quotas:
}
f2fs_unlock_all(sbi);
- /* only failed during mount/umount/freeze/quotactl */
- locked = down_read_trylock(&sbi->sb->s_umount);
- f2fs_quota_sync(sbi->sb, -1);
- if (locked)
+ /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */
+ if (!need_lock) {
+ f2fs_do_quota_sync(sbi->sb, -1);
+ } else if (down_read_trylock(&sbi->sb->s_umount)) {
+ f2fs_do_quota_sync(sbi->sb, -1);
up_read(&sbi->sb->s_umount);
+ }
cond_resched();
goto retry_flush_quotas;
}
@@ -1344,21 +1329,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
unsigned long flags;
- if (cpc->reason & CP_UMOUNT) {
- if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
- NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) {
- clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- f2fs_notice(sbi, "Disable nat_bits due to no space");
- } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
- f2fs_nat_bitmap_enabled(sbi)) {
- f2fs_enable_nat_bits(sbi);
- set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- f2fs_notice(sbi, "Rebuild and enable nat_bits");
- }
- }
-
spin_lock_irqsave(&sbi->cp_lock, flags);
+ if ((cpc->reason & CP_UMOUNT) &&
+ le32_to_cpu(ckpt->cp_pack_total_block_count) >
+ sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks)
+ disable_nat_bits(sbi, false);
+
if (cpc->reason & CP_TRIMMED)
__set_ckpt_flags(ckpt, CP_TRIMMED_FLAG);
else
@@ -1415,35 +1392,31 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
static void commit_checkpoint(struct f2fs_sb_info *sbi,
void *src, block_t blk_addr)
{
- struct writeback_control wbc = {
- .for_reclaim = 0,
- };
+ struct writeback_control wbc = {};
/*
- * filemap_get_folios_tag and lock_page again will take
+ * filemap_get_folios_tag and folio_lock again will take
* some extra time. Therefore, f2fs_update_meta_pages and
* f2fs_sync_meta_pages are combined in this function.
*/
- struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
- int err;
-
- f2fs_wait_on_page_writeback(page, META, true, true);
+ struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);
- memcpy(page_address(page), src, PAGE_SIZE);
+ memcpy(folio_address(folio), src, PAGE_SIZE);
- set_page_dirty(page);
- if (unlikely(!clear_page_dirty_for_io(page)))
+ folio_mark_dirty(folio);
+ if (unlikely(!folio_clear_dirty_for_io(folio)))
f2fs_bug_on(sbi, 1);
/* writeout cp pack 2 page */
- err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO);
- if (unlikely(err && f2fs_cp_error(sbi))) {
- f2fs_put_page(page, 1);
- return;
+ if (unlikely(!__f2fs_write_meta_folio(folio, &wbc, FS_CP_META_IO))) {
+ if (f2fs_cp_error(sbi)) {
+ f2fs_folio_put(folio, true);
+ return;
+ }
+ f2fs_bug_on(sbi, true);
}
- f2fs_bug_on(sbi, err);
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
/* submit checkpoint (with barrier if NOBARRIER is not set) */
f2fs_submit_merged_write(sbi, META_FLUSH);
@@ -1533,7 +1506,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
- crc32 = f2fs_checkpoint_chksum(sbi, ckpt);
+ crc32 = f2fs_checkpoint_chksum(ckpt);
*((__le32 *)((unsigned char *)ckpt +
le32_to_cpu(ckpt->checksum_offset)))
= cpu_to_le32(crc32);
@@ -1541,8 +1514,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_next_addr(sbi);
/* write nat bits */
- if ((cpc->reason & CP_UMOUNT) &&
- is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) {
+ if (enabled_nat_bits(sbi, cpc)) {
__u64 cp_ver = cur_cp_version(ckpt);
block_t blk;
@@ -1867,7 +1839,8 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi)
struct cp_control cpc;
cpc.reason = __get_cp_reason(sbi);
- if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) {
+ if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC ||
+ sbi->umount_lock_holder == current) {
int ret;
f2fs_down_write(&sbi->gc_lock);
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 985690d81a82..5c1f47e45dab 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -23,20 +23,18 @@
static struct kmem_cache *cic_entry_slab;
static struct kmem_cache *dic_entry_slab;
-static void *page_array_alloc(struct inode *inode, int nr)
+static void *page_array_alloc(struct f2fs_sb_info *sbi, int nr)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int size = sizeof(struct page *) * nr;
if (likely(size <= sbi->page_array_slab_size))
return f2fs_kmem_cache_alloc(sbi->page_array_slab,
- GFP_F2FS_ZERO, false, F2FS_I_SB(inode));
+ GFP_F2FS_ZERO, false, sbi);
return f2fs_kzalloc(sbi, size, GFP_NOFS);
}
-static void page_array_free(struct inode *inode, void *pages, int nr)
+static void page_array_free(struct f2fs_sb_info *sbi, void *pages, int nr)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int size = sizeof(struct page *) * nr;
if (!pages)
@@ -73,17 +71,15 @@ static pgoff_t start_idx_of_cluster(struct compress_ctx *cc)
return cc->cluster_idx << cc->log_cluster_size;
}
-bool f2fs_is_compressed_page(struct page *page)
+bool f2fs_is_compressed_page(struct folio *folio)
{
- if (!PagePrivate(page))
- return false;
- if (!page_private(page))
+ if (!folio->private)
return false;
- if (page_private_nonpointer(page))
+ if (folio_test_f2fs_nonpointer(folio))
return false;
- f2fs_bug_on(F2FS_M_SB(page->mapping),
- *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC);
+ f2fs_bug_on(F2FS_F_SB(folio),
+ *((u32 *)folio->private) != F2FS_COMPRESSED_PAGE_MAGIC);
return true;
}
@@ -137,9 +133,11 @@ static void f2fs_put_rpages_wbc(struct compress_ctx *cc,
}
}
-struct page *f2fs_compress_control_page(struct page *page)
+struct folio *f2fs_compress_control_folio(struct folio *folio)
{
- return ((struct compress_io_ctx *)page_private(page))->rpages[0];
+ struct compress_io_ctx *ctx = folio->private;
+
+ return page_folio(ctx->rpages[0]);
}
int f2fs_init_compress_ctx(struct compress_ctx *cc)
@@ -147,13 +145,13 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc)
if (cc->rpages)
return 0;
- cc->rpages = page_array_alloc(cc->inode, cc->cluster_size);
+ cc->rpages = page_array_alloc(F2FS_I_SB(cc->inode), cc->cluster_size);
return cc->rpages ? 0 : -ENOMEM;
}
void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse)
{
- page_array_free(cc->inode, cc->rpages, cc->cluster_size);
+ page_array_free(F2FS_I_SB(cc->inode), cc->rpages, cc->cluster_size);
cc->rpages = NULL;
cc->nr_rpages = 0;
cc->nr_cpages = 0;
@@ -178,8 +176,8 @@ void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio)
#ifdef CONFIG_F2FS_FS_LZO
static int lzo_init_compress_ctx(struct compress_ctx *cc)
{
- cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
- LZO1X_MEM_COMPRESS, GFP_NOFS);
+ cc->private = f2fs_vmalloc(F2FS_I_SB(cc->inode),
+ LZO1X_MEM_COMPRESS);
if (!cc->private)
return -ENOMEM;
@@ -189,7 +187,7 @@ static int lzo_init_compress_ctx(struct compress_ctx *cc)
static void lzo_destroy_compress_ctx(struct compress_ctx *cc)
{
- kvfree(cc->private);
+ vfree(cc->private);
cc->private = NULL;
}
@@ -214,13 +212,13 @@ static int lzo_decompress_pages(struct decompress_io_ctx *dic)
ret = lzo1x_decompress_safe(dic->cbuf->cdata, dic->clen,
dic->rbuf, &dic->rlen);
if (ret != LZO_E_OK) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lzo decompress failed, ret:%d", ret);
return -EIO;
}
if (dic->rlen != PAGE_SIZE << dic->log_cluster_size) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lzo invalid rlen:%zu, expected:%lu",
dic->rlen, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
@@ -246,7 +244,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc)
size = LZ4HC_MEM_COMPRESS;
#endif
- cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS);
+ cc->private = f2fs_vmalloc(F2FS_I_SB(cc->inode), size);
if (!cc->private)
return -ENOMEM;
@@ -261,7 +259,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc)
static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
{
- kvfree(cc->private);
+ vfree(cc->private);
cc->private = NULL;
}
@@ -294,13 +292,13 @@ static int lz4_decompress_pages(struct decompress_io_ctx *dic)
ret = LZ4_decompress_safe(dic->cbuf->cdata, dic->rbuf,
dic->clen, dic->rlen);
if (ret < 0) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lz4 decompress failed, ret:%d", ret);
return -EIO;
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"lz4 invalid ret:%d, expected:%lu",
ret, PAGE_SIZE << dic->log_cluster_size);
return -EIO;
@@ -342,8 +340,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
params = zstd_get_params(level, cc->rlen);
workspace_size = zstd_cstream_workspace_bound(&params.cParams);
- workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
- workspace_size, GFP_NOFS);
+ workspace = f2fs_vmalloc(F2FS_I_SB(cc->inode), workspace_size);
if (!workspace)
return -ENOMEM;
@@ -351,7 +348,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
if (!stream) {
f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
"%s zstd_init_cstream failed", __func__);
- kvfree(workspace);
+ vfree(workspace);
return -EIO;
}
@@ -364,7 +361,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
static void zstd_destroy_compress_ctx(struct compress_ctx *cc)
{
- kvfree(cc->private);
+ vfree(cc->private);
cc->private = NULL;
cc->private2 = NULL;
}
@@ -423,16 +420,15 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
workspace_size = zstd_dstream_workspace_bound(max_window_size);
- workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
- workspace_size, GFP_NOFS);
+ workspace = f2fs_vmalloc(dic->sbi, workspace_size);
if (!workspace)
return -ENOMEM;
stream = zstd_init_dstream(max_window_size, workspace, workspace_size);
if (!stream) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"%s zstd_init_dstream failed", __func__);
- kvfree(workspace);
+ vfree(workspace);
return -EIO;
}
@@ -444,7 +440,7 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic)
{
- kvfree(dic->private);
+ vfree(dic->private);
dic->private = NULL;
dic->private2 = NULL;
}
@@ -466,14 +462,14 @@ static int zstd_decompress_pages(struct decompress_io_ctx *dic)
ret = zstd_decompress_stream(stream, &outbuf, &inbuf);
if (zstd_is_error(ret)) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"%s zstd_decompress_stream failed, ret: %d",
__func__, zstd_get_error_code(ret));
return -EIO;
}
if (dic->rlen != outbuf.pos) {
- f2fs_err_ratelimited(F2FS_I_SB(dic->inode),
+ f2fs_err_ratelimited(dic->sbi,
"%s ZSTD invalid rlen:%zu, expected:%lu",
__func__, dic->rlen,
PAGE_SIZE << dic->log_cluster_size);
@@ -593,11 +589,14 @@ static struct page *f2fs_compress_alloc_page(void)
static void f2fs_compress_free_page(struct page *page)
{
+ struct folio *folio;
+
if (!page)
return;
- detach_page_private(page);
- page->mapping = NULL;
- unlock_page(page);
+ folio = page_folio(page);
+ folio_detach_private(folio);
+ folio->mapping = NULL;
+ folio_unlock(folio);
mempool_free(page, compress_page_pool);
}
@@ -619,6 +618,7 @@ static void *f2fs_vmap(struct page **pages, unsigned int count)
static int f2fs_compress_pages(struct compress_ctx *cc)
{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
struct f2fs_inode_info *fi = F2FS_I(cc->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
@@ -639,7 +639,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
cc->valid_nr_cpages = cc->nr_cpages;
- cc->cpages = page_array_alloc(cc->inode, cc->nr_cpages);
+ cc->cpages = page_array_alloc(sbi, cc->nr_cpages);
if (!cc->cpages) {
ret = -ENOMEM;
goto destroy_compress_ctx;
@@ -674,8 +674,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
cc->cbuf->clen = cpu_to_le32(cc->clen);
if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))
- chksum = f2fs_crc32(F2FS_I_SB(cc->inode),
- cc->cbuf->cdata, cc->clen);
+ chksum = f2fs_crc32(cc->cbuf->cdata, cc->clen);
cc->cbuf->chksum = cpu_to_le32(chksum);
for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
@@ -714,7 +713,7 @@ out_free_cpages:
if (cc->cpages[i])
f2fs_compress_free_page(cc->cpages[i]);
}
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ page_array_free(sbi, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
destroy_compress_ctx:
if (cops->destroy_compress_ctx)
@@ -732,7 +731,7 @@ static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
{
- struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+ struct f2fs_sb_info *sbi = dic->sbi;
struct f2fs_inode_info *fi = F2FS_I(dic->inode);
const struct f2fs_compress_ops *cops =
f2fs_cops[fi->i_compress_algorithm];
@@ -771,7 +770,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) {
u32 provided = le32_to_cpu(dic->cbuf->chksum);
- u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen);
+ u32 calculated = f2fs_crc32(dic->cbuf->cdata, dic->clen);
if (provided != calculated) {
if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) {
@@ -794,25 +793,27 @@ out_end_io:
f2fs_decompress_end_io(dic, ret, in_task);
}
+static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
+ struct folio *folio, nid_t ino, block_t blkaddr);
+
/*
* This is called when a page of a compressed cluster has been read from disk
* (or failed to be read from disk). It checks whether this page was the last
* page being waited on in the cluster, and if so, it decompresses the cluster
* (or in the case of a failure, cleans up without actually decompressing).
*/
-void f2fs_end_read_compressed_page(struct page *page, bool failed,
+void f2fs_end_read_compressed_page(struct folio *folio, bool failed,
block_t blkaddr, bool in_task)
{
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
- struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode);
+ struct decompress_io_ctx *dic = folio->private;
+ struct f2fs_sb_info *sbi = dic->sbi;
dec_page_count(sbi, F2FS_RD_DATA);
if (failed)
WRITE_ONCE(dic->failed, true);
else if (blkaddr && in_task)
- f2fs_cache_compressed_page(sbi, page,
+ f2fs_cache_compressed_page(sbi, folio,
dic->inode->i_ino, blkaddr);
if (atomic_dec_and_test(&dic->remaining_pages))
@@ -909,7 +910,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn)
}
for (i = 1, count = 1; i < cluster_size; i++, count++) {
- block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio,
dn->ofs_in_node + i);
/* [COMPR_ADDR, ..., COMPR_ADDR] */
@@ -950,7 +951,7 @@ static int __f2fs_get_cluster_blocks(struct inode *inode,
int count, i;
for (i = 0, count = 0; i < cluster_size; i++) {
- block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio,
dn->ofs_in_node + i);
if (__is_valid_data_blkaddr(blkaddr))
@@ -1090,7 +1091,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode);
struct address_space *mapping = cc->inode->i_mapping;
- struct page *page;
+ struct folio *folio;
sector_t last_block_in_bio;
fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
pgoff_t start_idx = start_idx_of_cluster(cc);
@@ -1105,19 +1106,19 @@ retry:
if (ret)
return ret;
- /* keep page reference to avoid page reclaim */
+ /* keep folio reference to avoid page reclaim */
for (i = 0; i < cc->cluster_size; i++) {
- page = f2fs_pagecache_get_page(mapping, start_idx + i,
- fgp_flag, GFP_NOFS);
- if (!page) {
- ret = -ENOMEM;
+ folio = f2fs_filemap_get_folio(mapping, start_idx + i,
+ fgp_flag, GFP_NOFS);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
goto unlock_pages;
}
- if (PageUptodate(page))
- f2fs_put_page(page, 1);
+ if (folio_test_uptodate(folio))
+ f2fs_folio_put(folio, true);
else
- f2fs_compress_ctx_add_page(cc, page_folio(page));
+ f2fs_compress_ctx_add_page(cc, folio);
}
if (!f2fs_cluster_is_empty(cc)) {
@@ -1140,16 +1141,17 @@ retry:
for (i = 0; i < cc->cluster_size; i++) {
f2fs_bug_on(sbi, cc->rpages[i]);
- page = find_lock_page(mapping, start_idx + i);
- if (!page) {
- /* page can be truncated */
+ folio = filemap_lock_folio(mapping, start_idx + i);
+ if (IS_ERR(folio)) {
+ /* folio could be truncated */
goto release_and_retry;
}
- f2fs_wait_on_page_writeback(page, DATA, true, true);
- f2fs_compress_ctx_add_page(cc, page_folio(page));
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
+ f2fs_compress_ctx_add_page(cc, folio);
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
+ f2fs_handle_page_eio(sbi, folio, DATA);
release_and_retry:
f2fs_put_rpages(cc);
f2fs_unlock_rpages(cc, i + 1);
@@ -1316,7 +1318,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
goto out_unlock_op;
for (i = 0; i < cc->cluster_size; i++) {
- if (data_blkaddr(dn.inode, dn.node_page,
+ if (data_blkaddr(dn.inode, dn.node_folio,
dn.ofs_in_node + i) == NULL_ADDR)
goto out_put_dnode;
}
@@ -1337,7 +1339,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
atomic_set(&cic->pending_pages, cc->valid_nr_cpages);
- cic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
+ cic->rpages = page_array_alloc(sbi, cc->cluster_size);
if (!cic->rpages)
goto out_put_cic;
@@ -1348,7 +1350,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
page_folio(cc->rpages[i + 1])->index, cic);
fio.compressed_page = cc->cpages[i];
- fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page,
+ fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_folio,
dn.ofs_in_node + i + 1);
/* wait for GCed page writeback via META_MAPPING */
@@ -1417,7 +1419,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
(*submitted)++;
unlock_continue:
inode_dec_dirty_pages(cc->inode);
- unlock_page(fio.page);
+ folio_unlock(fio.folio);
}
if (fio.compr_blocks)
@@ -1439,13 +1441,13 @@ unlock_continue:
spin_unlock(&fi->i_size_lock);
f2fs_put_rpages(cc);
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ page_array_free(sbi, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
f2fs_destroy_compress_ctx(cc, false);
return 0;
out_destroy_crypt:
- page_array_free(cc->inode, cic->rpages, cc->cluster_size);
+ page_array_free(sbi, cic->rpages, cc->cluster_size);
for (--i; i >= 0; i--) {
if (!cc->cpages[i])
@@ -1466,21 +1468,21 @@ out_free:
f2fs_compress_free_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
- page_array_free(cc->inode, cc->cpages, cc->nr_cpages);
+ page_array_free(sbi, cc->cpages, cc->nr_cpages);
cc->cpages = NULL;
return -EAGAIN;
}
-void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
+void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio)
{
+ struct page *page = &folio->page;
struct f2fs_sb_info *sbi = bio->bi_private;
- struct compress_io_ctx *cic =
- (struct compress_io_ctx *)page_private(page);
- enum count_type type = WB_DATA_TYPE(page,
- f2fs_is_compressed_page(page));
+ struct compress_io_ctx *cic = folio->private;
+ enum count_type type = WB_DATA_TYPE(folio,
+ f2fs_is_compressed_page(folio));
int i;
- if (unlikely(bio->bi_status))
+ if (unlikely(bio->bi_status != BLK_STS_OK))
mapping_set_error(cic->inode->i_mapping, -EIO);
f2fs_compress_free_page(page);
@@ -1496,7 +1498,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
end_page_writeback(cic->rpages[i]);
}
- page_array_free(cic->inode, cic->rpages, cic->nr_rpages);
+ page_array_free(sbi, cic->rpages, cic->nr_rpages);
kmem_cache_free(cic_entry_slab, cic);
}
@@ -1528,37 +1530,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
f2fs_lock_op(sbi);
for (i = 0; i < cc->cluster_size; i++) {
+ struct folio *folio;
+
if (!cc->rpages[i])
continue;
+ folio = page_folio(cc->rpages[i]);
retry_write:
- lock_page(cc->rpages[i]);
+ folio_lock(folio);
- if (cc->rpages[i]->mapping != mapping) {
+ if (folio->mapping != mapping) {
continue_unlock:
- unlock_page(cc->rpages[i]);
+ folio_unlock(folio);
continue;
}
- if (!PageDirty(cc->rpages[i]))
+ if (!folio_test_dirty(folio))
goto continue_unlock;
- if (folio_test_writeback(page_folio(cc->rpages[i]))) {
+ if (folio_test_writeback(folio)) {
if (wbc->sync_mode == WB_SYNC_NONE)
goto continue_unlock;
- f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
}
- if (!clear_page_dirty_for_io(cc->rpages[i]))
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
submitted = 0;
- ret = f2fs_write_single_data_page(page_folio(cc->rpages[i]),
- &submitted,
+ ret = f2fs_write_single_data_page(folio, &submitted,
NULL, NULL, wbc, io_type,
compr_blocks, false);
if (ret) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- unlock_page(cc->rpages[i]);
+ if (ret == 1) {
ret = 0;
} else if (ret == -EAGAIN) {
ret = 0;
@@ -1629,14 +1632,13 @@ static inline bool allow_memalloc_for_decomp(struct f2fs_sb_info *sbi,
static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
bool pre_alloc)
{
- const struct f2fs_compress_ops *cops =
- f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm];
int i;
- if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc))
return 0;
- dic->tpages = page_array_alloc(dic->inode, dic->cluster_size);
+ dic->tpages = page_array_alloc(dic->sbi, dic->cluster_size);
if (!dic->tpages)
return -ENOMEM;
@@ -1666,10 +1668,9 @@ static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic,
static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic,
bool bypass_destroy_callback, bool pre_alloc)
{
- const struct f2fs_compress_ops *cops =
- f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm];
+ const struct f2fs_compress_ops *cops = f2fs_cops[dic->compress_algorithm];
- if (!allow_memalloc_for_decomp(F2FS_I_SB(dic->inode), pre_alloc))
+ if (!allow_memalloc_for_decomp(dic->sbi, pre_alloc))
return;
if (!bypass_destroy_callback && cops->destroy_decompress_ctx)
@@ -1696,7 +1697,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
if (!dic)
return ERR_PTR(-ENOMEM);
- dic->rpages = page_array_alloc(cc->inode, cc->cluster_size);
+ dic->rpages = page_array_alloc(sbi, cc->cluster_size);
if (!dic->rpages) {
kmem_cache_free(dic_entry_slab, dic);
return ERR_PTR(-ENOMEM);
@@ -1704,6 +1705,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
+ dic->sbi = sbi;
+ dic->compress_algorithm = F2FS_I(cc->inode)->i_compress_algorithm;
atomic_set(&dic->remaining_pages, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
@@ -1717,7 +1720,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->rpages[i] = cc->rpages[i];
dic->nr_rpages = cc->cluster_size;
- dic->cpages = page_array_alloc(dic->inode, dic->nr_cpages);
+ dic->cpages = page_array_alloc(sbi, dic->nr_cpages);
if (!dic->cpages) {
ret = -ENOMEM;
goto out_free;
@@ -1747,6 +1750,8 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic,
bool bypass_destroy_callback)
{
int i;
+ /* use sbi in dic to avoid UFA of dic->inode*/
+ struct f2fs_sb_info *sbi = dic->sbi;
f2fs_release_decomp_mem(dic, bypass_destroy_callback, true);
@@ -1758,7 +1763,7 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic,
continue;
f2fs_compress_free_page(dic->tpages[i]);
}
- page_array_free(dic->inode, dic->tpages, dic->cluster_size);
+ page_array_free(sbi, dic->tpages, dic->cluster_size);
}
if (dic->cpages) {
@@ -1767,10 +1772,10 @@ static void f2fs_free_dic(struct decompress_io_ctx *dic,
continue;
f2fs_compress_free_page(dic->cpages[i]);
}
- page_array_free(dic->inode, dic->cpages, dic->nr_cpages);
+ page_array_free(sbi, dic->cpages, dic->nr_cpages);
}
- page_array_free(dic->inode, dic->rpages, dic->nr_rpages);
+ page_array_free(sbi, dic->rpages, dic->nr_rpages);
kmem_cache_free(dic_entry_slab, dic);
}
@@ -1789,8 +1794,7 @@ static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task)
f2fs_free_dic(dic, false);
} else {
INIT_WORK(&dic->free_work, f2fs_late_free_dic);
- queue_work(F2FS_I_SB(dic->inode)->post_read_wq,
- &dic->free_work);
+ queue_work(dic->sbi->post_read_wq, &dic->free_work);
}
}
}
@@ -1861,14 +1865,13 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
}
/*
- * Put a reference to a compressed page's decompress_io_ctx.
+ * Put a reference to a compressed folio's decompress_io_ctx.
*
- * This is called when the page is no longer needed and can be freed.
+ * This is called when the folio is no longer needed and can be freed.
*/
-void f2fs_put_page_dic(struct page *page, bool in_task)
+void f2fs_put_folio_dic(struct folio *folio, bool in_task)
{
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
+ struct decompress_io_ctx *dic = folio->private;
f2fs_put_dic(dic, in_task);
}
@@ -1880,14 +1883,14 @@ void f2fs_put_page_dic(struct page *page, bool in_task)
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
unsigned int ofs_in_node)
{
- bool compressed = data_blkaddr(dn->inode, dn->node_page,
+ bool compressed = data_blkaddr(dn->inode, dn->node_folio,
ofs_in_node) == COMPRESS_ADDR;
int i = compressed ? 1 : 0;
- block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_folio,
ofs_in_node + i);
for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) {
- block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio,
ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
@@ -1918,10 +1921,10 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
invalidate_mapping_pages(COMPRESS_MAPPING(sbi), blkaddr, blkaddr + len - 1);
}
-void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
- nid_t ino, block_t blkaddr)
+static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
+ struct folio *folio, nid_t ino, block_t blkaddr)
{
- struct page *cpage;
+ struct folio *cfolio;
int ret;
if (!test_opt(sbi, COMPRESS_CACHE))
@@ -1933,49 +1936,49 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE))
return;
- cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr);
- if (cpage) {
- f2fs_put_page(cpage, 0);
+ cfolio = filemap_get_folio(COMPRESS_MAPPING(sbi), blkaddr);
+ if (!IS_ERR(cfolio)) {
+ f2fs_folio_put(cfolio, false);
return;
}
- cpage = alloc_page(__GFP_NOWARN | __GFP_IO);
- if (!cpage)
+ cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0);
+ if (!cfolio)
return;
- ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi),
+ ret = filemap_add_folio(COMPRESS_MAPPING(sbi), cfolio,
blkaddr, GFP_NOFS);
if (ret) {
- f2fs_put_page(cpage, 0);
+ f2fs_folio_put(cfolio, false);
return;
}
- set_page_private_data(cpage, ino);
+ folio_set_f2fs_data(cfolio, ino);
- memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
- SetPageUptodate(cpage);
- f2fs_put_page(cpage, 1);
+ memcpy(folio_address(cfolio), folio_address(folio), PAGE_SIZE);
+ folio_mark_uptodate(cfolio);
+ f2fs_folio_put(cfolio, true);
}
-bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio,
block_t blkaddr)
{
- struct page *cpage;
+ struct folio *cfolio;
bool hitted = false;
if (!test_opt(sbi, COMPRESS_CACHE))
return false;
- cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi),
+ cfolio = f2fs_filemap_get_folio(COMPRESS_MAPPING(sbi),
blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS);
- if (cpage) {
- if (PageUptodate(cpage)) {
+ if (!IS_ERR(cfolio)) {
+ if (folio_test_uptodate(cfolio)) {
atomic_inc(&sbi->compress_page_hit);
- memcpy(page_address(page),
- page_address(cpage), PAGE_SIZE);
+ memcpy(folio_address(folio),
+ folio_address(cfolio), folio_size(folio));
hitted = true;
}
- f2fs_put_page(cpage, 1);
+ f2fs_folio_put(cfolio, true);
}
return hitted;
@@ -2009,7 +2012,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
continue;
}
- if (ino != get_page_private_data(&folio->page)) {
+ if (ino != folio_get_f2fs_data(folio)) {
folio_unlock(folio);
continue;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index de4da6d9cd93..7961e0ddfca3 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -47,14 +47,14 @@ void f2fs_destroy_bioset(void)
bioset_exit(&f2fs_bioset);
}
-bool f2fs_is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(const struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *inode;
struct f2fs_sb_info *sbi;
- if (!mapping)
- return false;
+ if (fscrypt_is_bounce_folio(folio))
+ return folio_test_f2fs_gcing(fscrypt_pagecache_folio(folio));
inode = mapping->host;
sbi = F2FS_I_SB(inode);
@@ -65,7 +65,7 @@ bool f2fs_is_cp_guaranteed(struct page *page)
return true;
if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
- page_private_gcing(page))
+ folio_test_f2fs_gcing(folio))
return true;
return false;
}
@@ -142,16 +142,16 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
bio_for_each_folio_all(fi, bio) {
struct folio *folio = fi.folio;
- if (f2fs_is_compressed_page(&folio->page)) {
+ if (f2fs_is_compressed_page(folio)) {
if (ctx && !ctx->decompression_attempted)
- f2fs_end_read_compressed_page(&folio->page, true, 0,
+ f2fs_end_read_compressed_page(folio, true, 0,
in_task);
- f2fs_put_page_dic(&folio->page, in_task);
+ f2fs_put_folio_dic(folio, in_task);
continue;
}
dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
- folio_end_read(folio, bio->bi_status == 0);
+ folio_end_read(folio, bio->bi_status == BLK_STS_OK);
}
if (ctx)
@@ -181,14 +181,13 @@ static void f2fs_verify_bio(struct work_struct *work)
* as those were handled separately by f2fs_end_read_compressed_page().
*/
if (may_have_compressed_pages) {
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bv, bio, iter_all) {
- struct page *page = bv->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
- if (!f2fs_is_compressed_page(page) &&
- !fsverity_verify_page(page)) {
+ if (!f2fs_is_compressed_page(folio) &&
+ !fsverity_verify_page(&folio->page)) {
bio->bi_status = BLK_STS_IOERR;
break;
}
@@ -233,16 +232,15 @@ static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task)
static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx,
bool in_task)
{
- struct bio_vec *bv;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
bool all_compressed = true;
block_t blkaddr = ctx->fs_blkaddr;
- bio_for_each_segment_all(bv, ctx->bio, iter_all) {
- struct page *page = bv->bv_page;
+ bio_for_each_folio_all(fi, ctx->bio) {
+ struct folio *folio = fi.folio;
- if (f2fs_is_compressed_page(page))
- f2fs_end_read_compressed_page(page, false, blkaddr,
+ if (f2fs_is_compressed_page(folio))
+ f2fs_end_read_compressed_page(folio, false, blkaddr,
in_task);
else
all_compressed = false;
@@ -280,9 +278,9 @@ static void f2fs_post_read_work(struct work_struct *work)
static void f2fs_read_end_io(struct bio *bio)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
+ struct f2fs_sb_info *sbi = F2FS_F_SB(bio_first_folio_all(bio));
struct bio_post_read_ctx *ctx;
- bool intask = in_task();
+ bool intask = in_task() && !irqs_disabled();
iostat_update_and_unbind_ctx(bio);
ctx = bio->bi_private;
@@ -290,7 +288,7 @@ static void f2fs_read_end_io(struct bio *bio)
if (time_to_inject(sbi, FAULT_READ_IO))
bio->bi_status = BLK_STS_IOERR;
- if (bio->bi_status) {
+ if (bio->bi_status != BLK_STS_OK) {
f2fs_finish_read_bio(bio, intask);
return;
}
@@ -319,8 +317,7 @@ static void f2fs_read_end_io(struct bio *bio)
static void f2fs_write_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
iostat_update_and_unbind_ctx(bio);
sbi = bio->bi_private;
@@ -328,34 +325,41 @@ static void f2fs_write_end_io(struct bio *bio)
if (time_to_inject(sbi, FAULT_WRITE_IO))
bio->bi_status = BLK_STS_IOERR;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- enum count_type type = WB_DATA_TYPE(page, false);
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ enum count_type type;
+
+ if (fscrypt_is_bounce_folio(folio)) {
+ struct folio *io_folio = folio;
- fscrypt_finalize_bounce_page(&page);
+ folio = fscrypt_pagecache_folio(io_folio);
+ fscrypt_free_bounce_page(&io_folio->page);
+ }
#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_is_compressed_page(page)) {
- f2fs_compress_write_end_io(bio, page);
+ if (f2fs_is_compressed_page(folio)) {
+ f2fs_compress_write_end_io(bio, folio);
continue;
}
#endif
- if (unlikely(bio->bi_status)) {
- mapping_set_error(page->mapping, -EIO);
+ type = WB_DATA_TYPE(folio, false);
+
+ if (unlikely(bio->bi_status != BLK_STS_OK)) {
+ mapping_set_error(folio->mapping, -EIO);
if (type == F2FS_WB_CP_DATA)
f2fs_stop_checkpoint(sbi, true,
STOP_CP_REASON_WRITE_FAIL);
}
- f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
- page_folio(page)->index != nid_of_node(page));
+ f2fs_bug_on(sbi, is_node_folio(folio) &&
+ folio->index != nid_of_node(folio));
dec_page_count(sbi, type);
- if (f2fs_in_warm_node_list(sbi, page))
- f2fs_del_fsync_node_entry(sbi, page);
- clear_page_private_gcing(page);
- end_page_writeback(page);
+ if (f2fs_in_warm_node_list(sbi, folio))
+ f2fs_del_fsync_node_entry(sbi, folio);
+ folio_clear_f2fs_gcing(folio);
+ folio_end_writeback(folio);
}
if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
wq_has_sleeper(&sbi->cp_wait))
@@ -438,6 +442,11 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
op_flags |= REQ_META;
if (BIT(fio->temp) & fua_flag)
op_flags |= REQ_FUA;
+
+ if (fio->type == DATA &&
+ F2FS_I(fio->folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE)
+ op_flags |= REQ_PRIO;
+
return op_flags;
}
@@ -534,34 +543,33 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
}
static bool __has_merged_page(struct bio *bio, struct inode *inode,
- struct page *page, nid_t ino)
+ struct folio *folio, nid_t ino)
{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
if (!bio)
return false;
- if (!inode && !page && !ino)
+ if (!inode && !folio && !ino)
return true;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *target = bvec->bv_page;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *target = fi.folio;
- if (fscrypt_is_bounce_page(target)) {
- target = fscrypt_pagecache_page(target);
+ if (fscrypt_is_bounce_folio(target)) {
+ target = fscrypt_pagecache_folio(target);
if (IS_ERR(target))
continue;
}
if (f2fs_is_compressed_page(target)) {
- target = f2fs_compress_control_page(target);
+ target = f2fs_compress_control_folio(target);
if (IS_ERR(target))
continue;
}
if (inode && inode == target->mapping->host)
return true;
- if (page && page == target)
+ if (folio && folio == target)
return true;
if (ino && ino == ino_of_node(target))
return true;
@@ -630,7 +638,7 @@ unlock_out:
}
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
+ struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type, bool force)
{
enum temp_type temp;
@@ -642,7 +650,7 @@ static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
f2fs_down_read(&io->io_rwsem);
- ret = __has_merged_page(io->bio, inode, page, ino);
+ ret = __has_merged_page(io->bio, inode, folio, ino);
f2fs_up_read(&io->io_rwsem);
}
if (ret)
@@ -660,10 +668,10 @@ void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
}
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
+ struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type)
{
- __submit_merged_write_cond(sbi, inode, page, ino, type, false);
+ __submit_merged_write_cond(sbi, inode, folio, ino, type, false);
}
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
@@ -680,7 +688,7 @@ void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio;
- struct folio *fio_folio = page_folio(fio->page);
+ struct folio *fio_folio = fio->folio;
struct folio *data_folio = fio->encrypted_page ?
page_folio(fio->encrypted_page) : fio_folio;
@@ -702,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
wbc_account_cgroup_owner(fio->io_wbc, fio_folio, PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
- __read_io_type(data_folio) : WB_DATA_TYPE(fio->page, false));
+ __read_io_type(data_folio) : WB_DATA_TYPE(fio->folio, false));
if (is_read_io(bio_op(bio)))
f2fs_submit_read_bio(fio->sbi, bio, fio->type);
@@ -768,6 +776,7 @@ static void del_bio_entry(struct bio_entry *be)
static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
struct page *page)
{
+ struct folio *fio_folio = fio->folio;
struct f2fs_sb_info *sbi = fio->sbi;
enum temp_type temp;
bool found = false;
@@ -789,8 +798,8 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
*fio->last_block,
fio->new_blkaddr));
if (f2fs_crypt_mergeable_bio(*bio,
- fio->page->mapping->host,
- page_folio(fio->page)->index, fio) &&
+ fio_folio->mapping->host,
+ fio_folio->index, fio) &&
bio_add_page(*bio, page, PAGE_SIZE, 0) ==
PAGE_SIZE) {
ret = 0;
@@ -814,13 +823,13 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio,
}
void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
- struct bio **bio, struct page *page)
+ struct bio **bio, struct folio *folio)
{
enum temp_type temp;
bool found = false;
struct bio *target = bio ? *bio : NULL;
- f2fs_bug_on(sbi, !target && !page);
+ f2fs_bug_on(sbi, !target && !folio);
for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) {
struct f2fs_bio_info *io = sbi->write_io[DATA] + temp;
@@ -836,7 +845,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
found = (target == be->bio);
else
found = __has_merged_page(be->bio, NULL,
- page, 0);
+ folio, 0);
if (found)
break;
}
@@ -853,7 +862,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
found = (target == be->bio);
else
found = __has_merged_page(be->bio, NULL,
- page, 0);
+ folio, 0);
if (found) {
target = be->bio;
del_bio_entry(be);
@@ -874,14 +883,15 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
int f2fs_merge_page_bio(struct f2fs_io_info *fio)
{
struct bio *bio = *fio->bio;
- struct page *page = fio->encrypted_page ?
- fio->encrypted_page : fio->page;
+ struct folio *data_folio = fio->encrypted_page ?
+ page_folio(fio->encrypted_page) : fio->folio;
+ struct folio *folio = fio->folio;
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
return -EFSCORRUPTED;
- trace_f2fs_submit_folio_bio(page_folio(page), fio);
+ trace_f2fs_submit_folio_bio(data_folio, fio);
if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
fio->new_blkaddr))
@@ -889,20 +899,19 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
alloc_new:
if (!bio) {
bio = __bio_alloc(fio, BIO_MAX_VECS);
- f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host,
- page_folio(fio->page)->index, fio, GFP_NOIO);
+ f2fs_set_bio_crypt_ctx(bio, folio->mapping->host,
+ folio->index, fio, GFP_NOIO);
- add_bio_entry(fio->sbi, bio, page, fio->temp);
+ add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp);
} else {
- if (add_ipu_page(fio, &bio, page))
+ if (add_ipu_page(fio, &bio, &data_folio->page))
goto alloc_new;
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
- PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio));
- inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
+ inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false));
*fio->last_block = fio->new_blkaddr;
*fio->bio = bio;
@@ -937,7 +946,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
- struct page *bio_page;
+ struct folio *bio_folio;
enum count_type type;
f2fs_bug_on(sbi, is_read_io(fio->op));
@@ -968,44 +977,44 @@ next:
verify_fio_blkaddr(fio);
if (fio->encrypted_page)
- bio_page = fio->encrypted_page;
+ bio_folio = page_folio(fio->encrypted_page);
else if (fio->compressed_page)
- bio_page = fio->compressed_page;
+ bio_folio = page_folio(fio->compressed_page);
else
- bio_page = fio->page;
+ bio_folio = fio->folio;
/* set submitted = true as a return value */
fio->submitted = 1;
- type = WB_DATA_TYPE(bio_page, fio->compressed_page);
+ type = WB_DATA_TYPE(bio_folio, fio->compressed_page);
inc_page_count(sbi, type);
if (io->bio &&
(!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
fio->new_blkaddr) ||
- !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host,
- page_folio(bio_page)->index, fio)))
+ !f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio),
+ bio_folio->index, fio)))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
io->bio = __bio_alloc(fio, BIO_MAX_VECS);
- f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
- page_folio(bio_page)->index, fio, GFP_NOIO);
+ f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio),
+ bio_folio->index, fio, GFP_NOIO);
io->fio = *fio;
}
- if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ if (!bio_add_folio(io->bio, bio_folio, folio_size(bio_folio), 0)) {
__submit_merged_bio(io);
goto alloc_new;
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
- PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->folio,
+ folio_size(fio->folio));
io->last_block_in_bio = fio->new_blkaddr;
- trace_f2fs_submit_folio_write(page_folio(fio->page), fio);
+ trace_f2fs_submit_folio_write(fio->folio, fio);
#ifdef CONFIG_BLK_DEV_ZONED
if (f2fs_sb_has_blkzoned(sbi) && btype < META &&
is_end_zone_blkaddr(sbi, fio->new_blkaddr)) {
@@ -1041,8 +1050,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages),
REQ_OP_READ | op_flag,
for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset);
- if (!bio)
- return ERR_PTR(-ENOMEM);
bio->bi_iter.bi_sector = sector;
f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS);
bio->bi_end_io = f2fs_read_end_io;
@@ -1106,7 +1113,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio,
static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
- __le32 *addr = get_dnode_addr(dn->inode, dn->node_page);
+ __le32 *addr = get_dnode_addr(dn->inode, dn->node_folio);
dn->data_blkaddr = blkaddr;
addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
@@ -1115,14 +1122,14 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
/*
* Lock ordering for the change of data block address:
* ->data_page
- * ->node_page
+ * ->node_folio
* update block addresses in the node page
*/
void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
- f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
+ f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true);
__set_data_blkaddr(dn, blkaddr);
- if (set_page_dirty(dn->node_page))
+ if (folio_mark_dirty(dn->node_folio))
dn->node_changed = true;
}
@@ -1150,7 +1157,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
dn->ofs_in_node, count);
- f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
+ f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true);
for (; count > 0; dn->ofs_in_node++) {
block_t blkaddr = f2fs_data_blkaddr(dn);
@@ -1161,7 +1168,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
}
}
- if (set_page_dirty(dn->node_page))
+ if (folio_mark_dirty(dn->node_folio))
dn->node_changed = true;
return 0;
}
@@ -1179,7 +1186,7 @@ int f2fs_reserve_new_block(struct dnode_of_data *dn)
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
{
- bool need_put = dn->inode_page ? false : true;
+ bool need_put = dn->inode_folio ? false : true;
int err;
err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE);
@@ -1193,18 +1200,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
return err;
}
-struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write,
- pgoff_t *next_pgofs)
+struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
- struct page *page;
+ struct folio *folio;
int err;
- page = f2fs_grab_cache_page(mapping, index, for_write);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = f2fs_grab_cache_folio(mapping, index, for_write);
+ if (IS_ERR(folio))
+ return folio;
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
@@ -1239,61 +1245,64 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
goto put_err;
}
got_it:
- if (PageUptodate(page)) {
- unlock_page(page);
- return page;
+ if (folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ return folio;
}
/*
* A new dentry page is allocated but not able to be written, since its
* new inode page couldn't be allocated due to -ENOSPC.
* In such the case, its blkaddr can be remained as NEW_ADDR.
- * see, f2fs_add_link -> f2fs_get_new_data_page ->
+ * see, f2fs_add_link -> f2fs_get_new_data_folio ->
* f2fs_init_inode_metadata.
*/
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_SIZE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- unlock_page(page);
- return page;
+ folio_zero_segment(folio, 0, folio_size(folio));
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ return folio;
}
- err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr,
+ err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr,
op_flags, for_write);
if (err)
goto put_err;
- return page;
+ return folio;
put_err:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
+struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index,
pgoff_t *next_pgofs)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
- page = find_get_page_flags(mapping, index, FGP_ACCESSED);
- if (page && PageUptodate(page))
- return page;
- f2fs_put_page(page, 0);
+ folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0);
+ if (IS_ERR(folio))
+ goto read;
+ if (folio_test_uptodate(folio))
+ return folio;
+ f2fs_folio_put(folio, false);
- page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs);
- if (IS_ERR(page))
- return page;
+read:
+ folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs);
+ if (IS_ERR(folio))
+ return folio;
- if (PageUptodate(page))
- return page;
+ if (folio_test_uptodate(folio))
+ return folio;
- wait_on_page_locked(page);
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 0);
+ folio_wait_locked(folio);
+ if (unlikely(!folio_test_uptodate(folio))) {
+ f2fs_folio_put(folio, false);
return ERR_PTR(-EIO);
}
- return page;
+ return folio;
}
/*
@@ -1301,23 +1310,23 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
* Because, the callers, functions in dir.c and GC, should be able to know
* whether this page exists or not.
*/
-struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
+struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index,
bool for_write)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
- page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL);
- if (IS_ERR(page))
- return page;
+ folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL);
+ if (IS_ERR(folio))
+ return folio;
/* wait for read completion */
- lock_page(page);
- if (unlikely(page->mapping != mapping || !PageUptodate(page))) {
- f2fs_put_page(page, 1);
+ folio_lock(folio);
+ if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) {
+ f2fs_folio_put(folio, true);
return ERR_PTR(-EIO);
}
- return page;
+ return folio;
}
/*
@@ -1326,57 +1335,57 @@ struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
*
* Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
* f2fs_unlock_op().
- * Note that, ipage is set only by make_empty_dir, and if any error occur,
- * ipage should be released by this function.
+ * Note that, ifolio is set only by make_empty_dir, and if any error occur,
+ * ifolio should be released by this function.
*/
-struct page *f2fs_get_new_data_page(struct inode *inode,
- struct page *ipage, pgoff_t index, bool new_i_size)
+struct folio *f2fs_get_new_data_folio(struct inode *inode,
+ struct folio *ifolio, pgoff_t index, bool new_i_size)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
struct dnode_of_data dn;
int err;
- page = f2fs_grab_cache_page(mapping, index, true);
- if (!page) {
+ folio = f2fs_grab_cache_folio(mapping, index, true);
+ if (IS_ERR(folio)) {
/*
- * before exiting, we should make sure ipage will be released
+ * before exiting, we should make sure ifolio will be released
* if any error occur.
*/
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
return ERR_PTR(-ENOMEM);
}
- set_new_dnode(&dn, inode, ipage, NULL, 0);
+ set_new_dnode(&dn, inode, ifolio, NULL, 0);
err = f2fs_reserve_block(&dn, index);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
- if (!ipage)
+ if (!ifolio)
f2fs_put_dnode(&dn);
- if (PageUptodate(page))
+ if (folio_test_uptodate(folio))
goto got_it;
if (dn.data_blkaddr == NEW_ADDR) {
- zero_user_segment(page, 0, PAGE_SIZE);
- if (!PageUptodate(page))
- SetPageUptodate(page);
+ folio_zero_segment(folio, 0, folio_size(folio));
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
} else {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
- /* if ipage exists, blkaddr should be NEW_ADDR */
- f2fs_bug_on(F2FS_I_SB(inode), ipage);
- page = f2fs_get_lock_data_page(inode, index, true);
- if (IS_ERR(page))
- return page;
+ /* if ifolio exists, blkaddr should be NEW_ADDR */
+ f2fs_bug_on(F2FS_I_SB(inode), ifolio);
+ folio = f2fs_get_lock_data_folio(inode, index, true);
+ if (IS_ERR(folio))
+ return folio;
}
got_it:
if (new_i_size && i_size_read(inode) <
((loff_t)(index + 1) << PAGE_SHIFT))
f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
- return page;
+ return folio;
}
static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
@@ -1541,10 +1550,14 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
unsigned int start_pgofs;
int bidx = 0;
bool is_hole;
+ bool lfs_dio_write;
if (!maxblocks)
return 0;
+ lfs_dio_write = (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
+ map->m_may_create);
+
if (!map->m_may_create && f2fs_map_blocks_cached(inode, map, flag))
goto out;
@@ -1560,8 +1573,11 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag)
end = pgofs + maxblocks;
next_dnode:
- if (map->m_may_create)
+ if (map->m_may_create) {
+ if (f2fs_lfs_mode(sbi))
+ f2fs_balance_fs(sbi, true);
f2fs_map_lock(sbi, flag);
+ }
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -1577,7 +1593,7 @@ next_dnode:
start_pgofs = pgofs;
prealloc = 0;
last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
next_block:
blkaddr = f2fs_data_blkaddr(&dn);
@@ -1591,7 +1607,7 @@ next_block:
/* use out-place-update for direct IO under LFS mode */
if (map->m_may_create && (is_hole ||
(flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
- !f2fs_is_pinned_file(inode)))) {
+ !f2fs_is_pinned_file(inode) && map->m_last_pblk != blkaddr))) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto sync_out;
@@ -1675,10 +1691,15 @@ next_block:
if (map->m_multidev_dio)
map->m_bdev = FDEV(bidx).bdev;
+
+ if (lfs_dio_write)
+ map->m_last_pblk = NULL_ADDR;
} else if (map_is_mergeable(sbi, map, blkaddr, flag, bidx, ofs)) {
ofs++;
map->m_len++;
} else {
+ if (lfs_dio_write && !f2fs_is_pinned_file(inode))
+ map->m_last_pblk = blkaddr;
goto sync_out;
}
@@ -1703,14 +1724,6 @@ skip:
dn.ofs_in_node = end_offset;
}
- if (flag == F2FS_GET_BLOCK_DIO && f2fs_lfs_mode(sbi) &&
- map->m_may_create) {
- /* the next block to be allocated may not be contiguous. */
- if (GET_SEGOFF_FROM_SEG0(sbi, blkaddr) % BLKS_PER_SEC(sbi) ==
- CAP_BLKS_PER_SEC(sbi) - 1)
- goto sync_out;
- }
-
if (pgofs >= end)
goto sync_out;
else if (dn.ofs_in_node < end_offset)
@@ -1813,7 +1826,6 @@ static int f2fs_xattr_fiemap(struct inode *inode,
struct fiemap_extent_info *fieinfo)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *page;
struct node_info ni;
__u64 phys = 0, len;
__u32 flags;
@@ -1822,15 +1834,15 @@ static int f2fs_xattr_fiemap(struct inode *inode,
if (f2fs_has_inline_xattr(inode)) {
int offset;
+ struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi),
+ inode->i_ino, false);
- page = f2fs_grab_cache_page(NODE_MAPPING(sbi),
- inode->i_ino, false);
- if (!page)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
@@ -1842,7 +1854,7 @@ static int f2fs_xattr_fiemap(struct inode *inode,
phys += offset;
len = inline_xattr_size(inode);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
@@ -1856,20 +1868,22 @@ static int f2fs_xattr_fiemap(struct inode *inode,
}
if (xnid) {
- page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false);
- if (!page)
- return -ENOMEM;
+ struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi),
+ xnid, false);
+
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
err = f2fs_get_node_info(sbi, xnid, &ni, false);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
phys = F2FS_BLK_TO_BYTES(ni.blk_addr);
len = inode->i_sb->s_blocksize;
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
flags = FIEMAP_EXTENT_LAST;
}
@@ -2065,7 +2079,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio,
sector_t last_block;
sector_t last_block_in_file;
sector_t block_nr;
- pgoff_t index = folio_index(folio);
+ pgoff_t index = folio->index;
int ret = 0;
block_in_file = (sector_t)index;
@@ -2178,6 +2192,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
int i;
int ret = 0;
+ if (unlikely(f2fs_cp_error(sbi))) {
+ ret = -EIO;
+ from_dnode = false;
+ goto out_put_dnode;
+ }
+
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) +
@@ -2221,17 +2241,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
if (ret)
goto out;
- if (unlikely(f2fs_cp_error(sbi))) {
- ret = -EIO;
- goto out_put_dnode;
- }
f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR);
skip_reading_dnode:
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page,
+ blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_folio,
dn.ofs_in_node + i) :
ei.blk + i - 1;
@@ -2265,14 +2281,13 @@ skip_reading_dnode:
block_t blkaddr;
struct bio_post_read_ctx *ctx;
- blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page,
+ blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_folio,
dn.ofs_in_node + i + 1) :
ei.blk + i;
f2fs_wait_on_block_writeback(inode, blkaddr);
- if (f2fs_load_compressed_page(sbi, folio_page(folio, 0),
- blkaddr)) {
+ if (f2fs_load_compressed_folio(sbi, folio, blkaddr)) {
if (atomic_dec_and_test(&dic->remaining_pages)) {
f2fs_decompress_cluster(dic, true);
break;
@@ -2289,7 +2304,7 @@ submit_and_realloc:
}
if (!bio) {
- bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
+ bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i,
f2fs_ra_op_flags(rac),
folio->index, for_write);
if (IS_ERR(bio)) {
@@ -2362,6 +2377,14 @@ static int f2fs_mpage_readpages(struct inode *inode,
unsigned max_nr_pages = nr_pages;
int ret = 0;
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (f2fs_compressed_file(inode)) {
+ index = rac ? readahead_index(rac) : folio->index;
+ max_nr_pages = round_up(index + nr_pages, cc.cluster_size) -
+ round_down(index, cc.cluster_size);
+ }
+#endif
+
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
@@ -2378,7 +2401,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
- index = folio_index(folio);
+ index = folio->index;
if (!f2fs_compressed_file(inode))
goto read_single_page;
@@ -2487,8 +2510,9 @@ static void f2fs_readahead(struct readahead_control *rac)
int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
{
- struct inode *inode = fio->page->mapping->host;
- struct page *mpage, *page;
+ struct inode *inode = fio_inode(fio);
+ struct folio *mfolio;
+ struct page *page;
gfp_t gfp_flags = GFP_NOFS;
if (!f2fs_encrypted_file(inode))
@@ -2500,7 +2524,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
return 0;
retry_encrypt:
- fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page,
+ fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page),
PAGE_SIZE, 0, gfp_flags);
if (IS_ERR(fio->encrypted_page)) {
/* flush pending IOs and wait for a while in the ENOMEM case */
@@ -2513,12 +2537,12 @@ retry_encrypt:
return PTR_ERR(fio->encrypted_page);
}
- mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr);
- if (mpage) {
- if (PageUptodate(mpage))
- memcpy(page_address(mpage),
+ mfolio = filemap_lock_folio(META_MAPPING(fio->sbi), fio->old_blkaddr);
+ if (!IS_ERR(mfolio)) {
+ if (folio_test_uptodate(mfolio))
+ memcpy(folio_address(mfolio),
page_address(fio->encrypted_page), PAGE_SIZE);
- f2fs_put_page(mpage, 1);
+ f2fs_folio_put(mfolio, true);
}
return 0;
}
@@ -2617,7 +2641,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
static inline bool need_inplace_update(struct f2fs_io_info *fio)
{
- struct inode *inode = fio->page->mapping->host;
+ struct inode *inode = fio_inode(fio);
if (f2fs_should_update_outplace(inode, fio))
return false;
@@ -2627,7 +2651,7 @@ static inline bool need_inplace_update(struct f2fs_io_info *fio)
int f2fs_do_write_data_page(struct f2fs_io_info *fio)
{
- struct folio *folio = page_folio(fio->page);
+ struct folio *folio = fio->folio;
struct inode *inode = folio->mapping->host;
struct dnode_of_data dn;
struct node_info ni;
@@ -2637,7 +2661,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* Use COW inode to make dnode_of_data for atomic write */
atomic_commit = f2fs_is_atomic_file(inode) &&
- page_private_atomic(folio_page(folio, 0));
+ folio_test_f2fs_atomic(folio);
if (atomic_commit)
set_new_dnode(&dn, F2FS_I(inode)->cow_inode, NULL, NULL, 0);
else
@@ -2668,7 +2692,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
folio_clear_uptodate(folio);
- clear_page_private_gcing(folio_page(folio, 0));
+ folio_clear_f2fs_gcing(folio);
goto out_writepage;
}
got_it:
@@ -2738,7 +2762,7 @@ got_it:
trace_f2fs_do_write_data_page(folio, OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
if (atomic_commit)
- clear_page_private_atomic(folio_page(folio, 0));
+ folio_clear_f2fs_atomic(folio);
out_writepage:
f2fs_put_dnode(&dn);
out:
@@ -2756,7 +2780,6 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted,
bool allow_balance)
{
struct inode *inode = folio->mapping->host;
- struct page *page = folio_page(folio, 0);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long)i_size)
@@ -2773,7 +2796,7 @@ int f2fs_write_single_data_page(struct folio *folio, int *submitted,
.op = REQ_OP_WRITE,
.op_flags = wbc_to_write_flags(wbc),
.old_blkaddr = NULL_ADDR,
- .page = page,
+ .folio = folio,
.encrypted_page = NULL,
.submitted = 0,
.compr_blocks = compr_blocks,
@@ -2841,13 +2864,7 @@ write:
goto done;
}
- if (!wbc->for_reclaim)
- need_balance_fs = true;
- else if (has_not_enough_free_secs(sbi, 0, 0))
- goto redirty_out;
- else
- set_inode_flag(inode, FI_HOT_DATA);
-
+ need_balance_fs = true;
err = -EAGAIN;
if (f2fs_has_inline_data(inode)) {
err = f2fs_write_inline_data(inode, folio);
@@ -2881,14 +2898,7 @@ out:
inode_dec_dirty_pages(inode);
if (err) {
folio_clear_uptodate(folio);
- clear_page_private_gcing(page);
- }
-
- if (wbc->for_reclaim) {
- f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
- clear_inode_flag(inode, FI_HOT_DATA);
- f2fs_remove_dirty_inode(inode);
- submitted = NULL;
+ folio_clear_f2fs_gcing(folio);
}
folio_unlock(folio);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
@@ -2915,35 +2925,12 @@ redirty_out:
* file_write_and_wait_range() will see EIO error, which is critical
* to return value of fsync() followed by atomic_write failure to user.
*/
- if (!err || wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE;
folio_unlock(folio);
+ if (!err)
+ return 1;
return err;
}
-static int f2fs_write_data_page(struct page *page,
- struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- struct inode *inode = folio->mapping->host;
-
- if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
- goto out;
-
- if (f2fs_compressed_file(inode)) {
- if (f2fs_is_compressed_cluster(inode, folio->index)) {
- folio_redirty_for_writepage(wbc, folio);
- return AOP_WRITEPAGE_ACTIVATE;
- }
- }
-out:
-#endif
-
- return f2fs_write_single_data_page(folio, NULL, NULL, NULL,
- wbc, FS_DATA_IO, 0, true);
-}
-
/*
* This function was copied from write_cache_pages from mm/page-writeback.c.
* The major change is making write step of cold data page separately from
@@ -3137,7 +3124,7 @@ continue_unlock:
if (folio_test_writeback(folio)) {
if (wbc->sync_mode == WB_SYNC_NONE)
goto continue_unlock;
- f2fs_wait_on_page_writeback(&folio->page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
}
if (!folio_clear_dirty_for_io(folio))
@@ -3154,8 +3141,6 @@ continue_unlock:
ret = f2fs_write_single_data_page(folio,
&submitted, &bio, &last_block,
wbc, io_type, 0, true);
- if (ret == AOP_WRITEPAGE_ACTIVATE)
- folio_unlock(folio);
#ifdef CONFIG_F2FS_FS_COMPRESSION
result:
#endif
@@ -3167,7 +3152,7 @@ result:
* keep nr_to_write, since vfs uses this to
* get # of written pages.
*/
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ if (ret == 1) {
ret = 0;
goto next;
} else if (ret == -EAGAIN) {
@@ -3266,10 +3251,6 @@ static int __f2fs_write_data_pages(struct address_space *mapping,
int ret;
bool locked = false;
- /* deal with chardevs and other special file */
- if (!mapping->a_ops->writepage)
- return 0;
-
/* skip writing if there is no dirty page in this inode */
if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
return 0;
@@ -3365,7 +3346,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
struct inode *inode = folio->mapping->host;
pgoff_t index = folio->index;
struct dnode_of_data dn;
- struct page *ipage;
+ struct folio *ifolio;
bool locked = false;
int flag = F2FS_GET_BLOCK_PRE_AIO;
int err = 0;
@@ -3390,23 +3371,23 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi,
restart:
/* check inline_data */
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio)) {
+ err = PTR_ERR(ifolio);
goto unlock_out;
}
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ set_new_dnode(&dn, inode, ifolio, ifolio, 0);
if (f2fs_has_inline_data(inode)) {
if (pos + len <= MAX_INLINE_DATA(inode)) {
- f2fs_do_read_inline_data(folio, ipage);
+ f2fs_do_read_inline_data(folio, ifolio);
set_inode_flag(inode, FI_DATA_EXIST);
if (inode->i_nlink)
- set_page_private_inline(ipage);
+ folio_set_f2fs_inline(ifolio);
goto out;
}
- err = f2fs_convert_inline_page(&dn, folio_page(folio, 0));
+ err = f2fs_convert_inline_folio(&dn, folio);
if (err || dn.data_blkaddr != NULL_ADDR)
goto out;
}
@@ -3450,14 +3431,14 @@ static int __find_data_block(struct inode *inode, pgoff_t index,
block_t *blk_addr)
{
struct dnode_of_data dn;
- struct page *ipage;
+ struct folio *ifolio;
int err = 0;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ set_new_dnode(&dn, inode, ifolio, ifolio, 0);
if (!f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
@@ -3478,17 +3459,17 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index,
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- struct page *ipage;
+ struct folio *ifolio;
int err = 0;
f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO);
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio)) {
+ err = PTR_ERR(ifolio);
goto unlock_out;
}
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ set_new_dnode(&dn, inode, ifolio, ifolio, 0);
if (!f2fs_lookup_read_extent_cache_block(dn.inode, index,
&dn.data_blkaddr))
@@ -3546,8 +3527,10 @@ reserve_block:
return 0;
}
-static int f2fs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+static int f2fs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -3636,7 +3619,7 @@ repeat:
}
}
- f2fs_wait_on_page_writeback(&folio->page, DATA, false, true);
+ f2fs_folio_wait_writeback(folio, DATA, false, true);
if (len == folio_size(folio) || folio_test_uptodate(folio))
return 0;
@@ -3683,7 +3666,7 @@ fail:
return err;
}
-static int f2fs_write_end(struct file *file,
+static int f2fs_write_end(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
@@ -3723,7 +3706,7 @@ static int f2fs_write_end(struct file *file,
folio_mark_dirty(folio);
if (f2fs_is_atomic_file(inode))
- set_page_private_atomic(folio_page(folio, 0));
+ folio_set_f2fs_atomic(folio);
if (pos + copied > i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
@@ -3758,7 +3741,7 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
f2fs_remove_dirty_inode(inode);
}
}
- clear_page_private_all(&folio->page);
+ folio_detach_private(folio);
}
bool f2fs_release_folio(struct folio *folio, gfp_t wait)
@@ -3767,7 +3750,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
if (folio_test_dirty(folio))
return false;
- clear_page_private_all(&folio->page);
+ folio_detach_private(folio);
return true;
}
@@ -3891,18 +3874,18 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
set_inode_flag(inode, FI_SKIP_WRITES);
for (blkofs = 0; blkofs <= blkofs_end; blkofs++) {
- struct page *page;
+ struct folio *folio;
unsigned int blkidx = secidx * blk_per_sec + blkofs;
- page = f2fs_get_lock_data_page(inode, blkidx, true);
- if (IS_ERR(page)) {
+ folio = f2fs_get_lock_data_folio(inode, blkidx, true);
+ if (IS_ERR(folio)) {
f2fs_up_write(&sbi->pin_sem);
- ret = PTR_ERR(page);
+ ret = PTR_ERR(folio);
goto done;
}
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
}
clear_inode_flag(inode, FI_SKIP_WRITES);
@@ -3979,7 +3962,7 @@ retry:
if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec ||
nr_pblocks % blks_per_sec ||
- !f2fs_valid_pinned_area(sbi, pblock)) {
+ f2fs_is_sequential_zone_area(sbi, pblock)) {
bool last_extent = false;
not_aligned++;
@@ -4101,7 +4084,6 @@ static void f2fs_swap_deactivate(struct file *file)
const struct address_space_operations f2fs_dblock_aops = {
.read_folio = f2fs_read_data_folio,
.readahead = f2fs_readahead,
- .writepage = f2fs_write_data_page,
.writepages = f2fs_write_data_pages,
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
@@ -4186,7 +4168,7 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap,
struct iomap *srcmap)
{
- struct f2fs_map_blocks map = {};
+ struct f2fs_map_blocks map = { NULL, };
pgoff_t next_pgofs = 0;
int err;
@@ -4195,7 +4177,17 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_next_pgofs = &next_pgofs;
map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
- if (flags & IOMAP_WRITE)
+ if (flags & IOMAP_WRITE && iomap->private) {
+ map.m_last_pblk = (unsigned long)iomap->private;
+ iomap->private = NULL;
+ }
+
+ /*
+ * If the blocks being overwritten are already allocated,
+ * f2fs_map_lock and f2fs_balance_fs are not necessary.
+ */
+ if ((flags & IOMAP_WRITE) &&
+ !f2fs_overwrite_io(inode, offset, length))
map.m_may_create = true;
err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO);
@@ -4227,6 +4219,9 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
iomap->flags |= IOMAP_F_MERGED;
iomap->bdev = map.m_bdev;
iomap->addr = F2FS_BLK_TO_BYTES(map.m_pblk);
+
+ if (flags & IOMAP_WRITE && map.m_last_pblk)
+ iomap->private = (void *)map.m_last_pblk;
} else {
if (flags & IOMAP_WRITE)
return -ENOTBLK;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 468828288a4a..43a83bbd3bc5 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -21,7 +21,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static DEFINE_RAW_SPINLOCK(f2fs_stat_lock);
+static DEFINE_SPINLOCK(f2fs_stat_lock);
#ifdef CONFIG_DEBUG_FS
static struct dentry *f2fs_debugfs_root;
#endif
@@ -91,7 +91,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi)
seg_blks = get_seg_entry(sbi, j)->valid_blocks;
/* update segment stats */
- if (IS_CURSEG(sbi, j))
+ if (is_curseg(sbi, j))
dev_stats[i].devstats[0][DEVSTAT_INUSE]++;
else if (seg_blks == BLKS_PER_SEG(sbi))
dev_stats[i].devstats[0][DEVSTAT_FULL]++;
@@ -109,7 +109,7 @@ static void update_multidevice_stats(struct f2fs_sb_info *sbi)
sec_blks = get_sec_entry(sbi, j)->valid_blocks;
/* update section stats */
- if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, j)))
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, j)))
dev_stats[i].devstats[1][DEVSTAT_INUSE]++;
else if (sec_blks == BLKS_PER_SEC(sbi))
dev_stats[i].devstats[1][DEVSTAT_FULL]++;
@@ -164,6 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+ si->ndonate_files = sbi->donate_files;
si->nquota_files = sbi->nquota_files;
si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
si->aw_cnt = atomic_read(&sbi->atomic_files);
@@ -438,9 +439,8 @@ static int stat_show(struct seq_file *s, void *v)
{
struct f2fs_stat_info *si;
int i = 0, j = 0;
- unsigned long flags;
- raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
+ spin_lock(&f2fs_stat_lock);
list_for_each_entry(si, &f2fs_stat_list, stat_list) {
struct f2fs_sb_info *sbi = si->sbi;
@@ -501,6 +501,8 @@ static int stat_show(struct seq_file *s, void *v)
si->compr_inode, si->compr_blocks);
seq_printf(s, " - Swapfile Inode: %u\n",
si->swapfile_inode);
+ seq_printf(s, " - Donate Inode: %u\n",
+ si->ndonate_files);
seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n",
si->orphans, si->append, si->update);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
@@ -750,7 +752,7 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
- raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ spin_unlock(&f2fs_stat_lock);
return 0;
}
@@ -762,7 +764,6 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
struct f2fs_stat_info *si;
struct f2fs_dev_stats *dev_stats;
- unsigned long flags;
int i;
si = f2fs_kzalloc(sbi, sizeof(struct f2fs_stat_info), GFP_KERNEL);
@@ -814,9 +815,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->max_aw_cnt, 0);
- raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
+ spin_lock(&f2fs_stat_lock);
list_add_tail(&si->stat_list, &f2fs_stat_list);
- raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ spin_unlock(&f2fs_stat_lock);
return 0;
}
@@ -824,11 +825,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
- unsigned long flags;
- raw_spin_lock_irqsave(&f2fs_stat_lock, flags);
+ spin_lock(&f2fs_stat_lock);
list_del(&si->stat_list);
- raw_spin_unlock_irqrestore(&f2fs_stat_lock, flags);
+ spin_unlock(&f2fs_stat_lock);
kfree(si->dev_stats);
kfree(si);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 54dd52de7269..fffd7749d6d1 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -173,7 +173,7 @@ static unsigned long dir_block_index(unsigned int level,
}
static struct f2fs_dir_entry *find_in_block(struct inode *dir,
- struct page *dentry_page,
+ struct folio *dentry_folio,
const struct f2fs_filename *fname,
int *max_slots,
bool use_hash)
@@ -181,7 +181,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir,
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dentry_ptr d;
- dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page);
+ dentry_blk = folio_address(dentry_folio);
make_dentry_ptr_block(dir, &d, dentry_blk);
return f2fs_find_target_dentry(&d, fname, max_slots, use_hash);
@@ -260,13 +260,12 @@ found:
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
unsigned int level,
const struct f2fs_filename *fname,
- struct page **res_page,
+ struct folio **res_folio,
bool use_hash)
{
int s = GET_DENTRY_SLOTS(fname->disk_name.len);
unsigned int nbucket, nblock;
unsigned int bidx, end_block, bucket_no;
- struct page *dentry_page;
struct f2fs_dir_entry *de = NULL;
pgoff_t next_pgofs;
bool room = false;
@@ -284,31 +283,32 @@ start_find_bucket:
while (bidx < end_block) {
/* no need to allocate new dentry pages to all the indices */
- dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
- if (IS_ERR(dentry_page)) {
- if (PTR_ERR(dentry_page) == -ENOENT) {
+ struct folio *dentry_folio;
+ dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs);
+ if (IS_ERR(dentry_folio)) {
+ if (PTR_ERR(dentry_folio) == -ENOENT) {
room = true;
bidx = next_pgofs;
continue;
} else {
- *res_page = dentry_page;
+ *res_folio = dentry_folio;
break;
}
}
- de = find_in_block(dir, dentry_page, fname, &max_slots, use_hash);
+ de = find_in_block(dir, dentry_folio, fname, &max_slots, use_hash);
if (IS_ERR(de)) {
- *res_page = ERR_CAST(de);
+ *res_folio = ERR_CAST(de);
de = NULL;
break;
} else if (de) {
- *res_page = dentry_page;
+ *res_folio = dentry_folio;
break;
}
if (max_slots >= s)
room = true;
- f2fs_put_page(dentry_page, 0);
+ f2fs_folio_put(dentry_folio, false);
bidx++;
}
@@ -329,7 +329,7 @@ start_find_bucket:
struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
const struct f2fs_filename *fname,
- struct page **res_page)
+ struct folio **res_folio)
{
unsigned long npages = dir_blocks(dir);
struct f2fs_dir_entry *de = NULL;
@@ -337,13 +337,13 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
unsigned int level;
bool use_hash = true;
- *res_page = NULL;
+ *res_folio = NULL;
#if IS_ENABLED(CONFIG_UNICODE)
start_find_entry:
#endif
if (f2fs_has_inline_dentry(dir)) {
- de = f2fs_find_in_inline_dir(dir, fname, res_page, use_hash);
+ de = f2fs_find_in_inline_dir(dir, fname, res_folio, use_hash);
goto out;
}
@@ -359,14 +359,15 @@ start_find_entry:
}
for (level = 0; level < max_depth; level++) {
- de = find_in_level(dir, level, fname, res_page, use_hash);
- if (de || IS_ERR(*res_page))
+ de = find_in_level(dir, level, fname, res_folio, use_hash);
+ if (de || IS_ERR(*res_folio))
break;
}
out:
#if IS_ENABLED(CONFIG_UNICODE)
- if (IS_CASEFOLDED(dir) && !de && use_hash) {
+ if (!sb_no_casefold_compat_fallback(dir->i_sb) &&
+ IS_CASEFOLDED(dir) && !de && use_hash) {
use_hash = false;
goto start_find_entry;
}
@@ -384,7 +385,7 @@ out:
* Entry is guaranteed to be valid.
*/
struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
- const struct qstr *child, struct page **res_page)
+ const struct qstr *child, struct folio **res_folio)
{
struct f2fs_dir_entry *de = NULL;
struct f2fs_filename fname;
@@ -393,67 +394,67 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
err = f2fs_setup_filename(dir, child, 1, &fname);
if (err) {
if (err == -ENOENT)
- *res_page = NULL;
+ *res_folio = NULL;
else
- *res_page = ERR_PTR(err);
+ *res_folio = ERR_PTR(err);
return NULL;
}
- de = __f2fs_find_entry(dir, &fname, res_page);
+ de = __f2fs_find_entry(dir, &fname, res_folio);
f2fs_free_filename(&fname);
return de;
}
-struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f)
{
- return f2fs_find_entry(dir, &dotdot_name, p);
+ return f2fs_find_entry(dir, &dotdot_name, f);
}
ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
- struct page **page)
+ struct folio **folio)
{
ino_t res = 0;
struct f2fs_dir_entry *de;
- de = f2fs_find_entry(dir, qstr, page);
+ de = f2fs_find_entry(dir, qstr, folio);
if (de) {
res = le32_to_cpu(de->ino);
- f2fs_put_page(*page, 0);
+ f2fs_folio_put(*folio, false);
}
return res;
}
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
- struct page *page, struct inode *inode)
+ struct folio *folio, struct inode *inode)
{
enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA;
- lock_page(page);
- f2fs_wait_on_page_writeback(page, type, true, true);
+ folio_lock(folio);
+ f2fs_folio_wait_writeback(folio, type, true, true);
de->ino = cpu_to_le32(inode->i_ino);
de->file_type = fs_umode_to_ftype(inode->i_mode);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
}
static void init_dent_inode(struct inode *dir, struct inode *inode,
const struct f2fs_filename *fname,
- struct page *ipage)
+ struct folio *ifolio)
{
struct f2fs_inode *ri;
if (!fname) /* tmpfile case? */
return;
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
- /* copy name info. to this inode page */
- ri = F2FS_INODE(ipage);
+ /* copy name info. to this inode folio */
+ ri = F2FS_INODE(ifolio);
ri->i_namelen = cpu_to_le32(fname->disk_name.len);
memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len);
if (IS_ENCRYPTED(dir)) {
@@ -474,7 +475,7 @@ static void init_dent_inode(struct inode *dir, struct inode *inode,
file_lost_pino(inode);
}
}
- set_page_dirty(ipage);
+ folio_mark_dirty(ifolio);
}
void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
@@ -491,72 +492,73 @@ void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
}
static int make_empty_dir(struct inode *inode,
- struct inode *parent, struct page *page)
+ struct inode *parent, struct folio *folio)
{
- struct page *dentry_page;
+ struct folio *dentry_folio;
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dentry_ptr d;
if (f2fs_has_inline_dentry(inode))
- return f2fs_make_empty_inline_dir(inode, parent, page);
+ return f2fs_make_empty_inline_dir(inode, parent, folio);
- dentry_page = f2fs_get_new_data_page(inode, page, 0, true);
- if (IS_ERR(dentry_page))
- return PTR_ERR(dentry_page);
+ dentry_folio = f2fs_get_new_data_folio(inode, folio, 0, true);
+ if (IS_ERR(dentry_folio))
+ return PTR_ERR(dentry_folio);
- dentry_blk = page_address(dentry_page);
+ dentry_blk = folio_address(dentry_folio);
make_dentry_ptr_block(NULL, &d, dentry_blk);
f2fs_do_make_empty_dir(inode, parent, &d);
- set_page_dirty(dentry_page);
- f2fs_put_page(dentry_page, 1);
+ folio_mark_dirty(dentry_folio);
+ f2fs_folio_put(dentry_folio, true);
return 0;
}
-struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
- const struct f2fs_filename *fname, struct page *dpage)
+struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
+ const struct f2fs_filename *fname, struct folio *dfolio)
{
- struct page *page;
+ struct folio *folio;
int err;
if (is_inode_flag_set(inode, FI_NEW_INODE)) {
- page = f2fs_new_inode_page(inode);
- if (IS_ERR(page))
- return page;
+ folio = f2fs_new_inode_folio(inode);
+ if (IS_ERR(folio))
+ return folio;
if (S_ISDIR(inode->i_mode)) {
/* in order to handle error case */
- get_page(page);
- err = make_empty_dir(inode, dir, page);
+ folio_get(folio);
+ err = make_empty_dir(inode, dir, folio);
if (err) {
- lock_page(page);
+ folio_lock(folio);
goto put_error;
}
- put_page(page);
+ folio_put(folio);
}
- err = f2fs_init_acl(inode, dir, page, dpage);
+ err = f2fs_init_acl(inode, dir, folio, dfolio);
if (err)
goto put_error;
err = f2fs_init_security(inode, dir,
- fname ? fname->usr_fname : NULL, page);
+ fname ? fname->usr_fname : NULL,
+ folio);
if (err)
goto put_error;
if (IS_ENCRYPTED(inode)) {
- err = fscrypt_set_context(inode, page);
+ err = fscrypt_set_context(inode, folio);
if (err)
goto put_error;
}
} else {
- page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino);
- if (IS_ERR(page))
- return page;
+ folio = f2fs_get_inode_folio(F2FS_I_SB(dir), inode->i_ino);
+ if (IS_ERR(folio))
+ return folio;
}
- init_dent_inode(dir, inode, fname, page);
+ init_dent_inode(dir, inode, fname, folio);
/*
* This file should be checkpointed during fsync.
@@ -573,12 +575,12 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
f2fs_i_links_write(inode, true);
}
- return page;
+ return folio;
put_error:
clear_nlink(inode);
- f2fs_update_inode(inode, page);
- f2fs_put_page(page, 1);
+ f2fs_update_inode(inode, folio);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
@@ -620,14 +622,14 @@ next:
goto next;
}
-bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
+bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio,
const struct f2fs_filename *fname)
{
struct f2fs_dentry_ptr d;
unsigned int bit_pos;
int slots = GET_DENTRY_SLOTS(fname->disk_name.len);
- make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage));
+ make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ifolio));
bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max);
@@ -664,10 +666,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname,
unsigned int current_depth;
unsigned long bidx, block;
unsigned int nbucket, nblock;
- struct page *dentry_page = NULL;
+ struct folio *dentry_folio = NULL;
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dentry_ptr d;
- struct page *page = NULL;
+ struct folio *folio = NULL;
int slots, err = 0;
level = 0;
@@ -697,30 +699,30 @@ start:
(le32_to_cpu(fname->hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
- dentry_page = f2fs_get_new_data_page(dir, NULL, block, true);
- if (IS_ERR(dentry_page))
- return PTR_ERR(dentry_page);
+ dentry_folio = f2fs_get_new_data_folio(dir, NULL, block, true);
+ if (IS_ERR(dentry_folio))
+ return PTR_ERR(dentry_folio);
- dentry_blk = page_address(dentry_page);
+ dentry_blk = folio_address(dentry_folio);
bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap,
slots, NR_DENTRY_IN_BLOCK);
if (bit_pos < NR_DENTRY_IN_BLOCK)
goto add_dentry;
- f2fs_put_page(dentry_page, 1);
+ f2fs_folio_put(dentry_folio, true);
}
/* Move to next level to find the empty slot for new dentry */
++level;
goto start;
add_dentry:
- f2fs_wait_on_page_writeback(dentry_page, DATA, true, true);
+ f2fs_folio_wait_writeback(dentry_folio, DATA, true, true);
if (inode) {
f2fs_down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_init_inode_metadata(inode, dir, fname, NULL);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto fail;
}
}
@@ -729,16 +731,16 @@ add_dentry:
f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash,
bit_pos);
- set_page_dirty(dentry_page);
+ folio_mark_dirty(dentry_folio);
if (inode) {
f2fs_i_pino_write(inode, dir->i_ino);
/* synchronize inode page's data from inode cache */
if (is_inode_flag_set(inode, FI_NEW_INODE))
- f2fs_update_inode(inode, page);
+ f2fs_update_inode(inode, folio);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
}
f2fs_update_parent_metadata(dir, inode, current_depth);
@@ -746,7 +748,7 @@ fail:
if (inode)
f2fs_up_write(&F2FS_I(inode)->i_sem);
- f2fs_put_page(dentry_page, 1);
+ f2fs_folio_put(dentry_folio, true);
return err;
}
@@ -780,7 +782,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode)
{
struct f2fs_filename fname;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct f2fs_dir_entry *de = NULL;
int err;
@@ -796,14 +798,14 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
* consistency more.
*/
if (current != F2FS_I(dir)->task) {
- de = __f2fs_find_entry(dir, &fname, &page);
+ de = __f2fs_find_entry(dir, &fname, &folio);
F2FS_I(dir)->task = NULL;
}
if (de) {
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
err = -EEXIST;
- } else if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ } else if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
} else {
err = f2fs_add_dentry(dir, &fname, inode, ino, mode);
}
@@ -814,16 +816,16 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
struct f2fs_filename *fname)
{
- struct page *page;
+ struct folio *folio;
int err = 0;
f2fs_down_write(&F2FS_I(inode)->i_sem);
- page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_init_inode_metadata(inode, dir, fname, NULL);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto fail;
}
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
clear_inode_flag(inode, FI_NEW_INODE);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
@@ -859,13 +861,13 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode)
* It only removes the dentry from the dentry page, corresponding name
* entry in name page does not need to be touched during deletion.
*/
-void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio,
struct inode *dir, struct inode *inode)
{
- struct f2fs_dentry_block *dentry_blk;
+ struct f2fs_dentry_block *dentry_blk;
unsigned int bit_pos;
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
- pgoff_t index = page_folio(page)->index;
+ pgoff_t index = folio->index;
int i;
f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
@@ -874,12 +876,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO);
if (f2fs_has_inline_dentry(dir))
- return f2fs_delete_inline_entry(dentry, page, dir, inode);
+ return f2fs_delete_inline_entry(dentry, folio, dir, inode);
- lock_page(page);
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ folio_lock(folio);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
- dentry_blk = page_address(page);
+ dentry_blk = folio_address(folio);
bit_pos = dentry - dentry_blk->dentry;
for (i = 0; i < slots; i++)
__clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
@@ -888,19 +890,19 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
NR_DENTRY_IN_BLOCK,
0);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, index, index + 1)) {
- f2fs_clear_page_cache_dirty_tag(page_folio(page));
- clear_page_dirty_for_io(page);
- ClearPageUptodate(page);
- clear_page_private_all(page);
+ f2fs_clear_page_cache_dirty_tag(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_clear_uptodate(folio);
+ folio_detach_private(folio);
inode_dec_dirty_pages(dir);
f2fs_remove_dirty_inode(dir);
}
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
@@ -912,7 +914,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
bool f2fs_empty_dir(struct inode *dir)
{
unsigned long bidx = 0;
- struct page *dentry_page;
unsigned int bit_pos;
struct f2fs_dentry_block *dentry_blk;
unsigned long nblock = dir_blocks(dir);
@@ -922,10 +923,11 @@ bool f2fs_empty_dir(struct inode *dir)
while (bidx < nblock) {
pgoff_t next_pgofs;
+ struct folio *dentry_folio;
- dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs);
- if (IS_ERR(dentry_page)) {
- if (PTR_ERR(dentry_page) == -ENOENT) {
+ dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs);
+ if (IS_ERR(dentry_folio)) {
+ if (PTR_ERR(dentry_folio) == -ENOENT) {
bidx = next_pgofs;
continue;
} else {
@@ -933,7 +935,7 @@ bool f2fs_empty_dir(struct inode *dir)
}
}
- dentry_blk = page_address(dentry_page);
+ dentry_blk = folio_address(dentry_folio);
if (bidx == 0)
bit_pos = 2;
else
@@ -942,7 +944,7 @@ bool f2fs_empty_dir(struct inode *dir)
NR_DENTRY_IN_BLOCK,
bit_pos);
- f2fs_put_page(dentry_page, 0);
+ f2fs_folio_put(dentry_folio, false);
if (bit_pos < NR_DENTRY_IN_BLOCK)
return false;
@@ -1041,7 +1043,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
unsigned long npages = dir_blocks(inode);
struct f2fs_dentry_block *dentry_blk = NULL;
- struct page *dentry_page = NULL;
struct file_ra_state *ra = &file->f_ra;
loff_t start_pos = ctx->pos;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
@@ -1065,6 +1066,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
}
for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+ struct folio *dentry_folio;
pgoff_t next_pgofs;
/* allow readdir() to be interrupted */
@@ -1079,9 +1081,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
page_cache_sync_readahead(inode->i_mapping, ra, file, n,
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
- dentry_page = f2fs_find_data_page(inode, n, &next_pgofs);
- if (IS_ERR(dentry_page)) {
- err = PTR_ERR(dentry_page);
+ dentry_folio = f2fs_find_data_folio(inode, n, &next_pgofs);
+ if (IS_ERR(dentry_folio)) {
+ err = PTR_ERR(dentry_folio);
if (err == -ENOENT) {
err = 0;
n = next_pgofs;
@@ -1091,18 +1093,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
}
}
- dentry_blk = page_address(dentry_page);
+ dentry_blk = folio_address(dentry_folio);
make_dentry_ptr_block(inode, &d, dentry_blk);
err = f2fs_fill_dentries(ctx, &d,
n * NR_DENTRY_IN_BLOCK, &fstr);
- if (err) {
- f2fs_put_page(dentry_page, 0);
+ f2fs_folio_put(dentry_folio, false);
+ if (err)
break;
- }
-
- f2fs_put_page(dentry_page, 0);
n++;
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 347b3b647834..199c1e7a83ef 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -19,10 +19,10 @@
#include "node.h"
#include <trace/events/f2fs.h>
-bool sanity_check_extent_cache(struct inode *inode, struct page *ipage)
+bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
+ struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext;
struct extent_info ei;
int devi;
@@ -407,21 +407,21 @@ static void __drop_largest_extent(struct extent_tree *et,
}
}
-void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage)
+void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree_info *eti = &sbi->extent_tree[EX_READ];
- struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext;
+ struct f2fs_extent *i_ext = &F2FS_INODE(ifolio)->i_ext;
struct extent_tree *et;
struct extent_node *en;
- struct extent_info ei;
+ struct extent_info ei = {0};
if (!__may_extent_tree(inode, EX_READ)) {
/* drop largest read extent */
if (i_ext->len) {
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
i_ext->len = 0;
- set_page_dirty(ipage);
+ folio_mark_dirty(ifolio);
}
set_inode_flag(inode, FI_NO_EXTENT);
return;
@@ -934,7 +934,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ
if (!__may_extent_tree(dn->inode, type))
return;
- ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) +
+ ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio), dn->inode) +
dn->ofs_in_node;
ei.len = 1;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 1afa7be16e7d..46be7560548c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -62,16 +62,26 @@ enum {
FAULT_BLKADDR_VALIDITY,
FAULT_BLKADDR_CONSISTENCE,
FAULT_NO_SEGMENT,
+ FAULT_INCONSISTENT_FOOTER,
+ FAULT_TIMEOUT,
+ FAULT_VMALLOC,
FAULT_MAX,
};
-#ifdef CONFIG_F2FS_FAULT_INJECTION
-#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0))
+/* indicate which option to update */
+enum fault_option {
+ FAULT_RATE = 1, /* only update fault rate */
+ FAULT_TYPE = 2, /* only update fault type */
+ FAULT_ALL = 4, /* reset all fault injection options/stats */
+};
+#ifdef CONFIG_F2FS_FAULT_INJECTION
struct f2fs_fault_info {
atomic_t inject_ops;
int inject_rate;
unsigned int inject_type;
+ /* Used to account total count of injection for each type */
+ unsigned int inject_count[FAULT_MAX];
};
extern const char *f2fs_fault_name[FAULT_MAX];
@@ -114,6 +124,13 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_GC_MERGE 0x02000000
#define F2FS_MOUNT_COMPRESS_CACHE 0x04000000
#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000
+#define F2FS_MOUNT_NAT_BITS 0x10000000
+#define F2FS_MOUNT_INLINECRYPT 0x20000000
+/*
+ * Some f2fs environments expect to be able to pass the "lazytime" option
+ * string rather than using the MS_LAZYTIME flag, so this must remain.
+ */
+#define F2FS_MOUNT_LAZYTIME 0x40000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -309,7 +326,7 @@ struct inode_entry {
struct fsync_node_entry {
struct list_head list; /* list head */
- struct page *page; /* warm node page pointer */
+ struct folio *folio; /* warm node folio pointer */
unsigned int seq_id; /* sequence id */
};
@@ -369,7 +386,7 @@ struct discard_cmd {
struct rb_node rb_node; /* rb node located in rb-tree */
struct discard_info di; /* discard info */
struct list_head list; /* command list */
- struct completion wait; /* compleation */
+ struct completion wait; /* completion */
struct block_device *bdev; /* bdev */
unsigned short ref; /* reference count */
unsigned char state; /* state */
@@ -598,6 +615,9 @@ enum {
/* congestion wait timeout value, default: 20ms */
#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
+/* timeout value injected, default: 1000ms */
+#define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000))
+
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
@@ -712,6 +732,7 @@ struct f2fs_map_blocks {
block_t m_lblk;
unsigned int m_len;
unsigned int m_flags;
+ unsigned long m_last_pblk; /* last allocated block, only used for DIO in LFS mode */
pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */
pgoff_t *m_next_extent; /* point to next possible extent */
int m_seg_type;
@@ -813,6 +834,7 @@ enum {
FI_ATOMIC_DIRTIED, /* indicate atomic file is dirtied */
FI_ATOMIC_REPLACE, /* indicate atomic replace */
FI_OPENED_FILE, /* indicate file has been opened */
+ FI_DONATE_FINISHED, /* indicate page donation of file has been finished */
FI_MAX, /* max flag, never be used */
};
@@ -830,6 +852,7 @@ struct f2fs_inode_info {
/* Use below internally in f2fs*/
unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
+ unsigned int ioprio_hint; /* hint for IO priority */
struct f2fs_rwsem i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
@@ -849,6 +872,12 @@ struct f2fs_inode_info {
#endif
struct list_head dirty_list; /* dirty list for dirs and files */
struct list_head gdirty_list; /* linked in global dirty list */
+
+ /* linked in global inode list for cache donation */
+ struct list_head gdonate_list;
+ pgoff_t donate_start, donate_end; /* inclusive */
+ atomic_t open_count; /* # of open files */
+
struct task_struct *atomic_write_task; /* store atomic write task */
struct extent_tree *extent_tree[NR_EXTENT_CACHES];
/* cached extent_tree entry */
@@ -980,11 +1009,11 @@ struct f2fs_nm_info {
*/
struct dnode_of_data {
struct inode *inode; /* vfs inode pointer */
- struct page *inode_page; /* its inode page, NULL is possible */
- struct page *node_page; /* cached direct node page */
+ struct folio *inode_folio; /* its inode folio, NULL is possible */
+ struct folio *node_folio; /* cached direct node folio */
nid_t nid; /* node id of the direct node block */
unsigned int ofs_in_node; /* data offset in the node page */
- bool inode_page_locked; /* inode page is locked or not */
+ bool inode_folio_locked; /* inode folio is locked or not */
bool node_changed; /* is node block changed */
char cur_level; /* level of hole node page */
char max_level; /* level of current page located */
@@ -992,12 +1021,12 @@ struct dnode_of_data {
};
static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
- struct page *ipage, struct page *npage, nid_t nid)
+ struct folio *ifolio, struct folio *nfolio, nid_t nid)
{
memset(dn, 0, sizeof(*dn));
dn->inode = inode;
- dn->inode_page = ipage;
- dn->node_page = npage;
+ dn->inode_folio = ifolio;
+ dn->node_folio = nfolio;
dn->nid = nid;
}
@@ -1096,8 +1125,8 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
-#define WB_DATA_TYPE(p, f) \
- (f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
+#define WB_DATA_TYPE(folio, f) \
+ (f || f2fs_is_cp_guaranteed(folio) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -1213,7 +1242,10 @@ struct f2fs_io_info {
blk_opf_t op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
- struct page *page; /* page to be written */
+ union {
+ struct page *page; /* page to be written */
+ struct folio *folio;
+ };
struct page *encrypted_page; /* encrypted page */
struct page *compressed_page; /* compressed page */
struct list_head list; /* serialize IOs */
@@ -1259,7 +1291,7 @@ struct f2fs_bio_info {
struct f2fs_dev_info {
struct file *bdev_file;
struct block_device *bdev;
- char path[MAX_PATH_LEN];
+ char path[MAX_PATH_LEN + 1];
unsigned int total_segments;
block_t start_blk;
block_t end_blk;
@@ -1273,6 +1305,7 @@ enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
DIRTY_META, /* for all dirtied inode metadata */
+ DONATE_INODE, /* for all inode to donate pages */
NR_INODE_TYPE,
};
@@ -1399,7 +1432,7 @@ enum {
enum {
MEMORY_MODE_NORMAL, /* memory mode for normal devices */
- MEMORY_MODE_LOW, /* memory mode for low memry devices */
+ MEMORY_MODE_LOW, /* memory mode for low memory devices */
};
enum errors_option {
@@ -1463,7 +1496,7 @@ enum compress_flag {
#define COMPRESS_DATA_RESERVED_SIZE 4
struct compress_data {
__le32 clen; /* compressed data size */
- __le32 chksum; /* compressed data chksum */
+ __le32 chksum; /* compressed data checksum */
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
u8 cdata[]; /* compressed data */
};
@@ -1508,6 +1541,7 @@ struct compress_io_ctx {
struct decompress_io_ctx {
u32 magic; /* magic number to indicate page is compressed */
struct inode *inode; /* inode the context belong to */
+ struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
pgoff_t cluster_idx; /* cluster index number */
unsigned int cluster_size; /* page count in cluster */
unsigned int log_cluster_size; /* log of cluster size */
@@ -1548,6 +1582,7 @@ struct decompress_io_ctx {
bool failed; /* IO error occurred before decompression? */
bool need_verity; /* need fs-verity verification after decompression? */
+ unsigned char compress_algorithm; /* backup algorithm type */
void *private; /* payload buffer for specified decompression algorithm */
void *private2; /* extra payload buffer */
struct work_struct verity_work; /* work to verify the decompressed pages */
@@ -1628,6 +1663,9 @@ struct f2fs_sb_info {
unsigned int warm_data_age_threshold;
unsigned int last_age_weight;
+ /* control donate caches */
+ unsigned int donate_files;
+
/* basic filesystem units */
unsigned int log_sectors_per_block; /* log2 sectors per block */
unsigned int log_blocksize; /* log2 block size */
@@ -1659,6 +1697,7 @@ struct f2fs_sb_info {
unsigned int nquota_files; /* # of quota sysfile */
struct f2fs_rwsem quota_sem; /* blocking cp for flags */
+ struct task_struct *umount_lock_holder; /* s_umount lock holder */
/* # of pages, see count_type */
atomic_t nr_pages[NR_COUNT_TYPE];
@@ -1692,6 +1731,9 @@ struct f2fs_sb_info {
/* for skip statistic */
unsigned long long skipped_gc_rwsem; /* FG_GC only */
+ /* free sections reserved for pinned file */
+ unsigned int reserved_pin_section;
+
/* threshold for gc trials on pinned files */
unsigned short gc_pin_file_threshold;
struct f2fs_rwsem pin_sem;
@@ -1761,7 +1803,7 @@ struct f2fs_sb_info {
unsigned int dirty_device; /* for checkpoint data flush */
spinlock_t dev_lock; /* protect dirty_device */
bool aligned_blksize; /* all devices has the same logical blksize */
- unsigned int first_zoned_segno; /* first zoned segno */
+ unsigned int first_seq_zone_segno; /* first segno in sequential zone */
/* For write statistics */
u64 sectors_written_start;
@@ -1800,6 +1842,9 @@ struct f2fs_sb_info {
u64 committed_atomic_block;
u64 revoked_atomic_block;
+ /* carve out reserved_blocks from total blocks */
+ bool carve_out;
+
#ifdef CONFIG_F2FS_FS_COMPRESSION
struct kmem_cache *page_array_slab; /* page array entry */
unsigned int page_array_slab_size; /* default page array slab size */
@@ -1880,6 +1925,7 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type,
atomic_inc(&ffi->inject_ops);
if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
atomic_set(&ffi->inject_ops, 0);
+ ffi->inject_count[type]++;
f2fs_info_ratelimited(sbi, "inject %s in %s of %pS",
f2fs_fault_name[type], func, parent_func);
return true;
@@ -1941,28 +1987,20 @@ static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi,
/*
* Inline functions
*/
-static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 __f2fs_crc32(u32 crc, const void *address,
+ unsigned int length)
{
return crc32(crc, address, length);
}
-static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address,
- unsigned int length)
-{
- return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length);
-}
-
-static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc,
- void *buf, size_t buf_size)
+static inline u32 f2fs_crc32(const void *address, unsigned int length)
{
- return f2fs_crc32(sbi, buf, buf_size) == blk_crc;
+ return __f2fs_crc32(F2FS_SUPER_MAGIC, address, length);
}
-static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 f2fs_chksum(u32 crc, const void *address, unsigned int length)
{
- return __f2fs_crc32(sbi, crc, address, length);
+ return __f2fs_crc32(crc, address, length);
}
static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
@@ -1985,16 +2023,11 @@ static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
return F2FS_I_SB(mapping->host);
}
-static inline struct f2fs_sb_info *F2FS_F_SB(struct folio *folio)
+static inline struct f2fs_sb_info *F2FS_F_SB(const struct folio *folio)
{
return F2FS_M_SB(folio->mapping);
}
-static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
-{
- return F2FS_F_SB(page_folio(page));
-}
-
static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
{
return (struct f2fs_super_block *)(sbi->raw_super);
@@ -2015,14 +2048,14 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
return (struct f2fs_checkpoint *)(sbi->ckpt);
}
-static inline struct f2fs_node *F2FS_NODE(struct page *page)
+static inline struct f2fs_node *F2FS_NODE(const struct folio *folio)
{
- return (struct f2fs_node *)page_address(page);
+ return (struct f2fs_node *)folio_address(folio);
}
-static inline struct f2fs_inode *F2FS_INODE(struct page *page)
+static inline struct f2fs_inode *F2FS_INODE(const struct folio *folio)
{
- return &((struct f2fs_node *)page_address(page))->i;
+ return &((struct f2fs_node *)folio_address(folio))->i;
}
static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
@@ -2060,6 +2093,16 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
return sbi->node_inode->i_mapping;
}
+static inline bool is_meta_folio(struct folio *folio)
+{
+ return folio->mapping == META_MAPPING(F2FS_F_SB(folio));
+}
+
+static inline bool is_node_folio(struct folio *folio)
+{
+ return folio->mapping == NODE_MAPPING(F2FS_F_SB(folio));
+}
+
static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
{
return test_bit(type, &sbi->s_flag);
@@ -2219,6 +2262,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
#endif
}
+static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
+{
+ unsigned long flags;
+ unsigned char *nat_bits;
+
+ /*
+ * In order to re-enable nat_bits we need to call fsck.f2fs by
+ * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost,
+ * so let's rely on regular fsck or unclean shutdown.
+ */
+
+ if (lock)
+ spin_lock_irqsave(&sbi->cp_lock, flags);
+ __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG);
+ nat_bits = NM_I(sbi)->nat_bits;
+ NM_I(sbi)->nat_bits = NULL;
+ if (lock)
+ spin_unlock_irqrestore(&sbi->cp_lock, flags);
+
+ kvfree(nat_bits);
+}
+
+static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
+ struct cp_control *cpc)
+{
+ bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
+
+ return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set;
+}
+
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
f2fs_down_read(&sbi->cp_rwsem);
@@ -2385,6 +2458,13 @@ release_quota:
}
#define PAGE_PRIVATE_GET_FUNC(name, flagname) \
+static inline bool folio_test_f2fs_##name(const struct folio *folio) \
+{ \
+ unsigned long priv = (unsigned long)folio->private; \
+ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \
+ (1UL << PAGE_PRIVATE_##flagname); \
+ return (priv & v) == v; \
+} \
static inline bool page_private_##name(struct page *page) \
{ \
return PagePrivate(page) && \
@@ -2393,6 +2473,17 @@ static inline bool page_private_##name(struct page *page) \
}
#define PAGE_PRIVATE_SET_FUNC(name, flagname) \
+static inline void folio_set_f2fs_##name(struct folio *folio) \
+{ \
+ unsigned long v = (1UL << PAGE_PRIVATE_NOT_POINTER) | \
+ (1UL << PAGE_PRIVATE_##flagname); \
+ if (!folio->private) \
+ folio_attach_private(folio, (void *)v); \
+ else { \
+ v |= (unsigned long)folio->private; \
+ folio->private = (void *)v; \
+ } \
+} \
static inline void set_page_private_##name(struct page *page) \
{ \
if (!PagePrivate(page)) \
@@ -2402,6 +2493,16 @@ static inline void set_page_private_##name(struct page *page) \
}
#define PAGE_PRIVATE_CLEAR_FUNC(name, flagname) \
+static inline void folio_clear_f2fs_##name(struct folio *folio) \
+{ \
+ unsigned long v = (unsigned long)folio->private; \
+ \
+ v &= ~(1UL << PAGE_PRIVATE_##flagname); \
+ if (v == (1UL << PAGE_PRIVATE_NOT_POINTER)) \
+ folio_detach_private(folio); \
+ else \
+ folio->private = (void *)v; \
+} \
static inline void clear_page_private_##name(struct page *page) \
{ \
clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
@@ -2424,39 +2525,23 @@ PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
PAGE_PRIVATE_CLEAR_FUNC(atomic, ATOMIC_WRITE);
-static inline unsigned long get_page_private_data(struct page *page)
+static inline unsigned long folio_get_f2fs_data(struct folio *folio)
{
- unsigned long data = page_private(page);
+ unsigned long data = (unsigned long)folio->private;
if (!test_bit(PAGE_PRIVATE_NOT_POINTER, &data))
return 0;
return data >> PAGE_PRIVATE_MAX;
}
-static inline void set_page_private_data(struct page *page, unsigned long data)
+static inline void folio_set_f2fs_data(struct folio *folio, unsigned long data)
{
- if (!PagePrivate(page))
- attach_page_private(page, (void *)0);
- set_bit(PAGE_PRIVATE_NOT_POINTER, &page_private(page));
- page_private(page) |= data << PAGE_PRIVATE_MAX;
-}
+ data = (1UL << PAGE_PRIVATE_NOT_POINTER) | (data << PAGE_PRIVATE_MAX);
-static inline void clear_page_private_data(struct page *page)
-{
- page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0);
- if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER))
- detach_page_private(page);
-}
-
-static inline void clear_page_private_all(struct page *page)
-{
- clear_page_private_data(page);
- clear_page_private_reference(page);
- clear_page_private_gcing(page);
- clear_page_private_inline(page);
- clear_page_private_atomic(page);
-
- f2fs_bug_on(F2FS_P_SB(page), page_private(page));
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, (void *)data);
+ else
+ folio->private = (void *)((unsigned long)folio->private | data);
}
static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
@@ -2466,8 +2551,14 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK;
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
- sbi->total_valid_block_count -= (block_t)count;
+ if (unlikely(sbi->total_valid_block_count < count)) {
+ f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u",
+ sbi->total_valid_block_count, inode->i_ino, count);
+ sbi->total_valid_block_count = 0;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ } else {
+ sbi->total_valid_block_count -= count;
+ }
if (sbi->reserved_blocks &&
sbi->current_reserved_blocks < sbi->reserved_blocks)
sbi->current_reserved_blocks = min(sbi->reserved_blocks,
@@ -2765,33 +2856,46 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi)
return percpu_counter_sum_positive(&sbi->total_valid_inode_count);
}
-static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
- pgoff_t index, bool for_write)
+static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping,
+ pgoff_t index, bool for_write)
{
- struct page *page;
+ struct folio *folio;
unsigned int flags;
if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) {
+ fgf_t fgf_flags;
+
if (!for_write)
- page = find_get_page_flags(mapping, index,
- FGP_LOCK | FGP_ACCESSED);
+ fgf_flags = FGP_LOCK | FGP_ACCESSED;
else
- page = find_lock_page(mapping, index);
- if (page)
- return page;
+ fgf_flags = FGP_LOCK;
+ folio = __filemap_get_folio(mapping, index, fgf_flags, 0);
+ if (!IS_ERR(folio))
+ return folio;
if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC))
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
if (!for_write)
- return grab_cache_page(mapping, index);
+ return filemap_grab_folio(mapping, index);
flags = memalloc_nofs_save();
- page = grab_cache_page_write_begin(mapping, index);
+ folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+ mapping_gfp_mask(mapping));
memalloc_nofs_restore(flags);
- return page;
+ return folio;
+}
+
+static inline struct folio *f2fs_filemap_get_folio(
+ struct address_space *mapping, pgoff_t index,
+ fgf_t fgp_flags, gfp_t gfp_mask)
+{
+ if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET))
+ return ERR_PTR(-ENOMEM);
+
+ return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask);
}
static inline struct page *f2fs_pagecache_get_page(
@@ -2804,26 +2908,33 @@ static inline struct page *f2fs_pagecache_get_page(
return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
}
-static inline void f2fs_put_page(struct page *page, int unlock)
+static inline void f2fs_folio_put(struct folio *folio, bool unlock)
{
- if (!page)
+ if (IS_ERR_OR_NULL(folio))
return;
if (unlock) {
- f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
- unlock_page(page);
+ f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio));
+ folio_unlock(folio);
}
- put_page(page);
+ folio_put(folio);
+}
+
+static inline void f2fs_put_page(struct page *page, int unlock)
+{
+ if (!page)
+ return;
+ f2fs_folio_put(page_folio(page), unlock);
}
static inline void f2fs_put_dnode(struct dnode_of_data *dn)
{
- if (dn->node_page)
- f2fs_put_page(dn->node_page, 1);
- if (dn->inode_page && dn->node_page != dn->inode_page)
- f2fs_put_page(dn->inode_page, 0);
- dn->node_page = NULL;
- dn->inode_page = NULL;
+ if (dn->node_folio)
+ f2fs_folio_put(dn->node_folio, true);
+ if (dn->inode_folio && dn->node_folio != dn->inode_folio)
+ f2fs_folio_put(dn->inode_folio, false);
+ dn->node_folio = NULL;
+ dn->inode_folio = NULL;
}
static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
@@ -2917,9 +3028,9 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
-static inline bool IS_INODE(struct page *page)
+static inline bool IS_INODE(const struct folio *folio)
{
- struct f2fs_node *p = F2FS_NODE(page);
+ struct f2fs_node *p = F2FS_NODE(folio);
return RAW_IS_INODE(p);
}
@@ -2937,31 +3048,31 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
static inline int f2fs_has_extra_attr(struct inode *inode);
static inline unsigned int get_dnode_base(struct inode *inode,
- struct page *node_page)
+ struct folio *node_folio)
{
- if (!IS_INODE(node_page))
+ if (!IS_INODE(node_folio))
return 0;
return inode ? get_extra_isize(inode) :
- offset_in_addr(&F2FS_NODE(node_page)->i);
+ offset_in_addr(&F2FS_NODE(node_folio)->i);
}
static inline __le32 *get_dnode_addr(struct inode *inode,
- struct page *node_page)
+ struct folio *node_folio)
{
- return blkaddr_in_node(F2FS_NODE(node_page)) +
- get_dnode_base(inode, node_page);
+ return blkaddr_in_node(F2FS_NODE(node_folio)) +
+ get_dnode_base(inode, node_folio);
}
static inline block_t data_blkaddr(struct inode *inode,
- struct page *node_page, unsigned int offset)
+ struct folio *node_folio, unsigned int offset)
{
- return le32_to_cpu(*(get_dnode_addr(inode, node_page) + offset));
+ return le32_to_cpu(*(get_dnode_addr(inode, node_folio) + offset));
}
static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn)
{
- return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node);
+ return data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node);
}
static inline int f2fs_test_bit(unsigned int nr, char *addr)
@@ -3272,9 +3383,10 @@ static inline unsigned int addrs_per_page(struct inode *inode,
return addrs;
}
-static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
+static inline
+void *inline_xattr_addr(struct inode *inode, const struct folio *folio)
{
- struct f2fs_inode *ri = F2FS_INODE(page);
+ struct f2fs_inode *ri = F2FS_INODE(folio);
return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
get_inline_xattr_addrs(inode)]);
@@ -3289,7 +3401,7 @@ static inline int inline_xattr_size(struct inode *inode)
/*
* Notice: check inline_data flag without inode page lock is unsafe.
- * It could change at any time by f2fs_convert_inline_page().
+ * It could change at any time by f2fs_convert_inline_folio().
*/
static inline int f2fs_has_inline_data(struct inode *inode)
{
@@ -3321,9 +3433,9 @@ static inline bool f2fs_is_cow_file(struct inode *inode)
return is_inode_flag_set(inode, FI_COW_FILE);
}
-static inline void *inline_data_addr(struct inode *inode, struct page *page)
+static inline void *inline_data_addr(struct inode *inode, struct folio *folio)
{
- __le32 *addr = get_dnode_addr(inode, page);
+ __le32 *addr = get_dnode_addr(inode, folio);
return (void *)(addr + DEF_INLINE_RESERVED_SIZE);
}
@@ -3449,6 +3561,14 @@ static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi,
return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO);
}
+static inline void *f2fs_vmalloc(struct f2fs_sb_info *sbi, size_t size)
+{
+ if (time_to_inject(sbi, FAULT_VMALLOC))
+ return NULL;
+
+ return vmalloc(size);
+}
+
static inline int get_extra_isize(struct inode *inode)
{
return F2FS_I(inode)->i_extra_isize / sizeof(__le32);
@@ -3513,9 +3633,9 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count);
int f2fs_do_shutdown(struct f2fs_sb_info *sbi, unsigned int flag,
bool readonly, bool need_lock);
int f2fs_precache_extents(struct inode *inode);
-int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int f2fs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid);
@@ -3525,14 +3645,15 @@ int f2fs_pin_file_control(struct inode *inode, bool inc);
* inode.c
*/
void f2fs_set_inode_flags(struct inode *inode);
-bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page);
-void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page);
+bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio);
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio);
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino);
struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino);
int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink);
-void f2fs_update_inode(struct inode *inode, struct page *node_page);
+void f2fs_update_inode(struct inode *inode, struct folio *node_folio);
void f2fs_update_inode_page(struct inode *inode);
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
+void f2fs_remove_donate_inode(struct inode *inode);
void f2fs_evict_inode(struct inode *inode);
void f2fs_handle_failed_inode(struct inode *inode);
@@ -3576,23 +3697,22 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
unsigned int start_pos, struct fscrypt_str *fstr);
void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent,
struct f2fs_dentry_ptr *d);
-struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
- const struct f2fs_filename *fname, struct page *dpage);
+struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
+ const struct f2fs_filename *fname, struct folio *dfolio);
void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode,
unsigned int current_depth);
int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots);
void f2fs_drop_nlink(struct inode *dir, struct inode *inode);
struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir,
- const struct f2fs_filename *fname,
- struct page **res_page);
+ const struct f2fs_filename *fname, struct folio **res_folio);
struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
- const struct qstr *child, struct page **res_page);
-struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p);
+ const struct qstr *child, struct folio **res_folio);
+struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f);
ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr,
- struct page **page);
+ struct folio **folio);
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
- struct page *page, struct inode *inode);
-bool f2fs_has_enough_room(struct inode *dir, struct page *ipage,
+ struct folio *folio, struct inode *inode);
+bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio,
const struct f2fs_filename *fname);
void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
const struct fscrypt_str *name, f2fs_hash_t name_hash,
@@ -3603,7 +3723,7 @@ int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname,
struct inode *inode, nid_t ino, umode_t mode);
int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
struct inode *inode, nid_t ino, umode_t mode);
-void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
+void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio,
struct inode *dir, struct inode *inode);
int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
struct f2fs_filename *fname);
@@ -3624,7 +3744,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync);
void f2fs_inode_synced(struct inode *inode);
int f2fs_dquot_initialize(struct inode *inode);
int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
-int f2fs_quota_sync(struct super_block *sb, int type);
+int f2fs_do_quota_sync(struct super_block *sb, int type);
loff_t max_file_blocks(struct inode *inode);
void f2fs_quota_off_umount(struct super_block *sb);
void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag);
@@ -3647,9 +3767,9 @@ struct node_info;
int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type);
-bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page);
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio);
void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi);
-void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page);
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio);
void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi);
int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid);
bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid);
@@ -3662,14 +3782,14 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from);
int f2fs_truncate_xattr_node(struct inode *inode);
int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
unsigned int seq_id);
-bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi);
int f2fs_remove_inode_page(struct inode *inode);
-struct page *f2fs_new_inode_page(struct inode *inode);
-struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs);
+struct folio *f2fs_new_inode_folio(struct inode *inode);
+struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs);
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid);
-struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid);
-struct page *f2fs_get_node_page_ra(struct page *parent, int start);
-int f2fs_move_node_page(struct page *node_page, int gc_type);
+struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid);
+struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino);
+struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid);
+int f2fs_move_node_folio(struct folio *node_folio, int gc_type);
void f2fs_flush_inline_data(struct f2fs_sb_info *sbi);
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
@@ -3682,12 +3802,11 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid);
void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid);
void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid);
int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink);
-int f2fs_recover_inline_xattr(struct inode *inode, struct page *page);
-int f2fs_recover_xattr_data(struct inode *inode, struct page *page);
-int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page);
+int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio);
+int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio);
+int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio);
int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum);
-void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi);
int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc);
int f2fs_build_node_manager(struct f2fs_sb_info *sbi);
void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi);
@@ -3733,7 +3852,7 @@ int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
struct cp_control *cpc);
-struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno);
+struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno);
void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src,
block_t blk_addr);
void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio,
@@ -3752,14 +3871,16 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
bool recover_newaddr);
enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi,
enum log_type seg_type);
-int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio);
void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
block_t blkaddr, unsigned int blkcnt);
-void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered, bool locked);
+void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type,
+ bool ordered, bool locked);
+#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \
+ f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked)
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr);
void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
block_t len);
@@ -3782,6 +3903,11 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi,
unsigned int segno);
+static inline struct inode *fio_inode(struct f2fs_io_info *fio)
+{
+ return fio->folio->mapping->host;
+}
+
#define DEF_FRAGMENT_SIZE 4
#define MIN_FRAGMENT_SIZE 1
#define MAX_FRAGMENT_SIZE 512
@@ -3798,10 +3924,10 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi)
void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io,
unsigned char reason);
void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi);
-struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
-struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index);
-struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
-struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
+struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index);
+struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index);
+struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index);
+struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index);
bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
block_t blkaddr, int type);
bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
@@ -3846,7 +3972,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-bool f2fs_is_cp_guaranteed(struct page *page);
+bool f2fs_is_cp_guaranteed(const struct folio *folio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
@@ -3854,10 +3980,10 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
int f2fs_init_write_merge_io(struct f2fs_sb_info *sbi);
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type);
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
- struct inode *inode, struct page *page,
+ struct inode *inode, struct folio *folio,
nid_t ino, enum page_type type);
void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi,
- struct bio **bio, struct page *page);
+ struct bio **bio, struct folio *folio);
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi);
int f2fs_submit_page_bio(struct f2fs_io_info *fio);
int f2fs_merge_page_bio(struct f2fs_io_info *fio);
@@ -3871,14 +3997,14 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
int f2fs_reserve_new_block(struct dnode_of_data *dn);
int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index);
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index);
-struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
- blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
-struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index,
- pgoff_t *next_pgofs);
-struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
+struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
+ blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs);
+struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index,
+ pgoff_t *next_pgofs);
+struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index,
bool for_write);
-struct page *f2fs_get_new_data_page(struct inode *inode,
- struct page *ipage, pgoff_t index, bool new_i_size);
+struct folio *f2fs_get_new_data_folio(struct inode *inode,
+ struct folio *ifolio, pgoff_t index, bool new_i_size);
int f2fs_do_write_data_page(struct f2fs_io_info *fio);
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -3966,7 +4092,8 @@ struct f2fs_stat_info {
unsigned long long allocated_data_blocks;
int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
int ndirty_data, ndirty_qdata;
- unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
+ unsigned int ndirty_dirs, ndirty_files, ndirty_all;
+ unsigned int nquota_files, ndonate_files;
int nats, dirty_nats, sits, dirty_sits;
int free_nids, avail_nids, alloc_nids;
int total_count, utilization;
@@ -4195,28 +4322,26 @@ extern struct kmem_cache *f2fs_inode_entry_slab;
* inline.c
*/
bool f2fs_may_inline_data(struct inode *inode);
-bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage);
+bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio);
bool f2fs_may_inline_dentry(struct inode *inode);
-void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage);
-void f2fs_truncate_inline_inode(struct inode *inode,
- struct page *ipage, u64 from);
+void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio);
+void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio,
+ u64 from);
int f2fs_read_inline_data(struct inode *inode, struct folio *folio);
-int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page);
+int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio);
int f2fs_convert_inline_inode(struct inode *inode);
int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry);
int f2fs_write_inline_data(struct inode *inode, struct folio *folio);
-int f2fs_recover_inline_data(struct inode *inode, struct page *npage);
+int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio);
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
- const struct f2fs_filename *fname,
- struct page **res_page,
- bool use_hash);
+ const struct f2fs_filename *fname, struct folio **res_folio,
+ bool use_hash);
int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent,
- struct page *ipage);
+ struct folio *ifolio);
int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
struct inode *inode, nid_t ino, umode_t mode);
void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry,
- struct page *page, struct inode *dir,
- struct inode *inode);
+ struct folio *folio, struct inode *dir, struct inode *inode);
bool f2fs_empty_inline_dir(struct inode *dir);
int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
struct fscrypt_str *fstr);
@@ -4231,13 +4356,15 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
struct shrink_control *sc);
unsigned long f2fs_shrink_scan(struct shrinker *shrink,
struct shrink_control *sc);
+unsigned int f2fs_donate_files(void);
+void f2fs_reclaim_caches(unsigned int reclaim_caches_kb);
void f2fs_join_shrinker(struct f2fs_sb_info *sbi);
void f2fs_leave_shrinker(struct f2fs_sb_info *sbi);
/*
* extent_cache.c
*/
-bool sanity_check_extent_cache(struct inode *inode, struct page *ipage);
+bool sanity_check_extent_cache(struct inode *inode, struct folio *ifolio);
void f2fs_init_extent_tree(struct inode *inode);
void f2fs_drop_extent_tree(struct inode *inode);
void f2fs_destroy_extent_node(struct inode *inode);
@@ -4247,7 +4374,7 @@ int __init f2fs_create_extent_cache(void);
void f2fs_destroy_extent_cache(void);
/* read extent cache ops */
-void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage);
+void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio);
bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs,
struct extent_info *ei);
bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index,
@@ -4327,20 +4454,20 @@ enum cluster_check_type {
CLUSTER_COMPR_BLKS, /* return # of compressed blocks in a cluster */
CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */
};
-bool f2fs_is_compressed_page(struct page *page);
-struct page *f2fs_compress_control_page(struct page *page);
+bool f2fs_is_compressed_page(struct folio *folio);
+struct folio *f2fs_compress_control_folio(struct folio *folio);
int f2fs_prepare_compress_overwrite(struct inode *inode,
struct page **pagep, pgoff_t index, void **fsdata);
bool f2fs_compress_write_end(struct inode *inode, void *fsdata,
pgoff_t index, unsigned copied);
int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock);
-void f2fs_compress_write_end_io(struct bio *bio, struct page *page);
+void f2fs_compress_write_end_io(struct bio *bio, struct folio *folio);
bool f2fs_is_compress_backend_ready(struct inode *inode);
bool f2fs_is_compress_level_valid(int alg, int lvl);
int __init f2fs_init_compress_mempool(void);
void f2fs_destroy_compress_mempool(void);
void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task);
-void f2fs_end_read_compressed_page(struct page *page, bool failed,
+void f2fs_end_read_compressed_page(struct folio *folio, bool failed,
block_t blkaddr, bool in_task);
bool f2fs_cluster_is_empty(struct compress_ctx *cc);
bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index);
@@ -4363,7 +4490,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
bool in_task);
-void f2fs_put_page_dic(struct page *page, bool in_task);
+void f2fs_put_folio_dic(struct folio *folio, bool in_task);
unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
unsigned int ofs_in_node);
int f2fs_init_compress_ctx(struct compress_ctx *cc);
@@ -4378,9 +4505,7 @@ void f2fs_destroy_compress_cache(void);
struct address_space *COMPRESS_MAPPING(struct f2fs_sb_info *sbi);
void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
block_t blkaddr, unsigned int len);
-void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
- nid_t ino, block_t blkaddr);
-bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
+bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio,
block_t blkaddr);
void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino);
#define inc_compr_inode_stat(inode) \
@@ -4396,7 +4521,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino);
sbi->compr_saved_block += diff; \
} while (0)
#else
-static inline bool f2fs_is_compressed_page(struct page *page) { return false; }
+static inline bool f2fs_is_compressed_page(struct folio *folio) { return false; }
static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
{
if (!f2fs_compressed_file(inode))
@@ -4405,7 +4530,7 @@ static inline bool f2fs_is_compress_backend_ready(struct inode *inode)
return false;
}
static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; }
-static inline struct page *f2fs_compress_control_page(struct page *page)
+static inline struct folio *f2fs_compress_control_folio(struct folio *folio)
{
WARN_ON_ONCE(1);
return ERR_PTR(-EINVAL);
@@ -4414,12 +4539,12 @@ static inline int __init f2fs_init_compress_mempool(void) { return 0; }
static inline void f2fs_destroy_compress_mempool(void) { }
static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic,
bool in_task) { }
-static inline void f2fs_end_read_compressed_page(struct page *page,
+static inline void f2fs_end_read_compressed_page(struct folio *folio,
bool failed, block_t blkaddr, bool in_task)
{
WARN_ON_ONCE(1);
}
-static inline void f2fs_put_page_dic(struct page *page, bool in_task)
+static inline void f2fs_put_folio_dic(struct folio *folio, bool in_task)
{
WARN_ON_ONCE(1);
}
@@ -4434,10 +4559,8 @@ static inline int __init f2fs_init_compress_cache(void) { return 0; }
static inline void f2fs_destroy_compress_cache(void) { }
static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi,
block_t blkaddr, unsigned int len) { }
-static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi,
- struct page *page, nid_t ino, block_t blkaddr) { }
-static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi,
- struct page *page, block_t blkaddr) { return false; }
+static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi,
+ struct folio *folio, block_t blkaddr) { return false; }
static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi,
nid_t ino) { }
#define inc_compr_inode_stat(inode) do { } while (0)
@@ -4527,12 +4650,16 @@ F2FS_FEATURE_FUNCS(readonly, RO);
F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS);
#ifdef CONFIG_BLK_DEV_ZONED
-static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
- block_t blkaddr)
+static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi,
+ unsigned int zone)
{
- unsigned int zno = blkaddr / sbi->blocks_per_blkz;
+ return test_bit(zone, FDEV(devi).blkz_seq);
+}
- return test_bit(zno, FDEV(devi).blkz_seq);
+static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi,
+ block_t blkaddr)
+{
+ return f2fs_zone_is_seq(sbi, devi, blkaddr / sbi->blocks_per_blkz);
}
#endif
@@ -4604,15 +4731,31 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_valid_pinned_area(struct f2fs_sb_info *sbi,
+static inline bool f2fs_is_sequential_zone_area(struct f2fs_sb_info *sbi,
block_t blkaddr)
{
if (f2fs_sb_has_blkzoned(sbi)) {
+#ifdef CONFIG_BLK_DEV_ZONED
int devi = f2fs_target_device_index(sbi, blkaddr);
- return !bdev_is_zoned(FDEV(devi).bdev);
+ if (!bdev_is_zoned(FDEV(devi).bdev))
+ return false;
+
+ if (f2fs_is_multi_device(sbi)) {
+ if (blkaddr < FDEV(devi).start_blk ||
+ blkaddr > FDEV(devi).end_blk) {
+ f2fs_err(sbi, "Invalid block %x", blkaddr);
+ return false;
+ }
+ blkaddr -= FDEV(devi).start_blk;
+ }
+
+ return f2fs_blkz_is_seq(sbi, devi, blkaddr);
+#else
+ return false;
+#endif
}
- return true;
+ return false;
}
static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
@@ -4667,10 +4810,11 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
#ifdef CONFIG_F2FS_FAULT_INJECTION
extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
- unsigned long type);
+ unsigned long type, enum fault_option fo);
#else
static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi,
- unsigned long rate, unsigned long type)
+ unsigned long rate, unsigned long type,
+ enum fault_option fo)
{
return 0;
}
@@ -4700,6 +4844,19 @@ static inline void f2fs_io_schedule_timeout(long timeout)
io_schedule_timeout(timeout);
}
+static inline void f2fs_io_schedule_timeout_killable(long timeout)
+{
+ while (timeout) {
+ if (fatal_signal_pending(current))
+ return;
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
+ if (timeout <= DEFAULT_IO_TIMEOUT)
+ return;
+ timeout -= DEFAULT_IO_TIMEOUT;
+ }
+}
+
static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi,
struct folio *folio, enum page_type type)
{
@@ -4729,13 +4886,13 @@ static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi,
int i = 0;
do {
- struct page *page;
+ struct folio *folio;
- page = find_get_page(META_MAPPING(sbi), blkaddr + i);
- if (page) {
- if (folio_test_writeback(page_folio(page)))
+ folio = filemap_get_folio(META_MAPPING(sbi), blkaddr + i);
+ if (!IS_ERR(folio)) {
+ if (folio_test_writeback(folio))
need_submit = true;
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
}
} while (++i < cnt && !need_submit);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index f92a9fba9991..42faaed6a02d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -35,6 +35,17 @@
#include <trace/events/f2fs.h>
#include <uapi/linux/f2fs.h>
+static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size)
+{
+ loff_t old_size = i_size_read(inode);
+
+ if (old_size >= new_size)
+ return;
+
+ /* zero or drop pages only in range of [old_size, new_size] */
+ truncate_pagecache(inode, old_size);
+}
+
static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
@@ -103,8 +114,13 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT);
+ filemap_invalidate_unlock(inode->i_mapping);
+
file_update_time(vmf->vma->vm_file);
filemap_invalidate_lock_shared(inode->i_mapping);
+
folio_lock(folio);
if (unlikely(folio->mapping != inode->i_mapping ||
folio_pos(folio) > i_size_read(inode) ||
@@ -131,7 +147,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto out_sem;
}
- f2fs_wait_on_page_writeback(folio_page(folio, 0), DATA, false, true);
+ f2fs_folio_wait_writeback(folio, DATA, false, true);
/* wait for GCed page writeback via META_MAPPING */
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
@@ -226,12 +242,13 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
+ struct folio *i = filemap_get_folio(NODE_MAPPING(sbi), ino);
bool ret = false;
/* But we need to avoid that there are some inode updates */
- if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino))
+ if ((!IS_ERR(i) && folio_test_dirty(i)) ||
+ f2fs_need_inode_block_update(sbi, ino))
ret = true;
- f2fs_put_page(i, 0);
+ f2fs_folio_put(i, false);
return ret;
}
@@ -260,7 +277,6 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
- .for_reclaim = 0,
};
unsigned int seq_id = 0;
@@ -403,7 +419,7 @@ static bool __found_offset(struct address_space *mapping,
bool compressed_cluster = false;
if (f2fs_compressed_file(inode)) {
- block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_folio,
ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size));
compressed_cluster = first_blkaddr == COMPRESS_ADDR;
@@ -473,7 +489,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
}
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
@@ -532,8 +548,9 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
return -EINVAL;
}
-static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int f2fs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
@@ -543,7 +560,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return -EOPNOTSUPP;
file_accessed(file);
- vma->vm_ops = &f2fs_file_vm_ops;
+ desc->vm_ops = &f2fs_file_vm_ops;
f2fs_down_read(&F2FS_I(inode)->i_sem);
set_inode_flag(inode, FI_MMAP_FILE);
@@ -554,19 +571,21 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
static int finish_preallocate_blocks(struct inode *inode)
{
- int ret;
+ int ret = 0;
+ bool opened;
- inode_lock(inode);
- if (is_inode_flag_set(inode, FI_OPENED_FILE)) {
- inode_unlock(inode);
+ f2fs_down_read(&F2FS_I(inode)->i_sem);
+ opened = is_inode_flag_set(inode, FI_OPENED_FILE);
+ f2fs_up_read(&F2FS_I(inode)->i_sem);
+ if (opened)
return 0;
- }
- if (!file_should_truncate(inode)) {
- set_inode_flag(inode, FI_OPENED_FILE);
- inode_unlock(inode);
- return 0;
- }
+ inode_lock(inode);
+ if (is_inode_flag_set(inode, FI_OPENED_FILE))
+ goto out_unlock;
+
+ if (!file_should_truncate(inode))
+ goto out_update;
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
@@ -576,16 +595,17 @@ static int finish_preallocate_blocks(struct inode *inode)
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
-
- if (!ret)
- set_inode_flag(inode, FI_OPENED_FILE);
-
- inode_unlock(inode);
if (ret)
- return ret;
+ goto out_unlock;
file_dont_truncate(inode);
- return 0;
+out_update:
+ f2fs_down_write(&F2FS_I(inode)->i_sem);
+ set_inode_flag(inode, FI_OPENED_FILE);
+ f2fs_up_write(&F2FS_I(inode)->i_sem);
+out_unlock:
+ inode_unlock(inode);
+ return ret;
}
static int f2fs_file_open(struct inode *inode, struct file *filp)
@@ -609,7 +629,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
if (err)
return err;
- return finish_preallocate_blocks(inode);
+ err = finish_preallocate_blocks(inode);
+ if (!err)
+ atomic_inc(&F2FS_I(inode)->open_count);
+ return err;
}
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
@@ -624,7 +647,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
block_t blkstart;
int blklen = 0;
- addr = get_dnode_addr(dn->inode, dn->node_page) + ofs;
+ addr = get_dnode_addr(dn->inode, dn->node_folio) + ofs;
blkstart = le32_to_cpu(*addr);
/* Assumption: truncation starts with cluster */
@@ -688,7 +711,7 @@ next:
* once we invalidate valid blkaddr in range [ofs, ofs + count],
* we will invalidate all blkaddr in the whole range.
*/
- fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page),
+ fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_folio),
dn->inode) + ofs;
f2fs_update_read_extent_cache_range(dn, fofs, 0, len);
f2fs_update_age_extent_cache_range(dn, fofs, len);
@@ -707,31 +730,33 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
loff_t offset = from & (PAGE_SIZE - 1);
pgoff_t index = from >> PAGE_SHIFT;
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
if (!offset && !cache_only)
return 0;
if (cache_only) {
- page = find_lock_page(mapping, index);
- if (page && PageUptodate(page))
+ folio = filemap_lock_folio(mapping, index);
+ if (IS_ERR(folio))
+ return 0;
+ if (folio_test_uptodate(folio))
goto truncate_out;
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return 0;
}
- page = f2fs_get_lock_data_page(inode, index, true);
- if (IS_ERR(page))
- return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page);
+ folio = f2fs_get_lock_data_folio(inode, index, true);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio);
truncate_out:
- f2fs_wait_on_page_writeback(page, DATA, true, true);
- zero_user(page, offset, PAGE_SIZE - offset);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
+ folio_zero_segment(folio, offset, folio_size(folio));
/* An encrypted inode should have a key and truncate the last page. */
f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode));
if (!cache_only)
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
return 0;
}
@@ -741,7 +766,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
struct dnode_of_data dn;
pgoff_t free_from;
int count = 0, err = 0;
- struct page *ipage;
+ struct folio *ifolio;
bool truncate_page = false;
trace_f2fs_truncate_blocks_enter(inode, from);
@@ -759,9 +784,9 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
if (lock)
f2fs_lock_op(sbi);
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio)) {
+ err = PTR_ERR(ifolio);
goto out;
}
@@ -774,18 +799,18 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
dec_valid_block_count(sbi, inode, ei.len);
f2fs_update_time(sbi, REQ_TIME);
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
goto out;
}
if (f2fs_has_inline_data(inode)) {
- f2fs_truncate_inline_inode(inode, ipage, from);
- f2fs_put_page(ipage, 1);
+ f2fs_truncate_inline_inode(inode, ifolio, from);
+ f2fs_folio_put(ifolio, true);
truncate_page = true;
goto out;
}
- set_new_dnode(&dn, inode, ipage, NULL, 0);
+ set_new_dnode(&dn, inode, ifolio, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA);
if (err) {
if (err == -ENOENT)
@@ -793,12 +818,12 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock)
goto out;
}
- count = ADDRS_PER_PAGE(dn.node_page, inode);
+ count = ADDRS_PER_PAGE(dn.node_folio, inode);
count -= dn.ofs_in_node;
f2fs_bug_on(sbi, count < 0);
- if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
+ if (dn.ofs_in_node || IS_INODE(dn.node_folio)) {
f2fs_truncate_data_blocks_range(&dn, count);
free_from += count;
}
@@ -1021,11 +1046,24 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
{
struct inode *inode = d_inode(dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
- if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+ if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
+ err = setattr_prepare(idmap, dentry, attr);
+ if (err)
+ return err;
+
+ err = fscrypt_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
+ err = fsverity_prepare_setattr(dentry, attr);
+ if (err)
+ return err;
+
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -1042,20 +1080,19 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
!IS_ALIGNED(attr->ia_size,
F2FS_BLK_TO_BYTES(fi->i_cluster_size)))
return -EINVAL;
+ /*
+ * To prevent scattered pin block generation, we don't allow
+ * smaller/equal size unaligned truncation for pinned file.
+ * We only support overwrite IO to pinned file, so don't
+ * care about larger size truncation.
+ */
+ if (f2fs_is_pinned_file(inode) &&
+ attr->ia_size <= i_size_read(inode) &&
+ !IS_ALIGNED(attr->ia_size,
+ F2FS_BLK_TO_BYTES(CAP_BLKS_PER_SEC(sbi))))
+ return -EINVAL;
}
- err = setattr_prepare(idmap, dentry, attr);
- if (err)
- return err;
-
- err = fscrypt_prepare_setattr(dentry, attr);
- if (err)
- return err;
-
- err = fsverity_prepare_setattr(dentry, attr);
- if (err)
- return err;
-
if (is_quota_modification(idmap, inode, attr)) {
err = f2fs_dquot_initialize(inode);
if (err)
@@ -1063,12 +1100,11 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (i_uid_needs_update(idmap, attr, inode) ||
i_gid_needs_update(idmap, attr, inode)) {
- f2fs_lock_op(F2FS_I_SB(inode));
+ f2fs_lock_op(sbi);
err = dquot_transfer(idmap, inode, attr);
if (err) {
- set_sbi_flag(F2FS_I_SB(inode),
- SBI_QUOTA_NEED_REPAIR);
- f2fs_unlock_op(F2FS_I_SB(inode));
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ f2fs_unlock_op(sbi);
return err;
}
/*
@@ -1078,7 +1114,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
f2fs_mark_inode_dirty_sync(inode, true);
- f2fs_unlock_op(F2FS_I_SB(inode));
+ f2fs_unlock_op(sbi);
}
if (attr->ia_valid & ATTR_SIZE) {
@@ -1104,6 +1140,8 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
f2fs_down_write(&fi->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
+ if (attr->ia_size > old_size)
+ f2fs_zero_post_eof_page(inode, attr->ia_size);
truncate_setsize(inode, attr->ia_size);
if (attr->ia_size <= old_size)
@@ -1139,7 +1177,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
f2fs_mark_inode_dirty_sync(inode, true);
/* inode change will produce dirty node pages flushed by checkpoint */
- f2fs_balance_fs(F2FS_I_SB(inode), true);
+ f2fs_balance_fs(sbi, true);
return err;
}
@@ -1159,7 +1197,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
loff_t start, loff_t len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *page;
+ struct folio *folio;
if (!len)
return 0;
@@ -1167,16 +1205,16 @@ static int fill_zero(struct inode *inode, pgoff_t index,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- page = f2fs_get_new_data_page(inode, NULL, index, false);
+ folio = f2fs_get_new_data_folio(inode, NULL, index, false);
f2fs_unlock_op(sbi);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- f2fs_wait_on_page_writeback(page, DATA, true, true);
- zero_user(page, start, len);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
+ folio_zero_range(folio, start, len);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
return 0;
}
@@ -1199,7 +1237,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
return err;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
@@ -1222,6 +1260,10 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+ filemap_invalidate_unlock(inode->i_mapping);
+
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1294,7 +1336,7 @@ next_dnode:
goto next;
}
- done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
+ done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
*blkaddr = f2fs_data_blkaddr(&dn);
@@ -1383,7 +1425,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
ilen = min((pgoff_t)
- ADDRS_PER_PAGE(dn.node_page, dst_inode) -
+ ADDRS_PER_PAGE(dn.node_folio, dst_inode) -
dn.ofs_in_node, len - i);
do {
dn.data_blkaddr = f2fs_data_blkaddr(&dn);
@@ -1408,26 +1450,26 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
f2fs_put_dnode(&dn);
} else {
- struct page *psrc, *pdst;
+ struct folio *fsrc, *fdst;
- psrc = f2fs_get_lock_data_page(src_inode,
+ fsrc = f2fs_get_lock_data_folio(src_inode,
src + i, true);
- if (IS_ERR(psrc))
- return PTR_ERR(psrc);
- pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i,
+ if (IS_ERR(fsrc))
+ return PTR_ERR(fsrc);
+ fdst = f2fs_get_new_data_folio(dst_inode, NULL, dst + i,
true);
- if (IS_ERR(pdst)) {
- f2fs_put_page(psrc, 1);
- return PTR_ERR(pdst);
+ if (IS_ERR(fdst)) {
+ f2fs_folio_put(fsrc, true);
+ return PTR_ERR(fdst);
}
- f2fs_wait_on_page_writeback(pdst, DATA, true, true);
+ f2fs_folio_wait_writeback(fdst, DATA, true, true);
- memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
- set_page_dirty(pdst);
- set_page_private_gcing(pdst);
- f2fs_put_page(pdst, 1);
- f2fs_put_page(psrc, 1);
+ memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE);
+ folio_mark_dirty(fdst);
+ folio_set_f2fs_gcing(fdst);
+ f2fs_folio_put(fdst, true);
+ f2fs_folio_put(fsrc, true);
ret = f2fs_truncate_hole(src_inode,
src + i, src + i + 1);
@@ -1505,6 +1547,8 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len)
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+
f2fs_lock_op(sbi);
f2fs_drop_extent_tree(inode);
truncate_pagecache(inode, offset);
@@ -1626,6 +1670,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
if (ret)
return ret;
+ filemap_invalidate_lock(mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+ filemap_invalidate_unlock(mapping);
+
pg_start = ((unsigned long long) offset) >> PAGE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT;
@@ -1673,7 +1721,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
goto out;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
end = min(pg_end, end_offset - dn.ofs_in_node + index);
ret = f2fs_do_zero_range(&dn, index, end);
@@ -1757,6 +1805,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
/* avoid gc operation during block exchange */
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(mapping);
+
+ f2fs_zero_post_eof_page(inode, offset + len);
truncate_pagecache(inode, offset);
while (!ret && idx > pg_start) {
@@ -1814,6 +1864,10 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
if (err)
return err;
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, offset + len);
+ filemap_invalidate_unlock(inode->i_mapping);
+
f2fs_balance_fs(sbi, true);
pg_start = ((unsigned long long)offset) >> PAGE_SHIFT;
@@ -1834,18 +1888,31 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset,
map.m_len = sec_blks;
next_alloc:
- if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ?
- ZONED_PIN_SEC_REQUIRED_COUNT :
- GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) {
+ f2fs_down_write(&sbi->pin_sem);
+
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (has_not_enough_free_secs(sbi, 0, 0)) {
+ f2fs_up_write(&sbi->pin_sem);
+ err = -ENOSPC;
+ f2fs_warn_ratelimited(sbi,
+ "ino:%lu, start:%lu, end:%lu, need to trigger GC to "
+ "reclaim enough free segment when checkpoint is enabled",
+ inode->i_ino, pg_start, pg_end);
+ goto out_err;
+ }
+ }
+
+ if (has_not_enough_free_secs(sbi, 0,
+ sbi->reserved_pin_section)) {
f2fs_down_write(&sbi->gc_lock);
stat_inc_gc_call_count(sbi, FOREGROUND);
err = f2fs_gc(sbi, &gc_control);
- if (err && err != -ENODATA)
+ if (err && err != -ENODATA) {
+ f2fs_up_write(&sbi->pin_sem);
goto out_err;
+ }
}
- f2fs_down_write(&sbi->pin_sem);
-
err = f2fs_allocate_pinning_section(sbi);
if (err) {
f2fs_up_write(&sbi->pin_sem);
@@ -1974,6 +2041,9 @@ out:
static int f2fs_release_file(struct inode *inode, struct file *filp)
{
+ if (atomic_dec_and_test(&F2FS_I(inode)->open_count))
+ f2fs_remove_donate_inode(inode);
+
/*
* f2fs_release_file is called at every close calls. So we should
* not drop any inmemory pages by close called by other process.
@@ -2448,6 +2518,61 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
return ret;
}
+static int f2fs_keep_noreuse_range(struct inode *inode,
+ loff_t offset, loff_t len)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode));
+ u64 start, end;
+ int ret = 0;
+
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
+ if (offset >= max_bytes || len > max_bytes ||
+ (offset + len) > max_bytes)
+ return 0;
+
+ start = offset >> PAGE_SHIFT;
+ end = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+
+ inode_lock(inode);
+ if (f2fs_is_atomic_file(inode)) {
+ inode_unlock(inode);
+ return 0;
+ }
+
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ /* let's remove the range, if len = 0 */
+ if (!len) {
+ if (!list_empty(&F2FS_I(inode)->gdonate_list)) {
+ list_del_init(&F2FS_I(inode)->gdonate_list);
+ sbi->donate_files--;
+ if (is_inode_flag_set(inode, FI_DONATE_FINISHED))
+ ret = -EALREADY;
+ else
+ set_inode_flag(inode, FI_DONATE_FINISHED);
+ } else
+ ret = -ENOENT;
+ } else {
+ if (list_empty(&F2FS_I(inode)->gdonate_list)) {
+ list_add_tail(&F2FS_I(inode)->gdonate_list,
+ &sbi->inode_list[DONATE_INODE]);
+ sbi->donate_files++;
+ } else {
+ list_move_tail(&F2FS_I(inode)->gdonate_list,
+ &sbi->inode_list[DONATE_INODE]);
+ }
+ F2FS_I(inode)->donate_start = start;
+ F2FS_I(inode)->donate_end = end - 1;
+ clear_inode_flag(inode, FI_DONATE_FINISHED);
+ }
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+ inode_unlock(inode);
+
+ return ret;
+}
+
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -2858,19 +2983,19 @@ do_map:
idx = map.m_lblk;
while (idx < map.m_lblk + map.m_len &&
cnt < BLKS_PER_SEG(sbi)) {
- struct page *page;
+ struct folio *folio;
- page = f2fs_get_lock_data_page(inode, idx, true);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_get_lock_data_folio(inode, idx, true);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto clear_out;
}
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
- set_page_dirty(page);
- set_page_private_gcing(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ folio_set_f2fs_gcing(folio);
+ f2fs_folio_put(folio, true);
idx++;
cnt++;
@@ -3282,7 +3407,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid)
}
#endif
-int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int f2fs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -3306,7 +3431,7 @@ int f2fs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int f2fs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 fsflags = fa->flags, mask = F2FS_SETTABLE_FS_FL;
@@ -3446,6 +3571,23 @@ static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg)
(u32 __user *)arg);
}
+static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u32 level;
+
+ if (get_user(level, (__u32 __user *)arg))
+ return -EFAULT;
+
+ if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX)
+ return -EINVAL;
+
+ inode_lock(inode);
+ F2FS_I(inode)->ioprio_hint = level;
+ inode_unlock(inode);
+ return 0;
+}
+
int f2fs_precache_extents(struct inode *inode)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -3632,7 +3774,7 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
int i;
for (i = 0; i < count; i++) {
- blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ blkaddr = data_blkaddr(dn->inode, dn->node_folio,
dn->ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
@@ -3750,7 +3892,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg)
break;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
count = round_up(count, fi->i_cluster_size);
@@ -3801,7 +3943,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
int i;
for (i = 0; i < count; i++) {
- blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ blkaddr = data_blkaddr(dn->inode, dn->node_folio,
dn->ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
@@ -3818,7 +3960,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
int ret;
for (i = 0; i < cluster_size; i++) {
- blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ blkaddr = data_blkaddr(dn->inode, dn->node_folio,
dn->ofs_in_node + i);
if (i == 0) {
@@ -3928,7 +4070,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
break;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
count = round_up(count, fi->i_cluster_size);
@@ -4092,7 +4234,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
goto out;
}
- end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
+ end_offset = ADDRS_PER_PAGE(dn.node_folio, inode);
count = min(end_offset - dn.ofs_in_node, pg_end - index);
for (i = 0; i < count; i++, index++, dn.ofs_in_node++) {
struct block_device *cur_bdev;
@@ -4264,34 +4406,36 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
{
DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx);
struct address_space *mapping = inode->i_mapping;
- struct page *page;
+ struct folio *folio;
pgoff_t redirty_idx = page_idx;
- int i, page_len = 0, ret = 0;
+ int page_len = 0, ret = 0;
page_cache_ra_unbounded(&ractl, len, 0);
- for (i = 0; i < len; i++, page_idx++) {
- page = read_cache_page(mapping, page_idx, NULL, NULL);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
+ do {
+ folio = read_cache_folio(mapping, page_idx, NULL, NULL);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
break;
}
- page_len++;
- }
+ page_len += folio_nr_pages(folio) - (page_idx - folio->index);
+ page_idx = folio_next_index(folio);
+ } while (page_len < len);
- for (i = 0; i < page_len; i++, redirty_idx++) {
- page = find_lock_page(mapping, redirty_idx);
+ do {
+ folio = filemap_lock_folio(mapping, redirty_idx);
- /* It will never fail, when page has pinned above */
- f2fs_bug_on(F2FS_I_SB(inode), !page);
+ /* It will never fail, when folio has pinned above */
+ f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(folio));
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
- set_page_dirty(page);
- set_page_private_gcing(page);
- f2fs_put_page(page, 1);
- f2fs_put_page(page, 0);
- }
+ folio_mark_dirty(folio);
+ folio_set_f2fs_gcing(folio);
+ redirty_idx = folio_next_index(folio);
+ folio_unlock(folio);
+ folio_put_refs(folio, 2);
+ } while (redirty_idx < page_idx);
return ret;
}
@@ -4547,6 +4691,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_compress_file(filp);
case F2FS_IOC_GET_DEV_ALIAS_FILE:
return f2fs_ioc_get_dev_alias_file(filp, arg);
+ case F2FS_IOC_IO_PRIO:
+ return f2fs_ioc_io_prio(filp, arg);
default:
return -ENOTTY;
}
@@ -4695,6 +4841,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
const loff_t pos = iocb->ki_pos;
ssize_t ret;
+ bool dio;
if (!f2fs_is_compress_backend_ready(inode))
return -EOPNOTSUPP;
@@ -4703,12 +4850,15 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
f2fs_trace_rw_file_path(iocb->ki_filp, iocb->ki_pos,
iov_iter_count(to), READ);
+ dio = f2fs_should_use_dio(inode, iocb, to);
+
/* In LFS mode, if there is inflight dio, wait for its completion */
if (f2fs_lfs_mode(F2FS_I_SB(inode)) &&
- get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE))
+ get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE) &&
+ (!f2fs_is_pinned_file(inode) || !dio))
inode_dio_wait(inode);
- if (f2fs_should_use_dio(inode, iocb, to)) {
+ if (dio) {
ret = f2fs_dio_read_iter(iocb, to);
} else {
ret = filemap_read(iocb, to, 0);
@@ -4716,8 +4866,7 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_READ_IO, ret);
}
- if (trace_f2fs_dataread_end_enabled())
- trace_f2fs_dataread_end(inode, pos, ret);
+ trace_f2fs_dataread_end(inode, pos, ret);
return ret;
}
@@ -4740,8 +4889,7 @@ static ssize_t f2fs_file_splice_read(struct file *in, loff_t *ppos,
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_READ_IO, ret);
- if (trace_f2fs_dataread_end_enabled())
- trace_f2fs_dataread_end(inode, pos, ret);
+ trace_f2fs_dataread_end(inode, pos, ret);
return ret;
}
@@ -4765,6 +4913,10 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from)
err = file_modified(file);
if (err)
return err;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from));
+ filemap_invalidate_unlock(inode->i_mapping);
return count;
}
@@ -5082,8 +5234,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
f2fs_dio_write_iter(iocb, from, &may_need_sync) :
f2fs_buffered_write_iter(iocb, from);
- if (trace_f2fs_datawrite_end_enabled())
- trace_f2fs_datawrite_end(inode, orig_pos, ret);
+ trace_f2fs_datawrite_end(inode, orig_pos, ret);
}
/* Don't leave any preallocated blocks around past i_size. */
@@ -5147,11 +5298,15 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len,
}
err = generic_fadvise(filp, offset, len, advice);
- if (!err && advice == POSIX_FADV_DONTNEED &&
- test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) &&
- f2fs_compressed_file(inode))
- f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino);
+ if (err)
+ return err;
+ if (advice == POSIX_FADV_DONTNEED &&
+ (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) &&
+ f2fs_compressed_file(inode)))
+ f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino);
+ else if (advice == POSIX_FADV_NOREUSE)
+ err = f2fs_keep_noreuse_range(inode, offset, len);
return err;
}
@@ -5261,6 +5416,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case F2FS_IOC_DECOMPRESS_FILE:
case F2FS_IOC_COMPRESS_FILE:
case F2FS_IOC_GET_DEV_ALIAS_FILE:
+ case F2FS_IOC_IO_PRIO:
break;
default:
return -ENOIOCTLCMD;
@@ -5276,7 +5432,7 @@ const struct file_operations f2fs_file_operations = {
.iopoll = iocb_bio_iopoll,
.open = f2fs_file_open,
.release = f2fs_release_file,
- .mmap = f2fs_file_mmap,
+ .mmap_prepare = f2fs_file_mmap_prepare,
.flush = f2fs_file_flush,
.fsync = f2fs_sync_file,
.fallocate = f2fs_fallocate,
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index faf9fa1c804d..098e9f71421e 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -141,10 +141,10 @@ do_gc:
FOREGROUND : BACKGROUND);
sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) ||
- gc_control.one_time;
+ (gc_control.one_time && gc_th->boost_gc_greedy);
/* foreground GC was been triggered via f2fs_balance_fs() */
- if (foreground)
+ if (foreground && !f2fs_sb_has_blkzoned(sbi))
sync_mode = false;
gc_control.init_gc_type = sync_mode ? FG_GC : BG_GC;
@@ -197,6 +197,8 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
gc_th->valid_thresh_ratio = DEF_GC_THREAD_VALID_THRESH_RATIO;
+ gc_th->boost_gc_multiple = BOOST_GC_MULTIPLE;
+ gc_th->boost_gc_greedy = GC_GREEDY;
if (f2fs_sb_has_blkzoned(sbi)) {
gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME_ZONED;
@@ -278,12 +280,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- if (p->alloc_mode == SSR) {
- p->gc_mode = GC_GREEDY;
- p->dirty_bitmap = dirty_i->dirty_segmap[type];
- p->max_search = dirty_i->nr_dirty[type];
- p->ofs_unit = 1;
- } else if (p->alloc_mode == AT_SSR) {
+ if (p->alloc_mode == SSR || p->alloc_mode == AT_SSR) {
p->gc_mode = GC_GREEDY;
p->dirty_bitmap = dirty_i->dirty_segmap[type];
p->max_search = dirty_i->nr_dirty[type];
@@ -389,14 +386,15 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
}
static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
- unsigned int segno, struct victim_sel_policy *p)
+ unsigned int segno, struct victim_sel_policy *p,
+ unsigned int valid_thresh_ratio)
{
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
- if (p->one_time_gc && (get_valid_blocks(sbi, segno, true) >=
- CAP_BLKS_PER_SEC(sbi) * sbi->gc_thread->valid_thresh_ratio /
- 100))
+ if (p->one_time_gc && (valid_thresh_ratio < 100) &&
+ (get_valid_blocks(sbi, segno, true) >=
+ CAP_BLKS_PER_SEC(sbi) * valid_thresh_ratio / 100))
return UINT_MAX;
/* alloc_mode == LFS */
@@ -777,6 +775,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
unsigned int secno, last_victim;
unsigned int last_segment;
unsigned int nsearched;
+ unsigned int valid_thresh_ratio = 100;
bool is_atgc;
int ret = 0;
@@ -786,7 +785,11 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
p.alloc_mode = alloc_mode;
p.age = age;
p.age_threshold = sbi->am.age_threshold;
- p.one_time_gc = one_time;
+ if (one_time) {
+ p.one_time_gc = one_time;
+ if (has_enough_free_secs(sbi, 0, NR_PERSISTENT_LOG))
+ valid_thresh_ratio = sbi->gc_thread->valid_thresh_ratio;
+ }
retry:
select_policy(sbi, gc_type, type, &p);
@@ -912,7 +915,7 @@ retry:
goto next;
}
- cost = get_gc_cost(sbi, segno, &p);
+ cost = get_gc_cost(sbi, segno, &p, valid_thresh_ratio);
if (p.min_cost > cost) {
p.min_segno = segno;
@@ -1045,7 +1048,7 @@ next_step:
for (off = 0; off < usable_blks_in_seg; off++, entry++) {
nid_t nid = le32_to_cpu(entry->nid);
- struct page *node_page;
+ struct folio *node_folio;
struct node_info ni;
int err;
@@ -1068,27 +1071,27 @@ next_step:
}
/* phase == 2 */
- node_page = f2fs_get_node_page(sbi, nid);
- if (IS_ERR(node_page))
+ node_folio = f2fs_get_node_folio(sbi, nid);
+ if (IS_ERR(node_folio))
continue;
- /* block may become invalid during f2fs_get_node_page */
+ /* block may become invalid during f2fs_get_node_folio */
if (check_valid_map(sbi, segno, off) == 0) {
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
continue;
}
if (f2fs_get_node_info(sbi, nid, &ni, false)) {
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
continue;
}
if (ni.blk_addr != start_addr + off) {
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
continue;
}
- err = f2fs_move_node_page(node_page, gc_type);
+ err = f2fs_move_node_folio(node_folio, gc_type);
if (!err && gc_type == FG_GC)
submitted++;
stat_inc_node_blk_count(sbi, 1, gc_type);
@@ -1134,7 +1137,7 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
struct node_info *dni, block_t blkaddr, unsigned int *nofs)
{
- struct page *node_page;
+ struct folio *node_folio;
nid_t nid;
unsigned int ofs_in_node, max_addrs, base;
block_t source_blkaddr;
@@ -1142,12 +1145,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
nid = le32_to_cpu(sum->nid);
ofs_in_node = le16_to_cpu(sum->ofs_in_node);
- node_page = f2fs_get_node_page(sbi, nid);
- if (IS_ERR(node_page))
+ node_folio = f2fs_get_node_folio(sbi, nid);
+ if (IS_ERR(node_folio))
return false;
if (f2fs_get_node_info(sbi, nid, dni, false)) {
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
return false;
}
@@ -1158,12 +1161,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
if (f2fs_check_nid_range(sbi, dni->ino)) {
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
return false;
}
- if (IS_INODE(node_page)) {
- base = offset_in_addr(F2FS_INODE(node_page));
+ if (IS_INODE(node_folio)) {
+ base = offset_in_addr(F2FS_INODE(node_folio));
max_addrs = DEF_ADDRS_PER_INODE;
} else {
base = 0;
@@ -1173,13 +1176,13 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (base + ofs_in_node >= max_addrs) {
f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u",
base, ofs_in_node, max_addrs, dni->ino, dni->nid);
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
return false;
}
- *nofs = ofs_of_node(node_page);
- source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
- f2fs_put_page(node_page, 1);
+ *nofs = ofs_of_node(node_folio);
+ source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node);
+ f2fs_folio_put(node_folio, true);
if (source_blkaddr != blkaddr) {
#ifdef CONFIG_F2FS_CHECK_FS
@@ -1205,7 +1208,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
struct address_space *mapping = f2fs_is_cow_file(inode) ?
F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping;
struct dnode_of_data dn;
- struct page *page;
+ struct folio *folio;
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
@@ -1218,16 +1221,16 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
};
int err;
- page = f2fs_grab_cache_page(mapping, index, true);
- if (!page)
- return -ENOMEM;
+ folio = f2fs_grab_cache_folio(mapping, index, true);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (f2fs_lookup_read_extent_cache_block(inode, index,
&dn.data_blkaddr)) {
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE_READ))) {
err = -EFSCORRUPTED;
- goto put_page;
+ goto put_folio;
}
goto got_it;
}
@@ -1235,28 +1238,28 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
if (err)
- goto put_page;
+ goto put_folio;
f2fs_put_dnode(&dn);
if (!__is_valid_data_blkaddr(dn.data_blkaddr)) {
err = -ENOENT;
- goto put_page;
+ goto put_folio;
}
if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
DATA_GENERIC_ENHANCE))) {
err = -EFSCORRUPTED;
- goto put_page;
+ goto put_folio;
}
got_it:
- /* read page */
- fio.page = page;
+ /* read folio */
+ fio.folio = folio;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
/*
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
@@ -1265,14 +1268,14 @@ got_it:
FGP_LOCK | FGP_CREAT, GFP_NOFS);
if (!fio.encrypted_page) {
err = -ENOMEM;
- goto put_page;
+ goto put_folio;
}
err = f2fs_submit_page_bio(&fio);
if (err)
goto put_encrypted_page;
f2fs_put_page(fio.encrypted_page, 0);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE);
f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE);
@@ -1280,8 +1283,8 @@ got_it:
return 0;
put_encrypted_page:
f2fs_put_page(fio.encrypted_page, 1);
-put_page:
- f2fs_put_page(page, 1);
+put_folio:
+ f2fs_folio_put(folio, true);
return err;
}
@@ -1307,7 +1310,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
struct dnode_of_data dn;
struct f2fs_summary sum;
struct node_info ni;
- struct page *page, *mpage;
+ struct folio *folio, *mfolio;
block_t newaddr;
int err = 0;
bool lfs_mode = f2fs_lfs_mode(fio.sbi);
@@ -1316,9 +1319,9 @@ static int move_data_block(struct inode *inode, block_t bidx,
CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;
/* do not read out */
- page = f2fs_grab_cache_page(mapping, bidx, false);
- if (!page)
- return -ENOMEM;
+ folio = f2fs_grab_cache_folio(mapping, bidx, false);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
err = -ENOENT;
@@ -1335,7 +1338,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto out;
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
err = -ENOENT;
goto put_out;
}
@@ -1344,7 +1347,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
* don't cache encrypted data into meta inode until previous dirty
* data were writebacked to avoid racing between GC and flush.
*/
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
@@ -1353,26 +1356,26 @@ static int move_data_block(struct inode *inode, block_t bidx,
goto put_out;
/* read page */
- fio.page = page;
+ fio.folio = folio;
fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
if (lfs_mode)
f2fs_down_write(&fio.sbi->io_order_lock);
- mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
+ mfolio = f2fs_grab_cache_folio(META_MAPPING(fio.sbi),
fio.old_blkaddr, false);
- if (!mpage) {
- err = -ENOMEM;
+ if (IS_ERR(mfolio)) {
+ err = PTR_ERR(mfolio);
goto up_out;
}
- fio.encrypted_page = mpage;
+ fio.encrypted_page = folio_file_page(mfolio, fio.old_blkaddr);
- /* read source block in mpage */
- if (!PageUptodate(mpage)) {
+ /* read source block in mfolio */
+ if (!folio_test_uptodate(mfolio)) {
err = f2fs_submit_page_bio(&fio);
if (err) {
- f2fs_put_page(mpage, 1);
+ f2fs_folio_put(mfolio, true);
goto up_out;
}
@@ -1381,11 +1384,11 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO,
F2FS_BLKSIZE);
- lock_page(mpage);
- if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
- !PageUptodate(mpage))) {
+ folio_lock(mfolio);
+ if (unlikely(!is_meta_folio(mfolio) ||
+ !folio_test_uptodate(mfolio))) {
err = -EIO;
- f2fs_put_page(mpage, 1);
+ f2fs_folio_put(mfolio, true);
goto up_out;
}
}
@@ -1396,7 +1399,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
&sum, type, NULL);
if (err) {
- f2fs_put_page(mpage, 1);
+ f2fs_folio_put(mfolio, true);
/* filesystem should shutdown, no need to recovery block */
goto up_out;
}
@@ -1405,15 +1408,15 @@ static int move_data_block(struct inode *inode, block_t bidx,
newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
if (!fio.encrypted_page) {
err = -ENOMEM;
- f2fs_put_page(mpage, 1);
+ f2fs_folio_put(mfolio, true);
goto recover_block;
}
/* write target block */
f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
memcpy(page_address(fio.encrypted_page),
- page_address(mpage), PAGE_SIZE);
- f2fs_put_page(mpage, 1);
+ folio_address(mfolio), PAGE_SIZE);
+ f2fs_folio_put(mfolio, true);
f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr, 1);
@@ -1444,19 +1447,19 @@ up_out:
put_out:
f2fs_put_dnode(&dn);
out:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
- unsigned int segno, int off)
+ unsigned int segno, int off)
{
- struct page *page;
+ struct folio *folio;
int err = 0;
- page = f2fs_get_lock_data_page(inode, bidx, true);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = f2fs_get_lock_data_folio(inode, bidx, true);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
err = -ENOENT;
@@ -1468,12 +1471,12 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
goto out;
if (gc_type == BG_GC) {
- if (folio_test_writeback(page_folio(page))) {
+ if (folio_test_writeback(folio)) {
err = -EAGAIN;
goto out;
}
- set_page_dirty(page);
- set_page_private_gcing(page);
+ folio_mark_dirty(folio);
+ folio_set_f2fs_gcing(folio);
} else {
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(inode),
@@ -1483,37 +1486,37 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC,
.old_blkaddr = NULL_ADDR,
- .page = page,
+ .folio = folio,
.encrypted_page = NULL,
.need_lock = LOCK_REQ,
.io_type = FS_GC_DATA_IO,
};
- bool is_dirty = PageDirty(page);
+ bool is_dirty = folio_test_dirty(folio);
retry:
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
- set_page_dirty(page);
- if (clear_page_dirty_for_io(page)) {
+ folio_mark_dirty(folio);
+ if (folio_clear_dirty_for_io(folio)) {
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
}
- set_page_private_gcing(page);
+ folio_set_f2fs_gcing(folio);
err = f2fs_do_write_data_page(&fio);
if (err) {
- clear_page_private_gcing(page);
+ folio_clear_f2fs_gcing(folio);
if (err == -ENOMEM) {
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
if (is_dirty)
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
}
out:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
@@ -1542,7 +1545,6 @@ next_step:
entry = sum;
for (off = 0; off < usable_blks_in_seg; off++, entry++) {
- struct page *data_page;
struct inode *inode;
struct node_info dni; /* dnode info for the data */
unsigned int ofs_in_node, nofs;
@@ -1585,6 +1587,7 @@ next_step:
ofs_in_node = le16_to_cpu(entry->ofs_in_node);
if (phase == 3) {
+ struct folio *data_folio;
int err;
inode = f2fs_iget(sb, dni.ino);
@@ -1635,15 +1638,15 @@ next_step:
continue;
}
- data_page = f2fs_get_read_data_page(inode, start_bidx,
+ data_folio = f2fs_get_read_data_folio(inode, start_bidx,
REQ_RAHEAD, true, NULL);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
- if (IS_ERR(data_page)) {
+ if (IS_ERR(data_folio)) {
iput(inode);
continue;
}
- f2fs_put_page(data_page, 0);
+ f2fs_folio_put(data_folio, false);
add_gc_inode(gc_list, inode);
continue;
}
@@ -1718,8 +1721,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
struct gc_inode_list *gc_list, int gc_type,
bool force_migrate, bool one_time)
{
- struct page *sum_page;
- struct f2fs_summary_block *sum;
struct blk_plug plug;
unsigned int segno = start_segno;
unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi);
@@ -1751,7 +1752,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
!has_enough_free_blocks(sbi,
sbi->gc_thread->boost_zoned_gc_percent))
window_granularity *=
- BOOST_GC_MULTIPLE;
+ sbi->gc_thread->boost_gc_multiple;
end_segno = start_segno + window_granularity;
}
@@ -1769,40 +1770,40 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
/* reference all summary page */
while (segno < end_segno) {
- sum_page = f2fs_get_sum_page(sbi, segno++);
- if (IS_ERR(sum_page)) {
- int err = PTR_ERR(sum_page);
+ struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno++);
+ if (IS_ERR(sum_folio)) {
+ int err = PTR_ERR(sum_folio);
end_segno = segno - 1;
for (segno = start_segno; segno < end_segno; segno++) {
- sum_page = find_get_page(META_MAPPING(sbi),
+ sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
- f2fs_put_page(sum_page, 0);
- f2fs_put_page(sum_page, 0);
+ folio_put_refs(sum_folio, 2);
}
return err;
}
- unlock_page(sum_page);
+ folio_unlock(sum_folio);
}
blk_start_plug(&plug);
for (segno = start_segno; segno < end_segno; segno++) {
+ struct f2fs_summary_block *sum;
/* find segment summary of victim */
- sum_page = find_get_page(META_MAPPING(sbi),
+ struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi),
GET_SUM_BLOCK(sbi, segno));
- f2fs_put_page(sum_page, 0);
if (get_valid_blocks(sbi, segno, false) == 0)
goto freed;
if (gc_type == BG_GC && __is_large_section(sbi) &&
migrated >= sbi->migration_granularity)
goto skip;
- if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
+ if (!folio_test_uptodate(sum_folio) ||
+ unlikely(f2fs_cp_error(sbi)))
goto skip;
- sum = page_address(sum_page);
+ sum = folio_address(sum_folio);
if (type != GET_SUM_TYPE((&sum->footer))) {
f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
segno, type, GET_SUM_TYPE((&sum->footer)));
@@ -1840,7 +1841,7 @@ freed:
(segno + 1 < sec_end_segno) ?
segno + 1 : NULL_SEGNO;
skip:
- f2fs_put_page(sum_page, 0);
+ folio_put_refs(sum_folio, 2);
}
if (submitted)
@@ -1893,6 +1894,7 @@ gc_more:
/* Let's run FG_GC, if we don't have enough space. */
if (has_not_enough_free_secs(sbi, 0, 0)) {
gc_type = FG_GC;
+ gc_control->one_time = false;
/*
* For example, if there are many prefree_segments below given
@@ -2066,6 +2068,9 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi,
.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
};
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno)))
+ continue;
+
do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false);
put_gc_inode(&gc_list);
@@ -2271,12 +2276,12 @@ out_drop_write:
if (err)
return err;
- err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ err = freeze_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
if (err)
return err;
if (f2fs_readonly(sbi->sb)) {
- err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ err = thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
if (err)
return err;
return -EROFS;
@@ -2333,6 +2338,6 @@ recover_out:
out_err:
f2fs_up_write(&sbi->cp_global_sem);
f2fs_up_write(&sbi->gc_lock);
- thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE);
+ thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
return err;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5c1eaf55e127..24e8b1c27acc 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -68,6 +68,8 @@ struct f2fs_gc_kthread {
unsigned int no_zoned_gc_percent;
unsigned int boost_zoned_gc_percent;
unsigned int valid_thresh_ratio;
+ unsigned int boost_gc_multiple;
+ unsigned int boost_gc_greedy;
};
struct gc_inode_list {
@@ -194,6 +196,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
static inline bool need_to_boost_gc(struct f2fs_sb_info *sbi)
{
if (f2fs_sb_has_blkzoned(sbi))
- return !has_enough_free_blocks(sbi, LIMIT_BOOST_ZONED_GC);
+ return !has_enough_free_blocks(sbi,
+ sbi->gc_thread->boost_zoned_gc_percent);
return has_enough_invalid_blocks(sbi);
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3e3c35d4c98b..58ac831ef704 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -33,9 +33,9 @@ bool f2fs_may_inline_data(struct inode *inode)
return !f2fs_post_read_required(inode);
}
-static bool inode_has_blocks(struct inode *inode, struct page *ipage)
+static bool inode_has_blocks(struct inode *inode, struct folio *ifolio)
{
- struct f2fs_inode *ri = F2FS_INODE(ipage);
+ struct f2fs_inode *ri = F2FS_INODE(ifolio);
int i;
if (F2FS_HAS_BLOCKS(inode))
@@ -48,12 +48,12 @@ static bool inode_has_blocks(struct inode *inode, struct page *ipage)
return false;
}
-bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage)
+bool f2fs_sanity_check_inline_data(struct inode *inode, struct folio *ifolio)
{
if (!f2fs_has_inline_data(inode))
return false;
- if (inode_has_blocks(inode, ipage))
+ if (inode_has_blocks(inode, ifolio))
return false;
if (!support_inline_data(inode))
@@ -79,37 +79,37 @@ bool f2fs_may_inline_dentry(struct inode *inode)
return true;
}
-void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage)
+void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio)
{
struct inode *inode = folio->mapping->host;
if (folio_test_uptodate(folio))
return;
- f2fs_bug_on(F2FS_I_SB(inode), folio_index(folio));
+ f2fs_bug_on(F2FS_I_SB(inode), folio->index);
folio_zero_segment(folio, MAX_INLINE_DATA(inode), folio_size(folio));
/* Copy the whole inline data block */
- memcpy_to_folio(folio, 0, inline_data_addr(inode, ipage),
+ memcpy_to_folio(folio, 0, inline_data_addr(inode, ifolio),
MAX_INLINE_DATA(inode));
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
}
-void f2fs_truncate_inline_inode(struct inode *inode,
- struct page *ipage, u64 from)
+void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio,
+ u64 from)
{
void *addr;
if (from >= MAX_INLINE_DATA(inode))
return;
- addr = inline_data_addr(inode, ipage);
+ addr = inline_data_addr(inode, ifolio);
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
memset(addr + from, 0, MAX_INLINE_DATA(inode) - from);
- set_page_dirty(ipage);
+ folio_mark_dirty(ifolio);
if (from == 0)
clear_inode_flag(inode, FI_DATA_EXIST);
@@ -117,32 +117,32 @@ void f2fs_truncate_inline_inode(struct inode *inode,
int f2fs_read_inline_data(struct inode *inode, struct folio *folio)
{
- struct page *ipage;
+ struct folio *ifolio;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage)) {
+ ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ifolio)) {
folio_unlock(folio);
- return PTR_ERR(ipage);
+ return PTR_ERR(ifolio);
}
if (!f2fs_has_inline_data(inode)) {
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
return -EAGAIN;
}
- if (folio_index(folio))
+ if (folio->index)
folio_zero_segment(folio, 0, folio_size(folio));
else
- f2fs_do_read_inline_data(folio, ipage);
+ f2fs_do_read_inline_data(folio, ifolio);
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
folio_unlock(folio);
return 0;
}
-int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
+int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio)
{
struct f2fs_io_info fio = {
.sbi = F2FS_I_SB(dn->inode),
@@ -150,7 +150,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
.type = DATA,
.op = REQ_OP_WRITE,
.op_flags = REQ_SYNC | REQ_PRIO,
- .page = page,
+ .folio = folio,
.encrypted_page = NULL,
.io_type = FS_DATA_IO,
};
@@ -182,20 +182,20 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
return -EFSCORRUPTED;
}
- f2fs_bug_on(F2FS_P_SB(page), folio_test_writeback(page_folio(page)));
+ f2fs_bug_on(F2FS_F_SB(folio), folio_test_writeback(folio));
- f2fs_do_read_inline_data(page_folio(page), dn->inode_page);
- set_page_dirty(page);
+ f2fs_do_read_inline_data(folio, dn->inode_folio);
+ folio_mark_dirty(folio);
/* clear dirty state */
- dirty = clear_page_dirty_for_io(page);
+ dirty = folio_clear_dirty_for_io(folio);
/* write data page to try to make data consistent */
- set_page_writeback(page);
+ folio_start_writeback(folio);
fio.old_blkaddr = dn->data_blkaddr;
set_inode_flag(dn->inode, FI_HOT_DATA);
f2fs_outplace_write_data(dn, &fio);
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
if (dirty) {
inode_dec_dirty_pages(dn->inode);
f2fs_remove_dirty_inode(dn->inode);
@@ -205,8 +205,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
set_inode_flag(dn->inode, FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
- f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0);
- clear_page_private_inline(dn->inode_page);
+ f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0);
+ folio_clear_f2fs_inline(dn->inode_folio);
clear_out:
stat_dec_inline_inode(dn->inode);
clear_inode_flag(dn->inode, FI_INLINE_DATA);
@@ -218,7 +218,7 @@ int f2fs_convert_inline_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- struct page *ipage, *page;
+ struct folio *ifolio, *folio;
int err = 0;
if (f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb))
@@ -231,28 +231,28 @@ int f2fs_convert_inline_inode(struct inode *inode)
if (err)
return err;
- page = f2fs_grab_cache_page(inode->i_mapping, 0, false);
- if (!page)
- return -ENOMEM;
+ folio = f2fs_grab_cache_folio(inode->i_mapping, 0, false);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
f2fs_lock_op(sbi);
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio)) {
+ err = PTR_ERR(ifolio);
goto out;
}
- set_new_dnode(&dn, inode, ipage, ipage, 0);
+ set_new_dnode(&dn, inode, ifolio, ifolio, 0);
if (f2fs_has_inline_data(inode))
- err = f2fs_convert_inline_page(&dn, page);
+ err = f2fs_convert_inline_folio(&dn, folio);
f2fs_put_dnode(&dn);
out:
f2fs_unlock_op(sbi);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
if (!err)
f2fs_balance_fs(sbi, dn.node_changed);
@@ -263,40 +263,39 @@ out:
int f2fs_write_inline_data(struct inode *inode, struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *ipage;
+ struct folio *ifolio;
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
if (!f2fs_has_inline_data(inode)) {
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
return -EAGAIN;
}
f2fs_bug_on(F2FS_I_SB(inode), folio->index);
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
- memcpy_from_folio(inline_data_addr(inode, ipage),
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
+ memcpy_from_folio(inline_data_addr(inode, ifolio),
folio, 0, MAX_INLINE_DATA(inode));
- set_page_dirty(ipage);
+ folio_mark_dirty(ifolio);
f2fs_clear_page_cache_dirty_tag(folio);
set_inode_flag(inode, FI_APPEND_WRITE);
set_inode_flag(inode, FI_DATA_EXIST);
- clear_page_private_inline(ipage);
- f2fs_put_page(ipage, 1);
+ folio_clear_f2fs_inline(ifolio);
+ f2fs_folio_put(ifolio, 1);
return 0;
}
-int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
+int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode *ri = NULL;
void *src_addr, *dst_addr;
- struct page *ipage;
/*
* The inline_data recovery policy is as follows.
@@ -306,38 +305,39 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage)
* x o -> remove data blocks, and then recover inline_data
* x x -> recover data blocks
*/
- if (IS_INODE(npage))
- ri = F2FS_INODE(npage);
+ if (IS_INODE(nfolio))
+ ri = F2FS_INODE(nfolio);
if (f2fs_has_inline_data(inode) &&
ri && (ri->i_inline & F2FS_INLINE_DATA)) {
+ struct folio *ifolio;
process_inline:
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
- src_addr = inline_data_addr(inode, npage);
- dst_addr = inline_data_addr(inode, ipage);
+ src_addr = inline_data_addr(inode, nfolio);
+ dst_addr = inline_data_addr(inode, ifolio);
memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
set_inode_flag(inode, FI_INLINE_DATA);
set_inode_flag(inode, FI_DATA_EXIST);
- set_page_dirty(ipage);
- f2fs_put_page(ipage, 1);
+ folio_mark_dirty(ifolio);
+ f2fs_folio_put(ifolio, true);
return 1;
}
if (f2fs_has_inline_data(inode)) {
- ipage = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
- f2fs_truncate_inline_inode(inode, ipage, 0);
+ struct folio *ifolio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
+ f2fs_truncate_inline_inode(inode, ifolio, 0);
stat_dec_inline_inode(inode);
clear_inode_flag(inode, FI_INLINE_DATA);
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
int ret;
@@ -352,50 +352,50 @@ process_inline:
struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir,
const struct f2fs_filename *fname,
- struct page **res_page,
+ struct folio **res_folio,
bool use_hash)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct f2fs_dir_entry *de;
struct f2fs_dentry_ptr d;
- struct page *ipage;
+ struct folio *ifolio;
void *inline_dentry;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage)) {
- *res_page = ipage;
+ ifolio = f2fs_get_inode_folio(sbi, dir->i_ino);
+ if (IS_ERR(ifolio)) {
+ *res_folio = ifolio;
return NULL;
}
- inline_dentry = inline_data_addr(dir, ipage);
+ inline_dentry = inline_data_addr(dir, ifolio);
make_dentry_ptr_inline(dir, &d, inline_dentry);
de = f2fs_find_target_dentry(&d, fname, NULL, use_hash);
- unlock_page(ipage);
+ folio_unlock(ifolio);
if (IS_ERR(de)) {
- *res_page = ERR_CAST(de);
+ *res_folio = ERR_CAST(de);
de = NULL;
}
if (de)
- *res_page = ipage;
+ *res_folio = ifolio;
else
- f2fs_put_page(ipage, 0);
+ f2fs_folio_put(ifolio, false);
return de;
}
int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent,
- struct page *ipage)
+ struct folio *ifolio)
{
struct f2fs_dentry_ptr d;
void *inline_dentry;
- inline_dentry = inline_data_addr(inode, ipage);
+ inline_dentry = inline_data_addr(inode, ifolio);
make_dentry_ptr_inline(inode, &d, inline_dentry);
f2fs_do_make_empty_dir(inode, parent, &d);
- set_page_dirty(ipage);
+ folio_mark_dirty(ifolio);
/* update i_size to MAX_INLINE_DATA */
if (i_size_read(inode) < MAX_INLINE_DATA(inode))
@@ -407,39 +407,39 @@ int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent,
* NOTE: ipage is grabbed by caller, but if any error occurs, we should
* release ipage in this function.
*/
-static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
+static int f2fs_move_inline_dirents(struct inode *dir, struct folio *ifolio,
void *inline_dentry)
{
- struct page *page;
+ struct folio *folio;
struct dnode_of_data dn;
struct f2fs_dentry_block *dentry_blk;
struct f2fs_dentry_ptr src, dst;
int err;
- page = f2fs_grab_cache_page(dir->i_mapping, 0, true);
- if (!page) {
- f2fs_put_page(ipage, 1);
- return -ENOMEM;
+ folio = f2fs_grab_cache_folio(dir->i_mapping, 0, true);
+ if (IS_ERR(folio)) {
+ f2fs_folio_put(ifolio, true);
+ return PTR_ERR(folio);
}
- set_new_dnode(&dn, dir, ipage, NULL, 0);
+ set_new_dnode(&dn, dir, ifolio, NULL, 0);
err = f2fs_reserve_block(&dn, 0);
if (err)
goto out;
if (unlikely(dn.data_blkaddr != NEW_ADDR)) {
f2fs_put_dnode(&dn);
- set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
- f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
+ set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK);
+ f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.",
__func__, dir->i_ino, dn.data_blkaddr);
- f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR);
+ f2fs_handle_error(F2FS_F_SB(folio), ERROR_INVALID_BLKADDR);
err = -EFSCORRUPTED;
goto out;
}
- f2fs_wait_on_page_writeback(page, DATA, true, true);
+ f2fs_folio_wait_writeback(folio, DATA, true, true);
- dentry_blk = page_address(page);
+ dentry_blk = folio_address(folio);
/*
* Start by zeroing the full block, to ensure that all unused space is
@@ -455,12 +455,12 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max);
memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN);
- if (!PageUptodate(page))
- SetPageUptodate(page);
- set_page_dirty(page);
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ folio_mark_dirty(folio);
/* clear inline dir and flag after data writeback */
- f2fs_truncate_inline_inode(dir, ipage, 0);
+ f2fs_truncate_inline_inode(dir, ifolio, 0);
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
@@ -477,7 +477,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
if (i_size_read(dir) < PAGE_SIZE)
f2fs_i_size_write(dir, PAGE_SIZE);
out:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
@@ -533,7 +533,7 @@ punch_dentry_pages:
return err;
}
-static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
+static int f2fs_move_rehashed_dirents(struct inode *dir, struct folio *ifolio,
void *inline_dentry)
{
void *backup_dentry;
@@ -542,20 +542,20 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir),
MAX_INLINE_DATA(dir), GFP_F2FS_ZERO);
if (!backup_dentry) {
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
return -ENOMEM;
}
memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir));
- f2fs_truncate_inline_inode(dir, ipage, 0);
+ f2fs_truncate_inline_inode(dir, ifolio, 0);
- unlock_page(ipage);
+ folio_unlock(ifolio);
err = f2fs_add_inline_entries(dir, backup_dentry);
if (err)
goto recover;
- lock_page(ipage);
+ folio_lock(ifolio);
stat_dec_inline_dir(dir);
clear_inode_flag(dir, FI_INLINE_DENTRY);
@@ -571,31 +571,31 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage,
kfree(backup_dentry);
return 0;
recover:
- lock_page(ipage);
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ folio_lock(ifolio);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir));
f2fs_i_depth_write(dir, 0);
f2fs_i_size_write(dir, MAX_INLINE_DATA(dir));
- set_page_dirty(ipage);
- f2fs_put_page(ipage, 1);
+ folio_mark_dirty(ifolio);
+ f2fs_folio_put(ifolio, 1);
kfree(backup_dentry);
return err;
}
-static int do_convert_inline_dir(struct inode *dir, struct page *ipage,
+static int do_convert_inline_dir(struct inode *dir, struct folio *ifolio,
void *inline_dentry)
{
if (!F2FS_I(dir)->i_dir_level)
- return f2fs_move_inline_dirents(dir, ipage, inline_dentry);
+ return f2fs_move_inline_dirents(dir, ifolio, inline_dentry);
else
- return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry);
+ return f2fs_move_rehashed_dirents(dir, ifolio, inline_dentry);
}
int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct page *ipage;
+ struct folio *ifolio;
struct f2fs_filename fname;
void *inline_dentry = NULL;
int err = 0;
@@ -609,22 +609,22 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, dir->i_ino);
+ if (IS_ERR(ifolio)) {
+ err = PTR_ERR(ifolio);
goto out_fname;
}
- if (f2fs_has_enough_room(dir, ipage, &fname)) {
- f2fs_put_page(ipage, 1);
+ if (f2fs_has_enough_room(dir, ifolio, &fname)) {
+ f2fs_folio_put(ifolio, true);
goto out_fname;
}
- inline_dentry = inline_data_addr(dir, ipage);
+ inline_dentry = inline_data_addr(dir, ifolio);
- err = do_convert_inline_dir(dir, ipage, inline_dentry);
+ err = do_convert_inline_dir(dir, ifolio, inline_dentry);
if (!err)
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
out_fname:
f2fs_free_filename(&fname);
out:
@@ -636,24 +636,24 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
struct inode *inode, nid_t ino, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct page *ipage;
+ struct folio *ifolio;
unsigned int bit_pos;
void *inline_dentry = NULL;
struct f2fs_dentry_ptr d;
int slots = GET_DENTRY_SLOTS(fname->disk_name.len);
- struct page *page = NULL;
+ struct folio *folio = NULL;
int err = 0;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(sbi, dir->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
- inline_dentry = inline_data_addr(dir, ipage);
+ inline_dentry = inline_data_addr(dir, ifolio);
make_dentry_ptr_inline(dir, &d, inline_dentry);
bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max);
if (bit_pos >= d.max) {
- err = do_convert_inline_dir(dir, ipage, inline_dentry);
+ err = do_convert_inline_dir(dir, ifolio, inline_dentry);
if (err)
return err;
err = -EAGAIN;
@@ -663,19 +663,19 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
if (inode) {
f2fs_down_write_nested(&F2FS_I(inode)->i_sem,
SINGLE_DEPTH_NESTING);
- page = f2fs_init_inode_metadata(inode, dir, fname, ipage);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_init_inode_metadata(inode, dir, fname, ifolio);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto fail;
}
}
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash,
bit_pos);
- set_page_dirty(ipage);
+ folio_mark_dirty(ifolio);
/* we don't need to mark_inode_dirty now */
if (inode) {
@@ -683,9 +683,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname,
/* synchronize inode page's data from inode cache */
if (is_inode_flag_set(inode, FI_NEW_INODE))
- f2fs_update_inode(inode, page);
+ f2fs_update_inode(inode, folio);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
}
f2fs_update_parent_metadata(dir, inode, 0);
@@ -693,12 +693,12 @@ fail:
if (inode)
f2fs_up_write(&F2FS_I(inode)->i_sem);
out:
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
return err;
}
-void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
- struct inode *dir, struct inode *inode)
+void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry,
+ struct folio *folio, struct inode *dir, struct inode *inode)
{
struct f2fs_dentry_ptr d;
void *inline_dentry;
@@ -706,18 +706,18 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
unsigned int bit_pos;
int i;
- lock_page(page);
- f2fs_wait_on_page_writeback(page, NODE, true, true);
+ folio_lock(folio);
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
- inline_dentry = inline_data_addr(dir, page);
+ inline_dentry = inline_data_addr(dir, folio);
make_dentry_ptr_inline(dir, &d, inline_dentry);
bit_pos = dentry - d.dentry;
for (i = 0; i < slots; i++)
__clear_bit_le(bit_pos + i, d.bitmap);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
f2fs_mark_inode_dirty_sync(dir, false);
@@ -729,21 +729,21 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page,
bool f2fs_empty_inline_dir(struct inode *dir)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
- struct page *ipage;
+ struct folio *ifolio;
unsigned int bit_pos = 2;
void *inline_dentry;
struct f2fs_dentry_ptr d;
- ipage = f2fs_get_node_page(sbi, dir->i_ino);
- if (IS_ERR(ipage))
+ ifolio = f2fs_get_inode_folio(sbi, dir->i_ino);
+ if (IS_ERR(ifolio))
return false;
- inline_dentry = inline_data_addr(dir, ipage);
+ inline_dentry = inline_data_addr(dir, ifolio);
make_dentry_ptr_inline(dir, &d, inline_dentry);
bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos);
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
if (bit_pos < d.max)
return false;
@@ -755,7 +755,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
struct fscrypt_str *fstr)
{
struct inode *inode = file_inode(file);
- struct page *ipage = NULL;
+ struct folio *ifolio = NULL;
struct f2fs_dentry_ptr d;
void *inline_dentry = NULL;
int err;
@@ -765,17 +765,17 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (ctx->pos == d.max)
return 0;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
/*
* f2fs_readdir was protected by inode.i_rwsem, it is safe to access
* ipage without page's lock held.
*/
- unlock_page(ipage);
+ folio_unlock(ifolio);
- inline_dentry = inline_data_addr(inode, ipage);
+ inline_dentry = inline_data_addr(inode, ifolio);
make_dentry_ptr_inline(inode, &d, inline_dentry);
@@ -783,7 +783,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
if (!err)
ctx->pos = d.max;
- f2fs_put_page(ipage, 0);
+ f2fs_folio_put(ifolio, false);
return err < 0 ? err : 0;
}
@@ -794,12 +794,12 @@ int f2fs_inline_data_fiemap(struct inode *inode,
__u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
FIEMAP_EXTENT_LAST;
struct node_info ni;
- struct page *ipage;
+ struct folio *ifolio;
int err = 0;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
if ((S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!f2fs_has_inline_data(inode)) {
@@ -824,11 +824,11 @@ int f2fs_inline_data_fiemap(struct inode *inode,
goto out;
byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
- byteaddr += (char *)inline_data_addr(inode, ipage) -
- (char *)F2FS_INODE(ipage);
+ byteaddr += (char *)inline_data_addr(inode, ifolio) -
+ (char *)F2FS_INODE(ifolio);
err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err);
out:
- f2fs_put_page(ipage, 1);
+ f2fs_folio_put(ifolio, true);
return err;
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 3dd25f64d6f1..8c4eafe9ffac 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -34,10 +34,10 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
if (f2fs_inode_dirtied(inode, sync))
return;
- if (f2fs_is_atomic_file(inode)) {
- set_inode_flag(inode, FI_ATOMIC_DIRTIED);
+ /* only atomic file w/ FI_ATOMIC_COMMITTED can be set vfs dirty */
+ if (f2fs_is_atomic_file(inode) &&
+ !is_inode_flag_set(inode, FI_ATOMIC_COMMITTED))
return;
- }
mark_inode_dirty_sync(inode);
}
@@ -68,9 +68,9 @@ void f2fs_set_inode_flags(struct inode *inode)
S_ENCRYPTED|S_VERITY|S_CASEFOLD);
}
-static void __get_inode_rdev(struct inode *inode, struct page *node_page)
+static void __get_inode_rdev(struct inode *inode, struct folio *node_folio)
{
- __le32 *addr = get_dnode_addr(inode, node_page);
+ __le32 *addr = get_dnode_addr(inode, node_folio);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -81,9 +81,9 @@ static void __get_inode_rdev(struct inode *inode, struct page *node_page)
}
}
-static void __set_inode_rdev(struct inode *inode, struct page *node_page)
+static void __set_inode_rdev(struct inode *inode, struct folio *node_folio)
{
- __le32 *addr = get_dnode_addr(inode, node_page);
+ __le32 *addr = get_dnode_addr(inode, node_folio);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
if (old_valid_dev(inode->i_rdev)) {
@@ -97,33 +97,34 @@ static void __set_inode_rdev(struct inode *inode, struct page *node_page)
}
}
-static void __recover_inline_status(struct inode *inode, struct page *ipage)
+static void __recover_inline_status(struct inode *inode, struct folio *ifolio)
{
- void *inline_data = inline_data_addr(inode, ipage);
+ void *inline_data = inline_data_addr(inode, ifolio);
__le32 *start = inline_data;
__le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32);
while (start < end) {
if (*start++) {
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
set_inode_flag(inode, FI_DATA_EXIST);
- set_raw_inline(inode, F2FS_INODE(ipage));
- set_page_dirty(ipage);
+ set_raw_inline(inode, F2FS_INODE(ifolio));
+ folio_mark_dirty(ifolio);
return;
}
}
return;
}
-static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+static
+bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio)
{
- struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+ struct f2fs_inode *ri = &F2FS_NODE(folio)->i;
if (!f2fs_sb_has_inode_chksum(sbi))
return false;
- if (!IS_INODE(page) || !(ri->i_inline & F2FS_EXTRA_ATTR))
+ if (!IS_INODE(folio) || !(ri->i_inline & F2FS_EXTRA_ATTR))
return false;
if (!F2FS_FITS_IN_INODE(ri, le16_to_cpu(ri->i_extra_isize),
@@ -133,9 +134,9 @@ static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page
return true;
}
-static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
+static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct folio *folio)
{
- struct f2fs_node *node = F2FS_NODE(page);
+ struct f2fs_node *node = F2FS_NODE(folio);
struct f2fs_inode *ri = &node->i;
__le32 ino = node->footer.ino;
__le32 gen = ri->i_generation;
@@ -144,19 +145,18 @@ static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page)
unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum);
unsigned int cs_size = sizeof(dummy_cs);
- chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino,
- sizeof(ino));
- chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen));
+ chksum = f2fs_chksum(sbi->s_chksum_seed, (__u8 *)&ino, sizeof(ino));
+ chksum_seed = f2fs_chksum(chksum, (__u8 *)&gen, sizeof(gen));
- chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset);
- chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size);
+ chksum = f2fs_chksum(chksum_seed, (__u8 *)ri, offset);
+ chksum = f2fs_chksum(chksum, (__u8 *)&dummy_cs, cs_size);
offset += cs_size;
- chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset,
- F2FS_BLKSIZE - offset);
+ chksum = f2fs_chksum(chksum, (__u8 *)ri + offset,
+ F2FS_BLKSIZE - offset);
return chksum;
}
-bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
+bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio)
{
struct f2fs_inode *ri;
__u32 provided, calculated;
@@ -165,34 +165,34 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page)
return true;
#ifdef CONFIG_F2FS_CHECK_FS
- if (!f2fs_enable_inode_chksum(sbi, page))
+ if (!f2fs_enable_inode_chksum(sbi, folio))
#else
- if (!f2fs_enable_inode_chksum(sbi, page) ||
- PageDirty(page) ||
- folio_test_writeback(page_folio(page)))
+ if (!f2fs_enable_inode_chksum(sbi, folio) ||
+ folio_test_dirty(folio) ||
+ folio_test_writeback(folio))
#endif
return true;
- ri = &F2FS_NODE(page)->i;
+ ri = &F2FS_NODE(folio)->i;
provided = le32_to_cpu(ri->i_inode_checksum);
- calculated = f2fs_inode_chksum(sbi, page);
+ calculated = f2fs_inode_chksum(sbi, folio);
if (provided != calculated)
f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. %x",
- page_folio(page)->index, ino_of_node(page),
+ folio->index, ino_of_node(folio),
provided, calculated);
return provided == calculated;
}
-void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page)
+void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct folio *folio)
{
- struct f2fs_inode *ri = &F2FS_NODE(page)->i;
+ struct f2fs_inode *ri = &F2FS_NODE(folio)->i;
- if (!f2fs_enable_inode_chksum(sbi, page))
+ if (!f2fs_enable_inode_chksum(sbi, folio))
return;
- ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page));
+ ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, folio));
}
static bool sanity_check_compress_inode(struct inode *inode,
@@ -267,24 +267,30 @@ err_level:
return false;
}
-static bool sanity_check_inode(struct inode *inode, struct page *node_page)
+static bool sanity_check_inode(struct inode *inode, struct folio *node_folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct f2fs_inode *ri = F2FS_INODE(node_page);
+ struct f2fs_inode *ri = F2FS_INODE(node_folio);
unsigned long long iblocks;
- iblocks = le64_to_cpu(F2FS_INODE(node_page)->i_blocks);
+ iblocks = le64_to_cpu(F2FS_INODE(node_folio)->i_blocks);
if (!iblocks) {
f2fs_warn(sbi, "%s: corrupted inode i_blocks i_ino=%lx iblocks=%llu, run fsck to fix.",
__func__, inode->i_ino, iblocks);
return false;
}
- if (ino_of_node(node_page) != nid_of_node(node_page)) {
+ if (ino_of_node(node_folio) != nid_of_node(node_folio)) {
f2fs_warn(sbi, "%s: corrupted inode footer i_ino=%lx, ino,nid: [%u, %u] run fsck to fix.",
__func__, inode->i_ino,
- ino_of_node(node_page), nid_of_node(node_page));
+ ino_of_node(node_folio), nid_of_node(node_folio));
+ return false;
+ }
+
+ if (ino_of_node(node_folio) == fi->i_xattr_nid) {
+ f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.",
+ __func__, inode->i_ino, fi->i_xattr_nid);
return false;
}
@@ -349,7 +355,7 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
}
}
- if (f2fs_sanity_check_inline_data(inode, node_page)) {
+ if (f2fs_sanity_check_inline_data(inode, node_folio)) {
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
return false;
@@ -402,7 +408,7 @@ static int do_read_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_inode_info *fi = F2FS_I(inode);
- struct page *node_page;
+ struct folio *node_folio;
struct f2fs_inode *ri;
projid_t i_projid;
@@ -410,11 +416,11 @@ static int do_read_inode(struct inode *inode)
if (f2fs_check_nid_range(sbi, inode->i_ino))
return -EINVAL;
- node_page = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(node_page))
- return PTR_ERR(node_page);
+ node_folio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(node_folio))
+ return PTR_ERR(node_folio);
- ri = F2FS_INODE(node_page);
+ ri = F2FS_INODE(node_folio);
inode->i_mode = le16_to_cpu(ri->i_mode);
i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -464,8 +470,8 @@ static int do_read_inode(struct inode *inode)
fi->i_inline_xattr_size = 0;
}
- if (!sanity_check_inode(inode, node_page)) {
- f2fs_put_page(node_page, 1);
+ if (!sanity_check_inode(inode, node_folio)) {
+ f2fs_folio_put(node_folio, true);
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
@@ -473,17 +479,17 @@ static int do_read_inode(struct inode *inode)
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
- __recover_inline_status(inode, node_page);
+ __recover_inline_status(inode, node_folio);
/* try to recover cold bit for non-dir inode */
- if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) {
- f2fs_wait_on_page_writeback(node_page, NODE, true, true);
- set_cold_node(node_page, false);
- set_page_dirty(node_page);
+ if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_folio)) {
+ f2fs_folio_wait_writeback(node_folio, NODE, true, true);
+ set_cold_node(node_folio, false);
+ folio_mark_dirty(node_folio);
}
/* get rdev by using inline_info */
- __get_inode_rdev(inode, node_page);
+ __get_inode_rdev(inode, node_folio);
if (!f2fs_need_inode_block_update(sbi, inode->i_ino))
fi->last_disk_size = inode->i_size;
@@ -526,17 +532,17 @@ static int do_read_inode(struct inode *inode)
init_idisk_time(inode);
- if (!sanity_check_extent_cache(inode, node_page)) {
- f2fs_put_page(node_page, 1);
+ if (!sanity_check_extent_cache(inode, node_folio)) {
+ f2fs_folio_put(node_folio, true);
f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE);
return -EFSCORRUPTED;
}
/* Need all the flag bits */
- f2fs_init_read_extent_tree(inode, node_page);
+ f2fs_init_read_extent_tree(inode, node_folio);
f2fs_init_age_extent_tree(inode);
- f2fs_put_page(node_page, 1);
+ f2fs_folio_put(node_folio, true);
stat_inc_inline_xattr(inode);
stat_inc_inline_inode(inode);
@@ -653,18 +659,18 @@ retry:
return inode;
}
-void f2fs_update_inode(struct inode *inode, struct page *node_page)
+void f2fs_update_inode(struct inode *inode, struct folio *node_folio)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_inode *ri;
struct extent_tree *et = fi->extent_tree[EX_READ];
- f2fs_wait_on_page_writeback(node_page, NODE, true, true);
- set_page_dirty(node_page);
+ f2fs_folio_wait_writeback(node_folio, NODE, true, true);
+ folio_mark_dirty(node_folio);
f2fs_inode_synced(inode);
- ri = F2FS_INODE(node_page);
+ ri = F2FS_INODE(node_folio);
ri->i_mode = cpu_to_le16(inode->i_mode);
ri->i_advise = fi->i_advise;
@@ -739,39 +745,43 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
}
}
- __set_inode_rdev(inode, node_page);
+ __set_inode_rdev(inode, node_folio);
/* deleted inode */
if (inode->i_nlink == 0)
- clear_page_private_inline(node_page);
+ folio_clear_f2fs_inline(node_folio);
init_idisk_time(inode);
#ifdef CONFIG_F2FS_CHECK_FS
- f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page);
+ f2fs_inode_chksum_set(F2FS_I_SB(inode), node_folio);
#endif
}
void f2fs_update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *node_page;
+ struct folio *node_folio;
int count = 0;
retry:
- node_page = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(node_page)) {
- int err = PTR_ERR(node_page);
+ node_folio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(node_folio)) {
+ int err = PTR_ERR(node_folio);
/* The node block was truncated. */
if (err == -ENOENT)
return;
+ if (err == -EFSCORRUPTED)
+ goto stop_checkpoint;
+
if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT)
goto retry;
+stop_checkpoint:
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE);
return;
}
- f2fs_update_inode(inode, node_page);
- f2fs_put_page(node_page, 1);
+ f2fs_update_inode(inode, node_folio);
+ f2fs_folio_put(node_folio, true);
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -789,6 +799,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
!is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
+ /*
+ * no need to update inode page, ultimately f2fs_evict_inode() will
+ * clear dirty status of inode.
+ */
+ if (f2fs_cp_error(sbi))
+ return -EIO;
+
if (!f2fs_is_checkpoint_ready(sbi)) {
f2fs_mark_inode_dirty_sync(inode, true);
return -ENOSPC;
@@ -804,6 +821,19 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
}
+void f2fs_remove_donate_inode(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+
+ if (list_empty(&F2FS_I(inode)->gdonate_list))
+ return;
+
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ list_del_init(&F2FS_I(inode)->gdonate_list);
+ sbi->donate_files--;
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+}
+
/*
* Called at the last iput() if i_nlink is zero
*/
@@ -838,6 +868,7 @@ void f2fs_evict_inode(struct inode *inode)
f2fs_bug_on(sbi, get_dirty_pages(inode));
f2fs_remove_dirty_inode(inode);
+ f2fs_remove_donate_inode(inode);
if (!IS_DEVICE_ALIASING(inode))
f2fs_destroy_extent_tree(inode);
@@ -903,6 +934,19 @@ retry:
f2fs_update_inode_page(inode);
if (dquot_initialize_needed(inode))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+
+ /*
+ * If both f2fs_truncate() and f2fs_update_inode_page() failed
+ * due to fuzzed corrupted inode, call f2fs_inode_synced() to
+ * avoid triggering later f2fs_bug_on().
+ */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ f2fs_warn(sbi,
+ "f2fs_evict_inode: inode is dirty, ino:%lu",
+ inode->i_ino);
+ f2fs_inode_synced(inode);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
}
if (freeze_protected)
sb_end_intwrite(inode->i_sb);
@@ -919,8 +963,12 @@ no_delete:
if (likely(!f2fs_cp_error(sbi) &&
!is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
- else
- f2fs_inode_synced(inode);
+
+ /*
+ * anyway, it needs to remove the inode from sbi->inode_list[DIRTY_META]
+ * list to avoid UAF in f2fs_sync_inode_meta() during checkpoint.
+ */
+ f2fs_inode_synced(inode);
/* for the case f2fs_new_inode() was failed, .i_ino is zero, skip it */
if (inode->i_ino)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a278c7da8177..b882771e4699 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -414,7 +414,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
if (is_inode_flag_set(dir, FI_PROJ_INHERIT) &&
(!projid_eq(F2FS_I(dir)->i_projid,
- F2FS_I(old_dentry->d_inode)->i_projid)))
+ F2FS_I(inode)->i_projid)))
return -EXDEV;
err = f2fs_dquot_initialize(dir);
@@ -447,12 +447,12 @@ out:
struct dentry *f2fs_get_parent(struct dentry *child)
{
- struct page *page;
- unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page);
+ struct folio *folio;
+ unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &folio);
if (!ino) {
- if (IS_ERR(page))
- return ERR_CAST(page);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
return ERR_PTR(-ENOENT);
}
return d_obtain_alias(f2fs_iget(child->d_sb, ino));
@@ -463,7 +463,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
{
struct inode *inode = NULL;
struct f2fs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
struct dentry *new;
nid_t ino = -1;
int err = 0;
@@ -481,12 +481,12 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out_splice;
if (err)
goto out;
- de = __f2fs_find_entry(dir, &fname, &page);
+ de = __f2fs_find_entry(dir, &fname, &folio);
f2fs_free_filename(&fname);
if (!de) {
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
goto out;
}
err = -ENOENT;
@@ -494,7 +494,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
}
ino = le32_to_cpu(de->ino);
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
inode = f2fs_iget(dir->i_sb, ino);
if (IS_ERR(inode)) {
@@ -502,6 +502,14 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ if (inode->i_nlink == 0) {
+ f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink",
+ __func__, inode->i_ino);
+ err = -EFSCORRUPTED;
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ goto out_iput;
+ }
+
if (IS_ENCRYPTED(dir) &&
(S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
!fscrypt_has_permitted_context(dir, inode)) {
@@ -537,7 +545,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode = d_inode(dentry);
struct f2fs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
int err;
trace_f2fs_unlink_enter(dir, dentry);
@@ -554,10 +562,19 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (err)
goto fail;
- de = f2fs_find_entry(dir, &dentry->d_name, &page);
+ de = f2fs_find_entry(dir, &dentry->d_name, &folio);
if (!de) {
- if (IS_ERR(page))
- err = PTR_ERR(page);
+ if (IS_ERR(folio))
+ err = PTR_ERR(folio);
+ goto fail;
+ }
+
+ if (unlikely(inode->i_nlink == 0)) {
+ f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink",
+ __func__, inode->i_ino);
+ err = -EFSCORRUPTED;
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ f2fs_folio_put(folio, false);
goto fail;
}
@@ -567,10 +584,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
err = f2fs_acquire_orphan_inode(sbi);
if (err) {
f2fs_unlock_op(sbi);
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
goto fail;
}
- f2fs_delete_entry(de, page, dir, inode);
+ f2fs_delete_entry(de, folio, dir, inode);
f2fs_unlock_op(sbi);
/* VFS negative dentries are incompatible with Encoding and
@@ -684,23 +701,23 @@ out_free_encrypted_link:
return err;
}
-static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct inode *inode;
int err;
if (unlikely(f2fs_cp_error(sbi)))
- return -EIO;
+ return ERR_PTR(-EIO);
err = f2fs_dquot_initialize(dir);
if (err)
- return err;
+ return ERR_PTR(err);
inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
@@ -722,12 +739,12 @@ static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
f2fs_sync_fs(sbi->sb, 1);
f2fs_balance_fs(sbi, true);
- return 0;
+ return NULL;
out_fail:
clear_inode_flag(inode, FI_INC_LINK);
f2fs_handle_failed_inode(inode);
- return err;
+ return ERR_PTR(err);
}
static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -891,8 +908,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
struct inode *whiteout = NULL;
- struct page *old_dir_page = NULL;
- struct page *old_page, *new_page = NULL;
+ struct folio *old_dir_folio = NULL;
+ struct folio *old_folio, *new_folio = NULL;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
@@ -906,7 +923,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
(!projid_eq(F2FS_I(new_dir)->i_projid,
- F2FS_I(old_dentry->d_inode)->i_projid)))
+ F2FS_I(old_inode)->i_projid)))
return -EXDEV;
/*
@@ -951,18 +968,18 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
err = -ENOENT;
- old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_entry) {
- if (IS_ERR(old_page))
- err = PTR_ERR(old_page);
+ if (IS_ERR(old_folio))
+ err = PTR_ERR(old_folio);
goto out;
}
if (old_is_dir && old_dir != new_dir) {
- old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
+ old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_folio);
if (!old_dir_entry) {
- if (IS_ERR(old_dir_page))
- err = PTR_ERR(old_dir_page);
+ if (IS_ERR(old_dir_folio))
+ err = PTR_ERR(old_dir_folio);
goto out_old;
}
}
@@ -975,10 +992,10 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
- &new_page);
+ &new_folio);
if (!new_entry) {
- if (IS_ERR(new_page))
- err = PTR_ERR(new_page);
+ if (IS_ERR(new_folio))
+ err = PTR_ERR(new_folio);
goto out_dir;
}
@@ -990,8 +1007,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (err)
goto put_out_dir;
- f2fs_set_link(new_dir, new_entry, new_page, old_inode);
- new_page = NULL;
+ f2fs_set_link(new_dir, new_entry, new_folio, old_inode);
+ new_folio = NULL;
inode_set_ctime_current(new_inode);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
@@ -1030,8 +1047,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_set_ctime_current(old_inode);
f2fs_mark_inode_dirty_sync(old_inode, false);
- f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
- old_page = NULL;
+ f2fs_delete_entry(old_entry, old_folio, old_dir, NULL);
+ old_folio = NULL;
if (whiteout) {
set_inode_flag(whiteout, FI_INC_LINK);
@@ -1047,7 +1064,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
if (old_dir_entry)
- f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+ f2fs_set_link(old_inode, old_dir_entry, old_dir_folio, new_dir);
if (old_is_dir)
f2fs_i_links_write(old_dir, false);
@@ -1068,12 +1085,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
put_out_dir:
f2fs_unlock_op(sbi);
- f2fs_put_page(new_page, 0);
+ f2fs_folio_put(new_folio, false);
out_dir:
if (old_dir_entry)
- f2fs_put_page(old_dir_page, 0);
+ f2fs_folio_put(old_dir_folio, false);
out_old:
- f2fs_put_page(old_page, 0);
+ f2fs_folio_put(old_folio, false);
out:
iput(whiteout);
return err;
@@ -1085,8 +1102,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct page *old_dir_page, *new_dir_page;
- struct page *old_page, *new_page;
+ struct folio *old_dir_folio, *new_dir_folio;
+ struct folio *old_folio, *new_folio;
struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
struct f2fs_dir_entry *old_entry, *new_entry;
int old_nlink = 0, new_nlink = 0;
@@ -1099,10 +1116,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
!projid_eq(F2FS_I(new_dir)->i_projid,
- F2FS_I(old_dentry->d_inode)->i_projid)) ||
- (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) &&
+ F2FS_I(old_inode)->i_projid)) ||
+ (is_inode_flag_set(old_dir, FI_PROJ_INHERIT) &&
!projid_eq(F2FS_I(old_dir)->i_projid,
- F2FS_I(new_dentry->d_inode)->i_projid)))
+ F2FS_I(new_inode)->i_projid)))
return -EXDEV;
err = f2fs_dquot_initialize(old_dir);
@@ -1114,17 +1131,17 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out;
err = -ENOENT;
- old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_entry) {
- if (IS_ERR(old_page))
- err = PTR_ERR(old_page);
+ if (IS_ERR(old_folio))
+ err = PTR_ERR(old_folio);
goto out;
}
- new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+ new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_entry) {
- if (IS_ERR(new_page))
- err = PTR_ERR(new_page);
+ if (IS_ERR(new_folio))
+ err = PTR_ERR(new_folio);
goto out_old;
}
@@ -1132,20 +1149,20 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir) {
if (S_ISDIR(old_inode->i_mode)) {
old_dir_entry = f2fs_parent_dir(old_inode,
- &old_dir_page);
+ &old_dir_folio);
if (!old_dir_entry) {
- if (IS_ERR(old_dir_page))
- err = PTR_ERR(old_dir_page);
+ if (IS_ERR(old_dir_folio))
+ err = PTR_ERR(old_dir_folio);
goto out_new;
}
}
if (S_ISDIR(new_inode->i_mode)) {
new_dir_entry = f2fs_parent_dir(new_inode,
- &new_dir_page);
+ &new_dir_folio);
if (!new_dir_entry) {
- if (IS_ERR(new_dir_page))
- err = PTR_ERR(new_dir_page);
+ if (IS_ERR(new_dir_folio))
+ err = PTR_ERR(new_dir_folio);
goto out_old_dir;
}
}
@@ -1172,14 +1189,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
/* update ".." directory entry info of old dentry */
if (old_dir_entry)
- f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+ f2fs_set_link(old_inode, old_dir_entry, old_dir_folio, new_dir);
/* update ".." directory entry info of new dentry */
if (new_dir_entry)
- f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir);
+ f2fs_set_link(new_inode, new_dir_entry, new_dir_folio, old_dir);
/* update directory entry info of old dir inode */
- f2fs_set_link(old_dir, old_entry, old_page, new_inode);
+ f2fs_set_link(old_dir, old_entry, old_folio, new_inode);
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
if (!old_dir_entry)
@@ -1198,7 +1215,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_mark_inode_dirty_sync(old_dir, false);
/* update directory entry info of new dir inode */
- f2fs_set_link(new_dir, new_entry, new_page, old_inode);
+ f2fs_set_link(new_dir, new_entry, new_folio, old_inode);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
if (!new_dir_entry)
@@ -1230,16 +1247,16 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
return 0;
out_new_dir:
if (new_dir_entry) {
- f2fs_put_page(new_dir_page, 0);
+ f2fs_folio_put(new_dir_folio, 0);
}
out_old_dir:
if (old_dir_entry) {
- f2fs_put_page(old_dir_page, 0);
+ f2fs_folio_put(old_dir_folio, 0);
}
out_new:
- f2fs_put_page(new_page, 0);
+ f2fs_folio_put(new_folio, false);
out_old:
- f2fs_put_page(old_page, 0);
+ f2fs_folio_put(old_folio, false);
out:
return err;
}
@@ -1281,19 +1298,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page;
+ struct folio *folio;
const char *target;
if (!dentry)
return ERR_PTR(-ECHILD);
- page = read_mapping_page(inode->i_mapping, 0, NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
+ folio = read_mapping_folio(inode->i_mapping, 0, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
- target = fscrypt_get_symlink(inode, page_address(page),
+ target = fscrypt_get_symlink(inode, folio_address(folio),
inode->i_sb->s_blocksize, done);
- put_page(page);
+ folio_put(folio);
return target;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f88392fc4ba9..27743b93e186 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -120,25 +120,25 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type)
return res;
}
-static void clear_node_page_dirty(struct page *page)
+static void clear_node_folio_dirty(struct folio *folio)
{
- if (PageDirty(page)) {
- f2fs_clear_page_cache_dirty_tag(page_folio(page));
- clear_page_dirty_for_io(page);
- dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
+ if (folio_test_dirty(folio)) {
+ f2fs_clear_page_cache_dirty_tag(folio);
+ folio_clear_dirty_for_io(folio);
+ dec_page_count(F2FS_F_SB(folio), F2FS_DIRTY_NODES);
}
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
}
-static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid)
{
- return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid));
+ return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid));
}
-static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
+static struct folio *get_next_nat_folio(struct f2fs_sb_info *sbi, nid_t nid)
{
- struct page *src_page;
- struct page *dst_page;
+ struct folio *src_folio;
+ struct folio *dst_folio;
pgoff_t dst_off;
void *src_addr;
void *dst_addr;
@@ -147,21 +147,21 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid));
/* get current nat block page with lock */
- src_page = get_current_nat_page(sbi, nid);
- if (IS_ERR(src_page))
- return src_page;
- dst_page = f2fs_grab_meta_page(sbi, dst_off);
- f2fs_bug_on(sbi, PageDirty(src_page));
-
- src_addr = page_address(src_page);
- dst_addr = page_address(dst_page);
+ src_folio = get_current_nat_folio(sbi, nid);
+ if (IS_ERR(src_folio))
+ return src_folio;
+ dst_folio = f2fs_grab_meta_folio(sbi, dst_off);
+ f2fs_bug_on(sbi, folio_test_dirty(src_folio));
+
+ src_addr = folio_address(src_folio);
+ dst_addr = folio_address(dst_folio);
memcpy(dst_addr, src_addr, PAGE_SIZE);
- set_page_dirty(dst_page);
- f2fs_put_page(src_page, 1);
+ folio_mark_dirty(dst_folio);
+ f2fs_folio_put(src_folio, true);
set_to_next_nat(nm_i, nid);
- return dst_page;
+ return dst_folio;
}
static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi,
@@ -185,7 +185,7 @@ static void __free_nat_entry(struct nat_entry *e)
/* must be locked by nat_tree_lock */
static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
+ struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail, bool init_dirty)
{
if (no_fail)
f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
@@ -195,6 +195,12 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
if (raw_ne)
node_info_from_raw_nat(&ne->ni, raw_ne);
+ if (init_dirty) {
+ INIT_LIST_HEAD(&ne->list);
+ nm_i->nat_cnt[TOTAL_NAT]++;
+ return ne;
+ }
+
spin_lock(&nm_i->nat_list_lock);
list_add_tail(&ne->list, &nm_i->nat_entries);
spin_unlock(&nm_i->nat_list_lock);
@@ -204,14 +210,17 @@ static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
return ne;
}
-static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
+static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n, bool for_dirty)
{
struct nat_entry *ne;
ne = radix_tree_lookup(&nm_i->nat_root, n);
- /* for recent accessed nat entry, move it to tail of lru list */
- if (ne && !get_nat_flag(ne, IS_DIRTY)) {
+ /*
+ * for recent accessed nat entry which will not be dirtied soon
+ * later, move it to tail of lru list.
+ */
+ if (ne && !get_nat_flag(ne, IS_DIRTY) && !for_dirty) {
spin_lock(&nm_i->nat_list_lock);
if (!list_empty(&ne->list))
list_move_tail(&ne->list, &nm_i->nat_entries);
@@ -256,7 +265,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
}
static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
- struct nat_entry *ne)
+ struct nat_entry *ne, bool init_dirty)
{
struct nat_entry_set *head;
bool new_ne = nat_get_blkaddr(ne) == NEW_ADDR;
@@ -279,7 +288,8 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
goto refresh_list;
nm_i->nat_cnt[DIRTY_NAT]++;
- nm_i->nat_cnt[RECLAIMABLE_NAT]--;
+ if (!init_dirty)
+ nm_i->nat_cnt[RECLAIMABLE_NAT]--;
set_nat_flag(ne, IS_DIRTY, true);
refresh_list:
spin_lock(&nm_i->nat_list_lock);
@@ -310,10 +320,9 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
start, nr);
}
-bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page)
+bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio)
{
- return NODE_MAPPING(sbi) == page->mapping &&
- IS_DNODE(page) && is_cold_node(page);
+ return is_node_folio(folio) && IS_DNODE(folio) && is_cold_node(folio);
}
void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
@@ -325,7 +334,7 @@ void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi)
}
static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
- struct page *page)
+ struct folio *folio)
{
struct fsync_node_entry *fn;
unsigned long flags;
@@ -334,8 +343,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab,
GFP_NOFS, true, NULL);
- get_page(page);
- fn->page = page;
+ folio_get(folio);
+ fn->folio = folio;
INIT_LIST_HEAD(&fn->list);
spin_lock_irqsave(&sbi->fsync_node_lock, flags);
@@ -348,19 +357,19 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi,
return seq_id;
}
-void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page)
+void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio)
{
struct fsync_node_entry *fn;
unsigned long flags;
spin_lock_irqsave(&sbi->fsync_node_lock, flags);
list_for_each_entry(fn, &sbi->fsync_node_list, list) {
- if (fn->page == page) {
+ if (fn->folio == folio) {
list_del(&fn->list);
sbi->fsync_node_num--;
spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
kmem_cache_free(fsync_node_entry_slab, fn);
- put_page(page);
+ folio_put(folio);
return;
}
}
@@ -384,7 +393,7 @@ int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
bool need = false;
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
@@ -401,7 +410,7 @@ bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
bool is_cp = true;
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
f2fs_up_read(&nm_i->nat_tree_lock);
@@ -415,7 +424,7 @@ bool f2fs_need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
bool need_update = true;
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, ino);
+ e = __lookup_nat_cache(nm_i, ino, false);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
@@ -440,9 +449,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
return;
f2fs_down_write(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (!e)
- e = __init_nat_entry(nm_i, new, ne, false);
+ e = __init_nat_entry(nm_i, new, ne, false, false);
else
f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
nat_get_blkaddr(e) !=
@@ -459,11 +468,13 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true);
+ bool init_dirty = false;
f2fs_down_write(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, ni->nid);
+ e = __lookup_nat_cache(nm_i, ni->nid, true);
if (!e) {
- e = __init_nat_entry(nm_i, new, NULL, true);
+ init_dirty = true;
+ e = __init_nat_entry(nm_i, new, NULL, true, true);
copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
@@ -499,11 +510,11 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
nat_set_blkaddr(e, new_blkaddr);
if (!__is_valid_data_blkaddr(new_blkaddr))
set_nat_flag(e, IS_CHECKPOINTED, false);
- __set_nat_cache_dirty(nm_i, e);
+ __set_nat_cache_dirty(nm_i, e, init_dirty);
/* update fsync_mark if its inode nat entry is still alive */
if (ni->nid != ni->ino)
- e = __lookup_nat_cache(nm_i, ni->ino);
+ e = __lookup_nat_cache(nm_i, ni->ino, false);
if (e) {
if (fsync_done && ni->nid == ni->ino)
set_nat_flag(e, HAS_FSYNCED_INODE, true);
@@ -551,24 +562,28 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
struct f2fs_journal *journal = curseg->journal;
nid_t start_nid = START_NID(nid);
struct f2fs_nat_block *nat_blk;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct f2fs_nat_entry ne;
struct nat_entry *e;
pgoff_t index;
- block_t blkaddr;
int i;
+ bool need_cache = true;
ni->flag = 0;
ni->nid = nid;
retry:
/* Check nat cache */
f2fs_down_read(&nm_i->nat_tree_lock);
- e = __lookup_nat_cache(nm_i, nid);
+ e = __lookup_nat_cache(nm_i, nid, false);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
f2fs_up_read(&nm_i->nat_tree_lock);
+ if (IS_ENABLED(CONFIG_F2FS_CHECK_FS)) {
+ need_cache = false;
+ goto sanity_check;
+ }
return 0;
}
@@ -594,38 +609,47 @@ retry:
up_read(&curseg->journal_rwsem);
if (i >= 0) {
f2fs_up_read(&nm_i->nat_tree_lock);
- goto cache;
+ goto sanity_check;
}
/* Fill node_info from nat page */
index = current_nat_addr(sbi, nid);
f2fs_up_read(&nm_i->nat_tree_lock);
- page = f2fs_get_meta_page(sbi, index);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = f2fs_get_meta_folio(sbi, index);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- nat_blk = (struct f2fs_nat_block *)page_address(page);
+ nat_blk = folio_address(folio);
ne = nat_blk->entries[nid - start_nid];
node_info_from_raw_nat(ni, &ne);
- f2fs_put_page(page, 1);
-cache:
- blkaddr = le32_to_cpu(ne.block_addr);
- if (__is_valid_data_blkaddr(blkaddr) &&
- !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
- return -EFAULT;
+ f2fs_folio_put(folio, true);
+sanity_check:
+ if (__is_valid_data_blkaddr(ni->blk_addr) &&
+ !f2fs_is_valid_blkaddr(sbi, ni->blk_addr,
+ DATA_GENERIC_ENHANCE)) {
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_err_ratelimited(sbi,
+ "f2fs_get_node_info of %pS: inconsistent nat entry, "
+ "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u",
+ __builtin_return_address(0),
+ ni->ino, ni->nid, ni->blk_addr, ni->version, ni->flag);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_NAT);
+ return -EFSCORRUPTED;
+ }
/* cache nat entry */
- cache_nat_entry(sbi, nid, &ne);
+ if (need_cache)
+ cache_nat_entry(sbi, nid, &ne);
return 0;
}
/*
* readahead MAX_RA_NODE number of node pages.
*/
-static void f2fs_ra_node_pages(struct page *parent, int start, int n)
+static void f2fs_ra_node_pages(struct folio *parent, int start, int n)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ struct f2fs_sb_info *sbi = F2FS_F_SB(parent);
struct blk_plug plug;
int i, end;
nid_t nid;
@@ -754,6 +778,8 @@ got:
return level;
}
+static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start);
+
/*
* Caller should call f2fs_put_dnode(dn).
* Also, it should grab and release a rwsem by calling f2fs_lock_op() and
@@ -762,8 +788,8 @@ got:
int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct page *npage[4];
- struct page *parent = NULL;
+ struct folio *nfolio[4];
+ struct folio *parent = NULL;
int offset[4];
unsigned int noffset[4];
nid_t nids[4];
@@ -775,31 +801,42 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
return level;
nids[0] = dn->inode->i_ino;
- npage[0] = dn->inode_page;
- if (!npage[0]) {
- npage[0] = f2fs_get_node_page(sbi, nids[0]);
- if (IS_ERR(npage[0]))
- return PTR_ERR(npage[0]);
+ if (!dn->inode_folio) {
+ nfolio[0] = f2fs_get_inode_folio(sbi, nids[0]);
+ if (IS_ERR(nfolio[0]))
+ return PTR_ERR(nfolio[0]);
+ } else {
+ nfolio[0] = dn->inode_folio;
}
/* if inline_data is set, should not report any block indices */
if (f2fs_has_inline_data(dn->inode) && index) {
err = -ENOENT;
- f2fs_put_page(npage[0], 1);
+ f2fs_folio_put(nfolio[0], true);
goto release_out;
}
- parent = npage[0];
+ parent = nfolio[0];
if (level != 0)
nids[1] = get_nid(parent, offset[0], true);
- dn->inode_page = npage[0];
- dn->inode_page_locked = true;
+ dn->inode_folio = nfolio[0];
+ dn->inode_folio_locked = true;
/* get indirect or direct nodes */
for (i = 1; i <= level; i++) {
bool done = false;
+ if (nids[i] && nids[i] == dn->inode->i_ino) {
+ err = -EFSCORRUPTED;
+ f2fs_err_ratelimited(sbi,
+ "inode mapping table is corrupted, run fsck to fix it, "
+ "ino:%lu, nid:%u, level:%d, offset:%d",
+ dn->inode->i_ino, nids[i], level, offset[level]);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ goto release_pages;
+ }
+
if (!nids[i] && mode == ALLOC_NODE) {
/* alloc new node */
if (!f2fs_alloc_nid(sbi, &(nids[i]))) {
@@ -808,10 +845,10 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
}
dn->nid = nids[i];
- npage[i] = f2fs_new_node_page(dn, noffset[i]);
- if (IS_ERR(npage[i])) {
+ nfolio[i] = f2fs_new_node_folio(dn, noffset[i]);
+ if (IS_ERR(nfolio[i])) {
f2fs_alloc_nid_failed(sbi, nids[i]);
- err = PTR_ERR(npage[i]);
+ err = PTR_ERR(nfolio[i]);
goto release_pages;
}
@@ -819,36 +856,36 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
f2fs_alloc_nid_done(sbi, nids[i]);
done = true;
} else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
- npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]);
- if (IS_ERR(npage[i])) {
- err = PTR_ERR(npage[i]);
+ nfolio[i] = f2fs_get_node_folio_ra(parent, offset[i - 1]);
+ if (IS_ERR(nfolio[i])) {
+ err = PTR_ERR(nfolio[i]);
goto release_pages;
}
done = true;
}
if (i == 1) {
- dn->inode_page_locked = false;
- unlock_page(parent);
+ dn->inode_folio_locked = false;
+ folio_unlock(parent);
} else {
- f2fs_put_page(parent, 1);
+ f2fs_folio_put(parent, true);
}
if (!done) {
- npage[i] = f2fs_get_node_page(sbi, nids[i]);
- if (IS_ERR(npage[i])) {
- err = PTR_ERR(npage[i]);
- f2fs_put_page(npage[0], 0);
+ nfolio[i] = f2fs_get_node_folio(sbi, nids[i]);
+ if (IS_ERR(nfolio[i])) {
+ err = PTR_ERR(nfolio[i]);
+ f2fs_folio_put(nfolio[0], false);
goto release_out;
}
}
if (i < level) {
- parent = npage[i];
+ parent = nfolio[i];
nids[i + 1] = get_nid(parent, offset[i], false);
}
}
dn->nid = nids[level];
dn->ofs_in_node = offset[level];
- dn->node_page = npage[level];
+ dn->node_folio = nfolio[level];
dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
@@ -869,9 +906,9 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
if (!c_len)
goto out;
- blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node);
+ blkaddr = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node);
if (blkaddr == COMPRESS_ADDR)
- blkaddr = data_blkaddr(dn->inode, dn->node_page,
+ blkaddr = data_blkaddr(dn->inode, dn->node_folio,
ofs_in_node + 1);
f2fs_update_read_extent_tree_range_compressed(dn->inode,
@@ -881,12 +918,12 @@ out:
return 0;
release_pages:
- f2fs_put_page(parent, 1);
+ f2fs_folio_put(parent, true);
if (i > 1)
- f2fs_put_page(npage[0], 0);
+ f2fs_folio_put(nfolio[0], false);
release_out:
- dn->inode_page = NULL;
- dn->node_page = NULL;
+ dn->inode_folio = NULL;
+ dn->node_folio = NULL;
if (err == -ENOENT) {
dn->cur_level = i;
dn->max_level = level;
@@ -927,16 +964,16 @@ static int truncate_node(struct dnode_of_data *dn)
f2fs_inode_synced(dn->inode);
}
- clear_node_page_dirty(dn->node_page);
+ clear_node_folio_dirty(dn->node_folio);
set_sbi_flag(sbi, SBI_IS_DIRTY);
- index = page_folio(dn->node_page)->index;
- f2fs_put_page(dn->node_page, 1);
+ index = dn->node_folio->index;
+ f2fs_folio_put(dn->node_folio, true);
invalidate_mapping_pages(NODE_MAPPING(sbi),
index, index);
- dn->node_page = NULL;
+ dn->node_folio = NULL;
trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
return 0;
@@ -945,35 +982,35 @@ static int truncate_node(struct dnode_of_data *dn)
static int truncate_dnode(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct page *page;
+ struct folio *folio;
int err;
if (dn->nid == 0)
return 1;
/* get direct node */
- page = f2fs_get_node_page(sbi, dn->nid);
- if (PTR_ERR(page) == -ENOENT)
+ folio = f2fs_get_node_folio(sbi, dn->nid);
+ if (PTR_ERR(folio) == -ENOENT)
return 1;
- else if (IS_ERR(page))
- return PTR_ERR(page);
+ else if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (IS_INODE(page) || ino_of_node(page) != dn->inode->i_ino) {
+ if (IS_INODE(folio) || ino_of_node(folio) != dn->inode->i_ino) {
f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u",
- dn->inode->i_ino, dn->nid, ino_of_node(page));
+ dn->inode->i_ino, dn->nid, ino_of_node(folio));
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return -EFSCORRUPTED;
}
/* Make dnode_of_data for parameter */
- dn->node_page = page;
+ dn->node_folio = folio;
dn->ofs_in_node = 0;
f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode));
err = truncate_node(dn);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return err;
}
@@ -984,7 +1021,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
int ofs, int depth)
{
struct dnode_of_data rdn = *dn;
- struct page *page;
+ struct folio *folio;
struct f2fs_node *rn;
nid_t child_nid;
unsigned int child_nofs;
@@ -996,15 +1033,15 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
- page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid);
- if (IS_ERR(page)) {
- trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
- return PTR_ERR(page);
+ folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid);
+ if (IS_ERR(folio)) {
+ trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio));
+ return PTR_ERR(folio);
}
- f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK);
+ f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK);
- rn = F2FS_NODE(page);
+ rn = F2FS_NODE(folio);
if (depth < 3) {
for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
child_nid = le32_to_cpu(rn->in.nid[i]);
@@ -1014,7 +1051,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
ret = truncate_dnode(&rdn);
if (ret < 0)
goto out_err;
- if (set_nid(page, i, 0, false))
+ if (set_nid(folio, i, 0, false))
dn->node_changed = true;
}
} else {
@@ -1028,7 +1065,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
rdn.nid = child_nid;
ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
if (ret == (NIDS_PER_BLOCK + 1)) {
- if (set_nid(page, i, 0, false))
+ if (set_nid(folio, i, 0, false))
dn->node_changed = true;
child_nofs += ret;
} else if (ret < 0 && ret != -ENOENT) {
@@ -1040,19 +1077,19 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
if (!ofs) {
/* remove current indirect node */
- dn->node_page = page;
+ dn->node_folio = folio;
ret = truncate_node(dn);
if (ret)
goto out_err;
freed++;
} else {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
}
trace_f2fs_truncate_nodes_exit(dn->inode, freed);
return freed;
out_err:
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
trace_f2fs_truncate_nodes_exit(dn->inode, ret);
return ret;
}
@@ -1060,59 +1097,59 @@ out_err:
static int truncate_partial_nodes(struct dnode_of_data *dn,
struct f2fs_inode *ri, int *offset, int depth)
{
- struct page *pages[2];
+ struct folio *folios[2];
nid_t nid[3];
nid_t child_nid;
int err = 0;
int i;
int idx = depth - 2;
- nid[0] = get_nid(dn->inode_page, offset[0], true);
+ nid[0] = get_nid(dn->inode_folio, offset[0], true);
if (!nid[0])
return 0;
/* get indirect nodes in the path */
for (i = 0; i < idx + 1; i++) {
/* reference count'll be increased */
- pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]);
- if (IS_ERR(pages[i])) {
- err = PTR_ERR(pages[i]);
+ folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]);
+ if (IS_ERR(folios[i])) {
+ err = PTR_ERR(folios[i]);
idx = i - 1;
goto fail;
}
- nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
+ nid[i + 1] = get_nid(folios[i], offset[i + 1], false);
}
- f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
+ f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK);
/* free direct nodes linked to a partial indirect node */
for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
- child_nid = get_nid(pages[idx], i, false);
+ child_nid = get_nid(folios[idx], i, false);
if (!child_nid)
continue;
dn->nid = child_nid;
err = truncate_dnode(dn);
if (err < 0)
goto fail;
- if (set_nid(pages[idx], i, 0, false))
+ if (set_nid(folios[idx], i, 0, false))
dn->node_changed = true;
}
if (offset[idx + 1] == 0) {
- dn->node_page = pages[idx];
+ dn->node_folio = folios[idx];
dn->nid = nid[idx];
err = truncate_node(dn);
if (err)
goto fail;
} else {
- f2fs_put_page(pages[idx], 1);
+ f2fs_folio_put(folios[idx], true);
}
offset[idx]++;
offset[idx + 1] = 0;
idx--;
fail:
for (i = idx; i >= 0; i--)
- f2fs_put_page(pages[i], 1);
+ f2fs_folio_put(folios[i], true);
trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
@@ -1130,26 +1167,33 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
unsigned int nofs = 0;
struct f2fs_inode *ri;
struct dnode_of_data dn;
- struct page *page;
+ struct folio *folio;
trace_f2fs_truncate_inode_blocks_enter(inode, from);
level = get_node_path(inode, from, offset, noffset);
- if (level < 0) {
+ if (level <= 0) {
+ if (!level) {
+ level = -EFSCORRUPTED;
+ f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u",
+ __func__, inode->i_ino,
+ from, ADDRS_PER_INODE(inode));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
trace_f2fs_truncate_inode_blocks_exit(inode, level);
return level;
}
- page = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page)) {
- trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
- return PTR_ERR(page);
+ folio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(folio)) {
+ trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio));
+ return PTR_ERR(folio);
}
- set_new_dnode(&dn, inode, page, NULL, 0);
- unlock_page(page);
+ set_new_dnode(&dn, inode, folio, NULL, 0);
+ folio_unlock(folio);
- ri = F2FS_INODE(page);
+ ri = F2FS_INODE(folio);
switch (level) {
case 0:
case 1:
@@ -1178,7 +1222,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from)
skip_partial:
while (cont) {
- dn.nid = get_nid(page, offset[0], true);
+ dn.nid = get_nid(folio, offset[0], true);
switch (offset[0]) {
case NODE_DIR1_BLOCK:
case NODE_DIR2_BLOCK:
@@ -1199,7 +1243,7 @@ skip_partial:
BUG();
}
if (err == -ENOENT) {
- set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK);
+ set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK);
f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
f2fs_err_ratelimited(sbi,
"truncate node fail, ino:%lu, nid:%u, "
@@ -1210,18 +1254,18 @@ skip_partial:
}
if (err < 0)
goto fail;
- if (offset[1] == 0 && get_nid(page, offset[0], true)) {
- lock_page(page);
- BUG_ON(page->mapping != NODE_MAPPING(sbi));
- set_nid(page, offset[0], 0, true);
- unlock_page(page);
+ if (offset[1] == 0 && get_nid(folio, offset[0], true)) {
+ folio_lock(folio);
+ BUG_ON(!is_node_folio(folio));
+ set_nid(folio, offset[0], 0, true);
+ folio_unlock(folio);
}
offset[1] = 0;
offset[0]++;
nofs += err;
}
fail:
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
trace_f2fs_truncate_inode_blocks_exit(inode, err);
return err > 0 ? 0 : err;
}
@@ -1232,20 +1276,20 @@ int f2fs_truncate_xattr_node(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t nid = F2FS_I(inode)->i_xattr_nid;
struct dnode_of_data dn;
- struct page *npage;
+ struct folio *nfolio;
int err;
if (!nid)
return 0;
- npage = f2fs_get_node_page(sbi, nid);
- if (IS_ERR(npage))
- return PTR_ERR(npage);
+ nfolio = f2fs_get_xnode_folio(sbi, nid);
+ if (IS_ERR(nfolio))
+ return PTR_ERR(nfolio);
- set_new_dnode(&dn, inode, NULL, npage, nid);
+ set_new_dnode(&dn, inode, NULL, nfolio, nid);
err = truncate_node(&dn);
if (err) {
- f2fs_put_page(npage, 1);
+ f2fs_folio_put(nfolio, true);
return err;
}
@@ -1302,30 +1346,30 @@ int f2fs_remove_inode_page(struct inode *inode)
return 0;
}
-struct page *f2fs_new_inode_page(struct inode *inode)
+struct folio *f2fs_new_inode_folio(struct inode *inode)
{
struct dnode_of_data dn;
/* allocate inode page for new inode */
set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
- /* caller should f2fs_put_page(page, 1); */
- return f2fs_new_node_page(&dn, 0);
+ /* caller should f2fs_folio_put(folio, true); */
+ return f2fs_new_node_folio(&dn, 0);
}
-struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
+struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct node_info new_ni;
- struct page *page;
+ struct folio *folio;
int err;
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
- page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), dn->nid, false);
+ if (IS_ERR(folio))
+ return folio;
if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
goto fail;
@@ -1341,7 +1385,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
dec_valid_node_count(sbi, dn->inode, !ofs);
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn_ratelimited(sbi,
- "f2fs_new_node_page: inconsistent nat entry, "
+ "f2fs_new_node_folio: inconsistent nat entry, "
"ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u",
new_ni.ino, new_ni.nid, new_ni.blk_addr,
new_ni.version, new_ni.flag);
@@ -1356,12 +1400,12 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
new_ni.version = 0;
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
- f2fs_wait_on_page_writeback(page, NODE, true, true);
- fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
- set_cold_node(page, S_ISDIR(dn->inode->i_mode));
- if (!PageUptodate(page))
- SetPageUptodate(page);
- if (set_page_dirty(page))
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
+ fill_node_footer(folio, dn->nid, dn->inode->i_ino, ofs, true);
+ set_cold_node(folio, S_ISDIR(dn->inode->i_mode));
+ if (!folio_test_uptodate(folio))
+ folio_mark_uptodate(folio);
+ if (folio_mark_dirty(folio))
dn->node_changed = true;
if (f2fs_has_xattr_block(ofs))
@@ -1369,35 +1413,34 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
if (ofs == 0)
inc_valid_inode_count(sbi);
- return page;
+ return folio;
fail:
- clear_node_page_dirty(page);
- f2fs_put_page(page, 1);
+ clear_node_folio_dirty(folio);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
/*
* Caller should do after getting the following values.
- * 0: f2fs_put_page(page, 0)
- * LOCKED_PAGE or error: f2fs_put_page(page, 1)
+ * 0: f2fs_folio_put(folio, false)
+ * LOCKED_PAGE or error: f2fs_folio_put(folio, true)
*/
-static int read_node_page(struct page *page, blk_opf_t op_flags)
+static int read_node_folio(struct folio *folio, blk_opf_t op_flags)
{
- struct folio *folio = page_folio(page);
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
struct node_info ni;
struct f2fs_io_info fio = {
.sbi = sbi,
.type = NODE,
.op = REQ_OP_READ,
.op_flags = op_flags,
- .page = page,
+ .folio = folio,
.encrypted_page = NULL,
};
int err;
if (folio_test_uptodate(folio)) {
- if (!f2fs_inode_chksum_verify(sbi, page)) {
+ if (!f2fs_inode_chksum_verify(sbi, folio)) {
folio_clear_uptodate(folio);
return -EFSBADCRC;
}
@@ -1429,7 +1472,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags)
*/
void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
{
- struct page *apage;
+ struct folio *afolio;
int err;
if (!nid)
@@ -1437,22 +1480,43 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
if (f2fs_check_nid_range(sbi, nid))
return;
- apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
- if (apage)
+ afolio = xa_load(&NODE_MAPPING(sbi)->i_pages, nid);
+ if (afolio)
return;
- apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
- if (!apage)
+ afolio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false);
+ if (IS_ERR(afolio))
return;
- err = read_node_page(apage, REQ_RAHEAD);
- f2fs_put_page(apage, err ? 1 : 0);
+ err = read_node_folio(afolio, REQ_RAHEAD);
+ f2fs_folio_put(afolio, err ? true : false);
}
-static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
- struct page *parent, int start)
+static int sanity_check_node_footer(struct f2fs_sb_info *sbi,
+ struct folio *folio, pgoff_t nid,
+ enum node_type ntype)
{
- struct page *page;
+ if (unlikely(nid != nid_of_node(folio) ||
+ (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) ||
+ (ntype == NODE_TYPE_XATTR &&
+ !f2fs_has_xattr_block(ofs_of_node(folio))) ||
+ time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) {
+ f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, "
+ "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
+ ntype, nid, nid_of_node(folio), ino_of_node(folio),
+ ofs_of_node(folio), cpver_of_node(folio),
+ next_blkaddr_of_node(folio));
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
+ return -EFSCORRUPTED;
+ }
+ return 0;
+}
+
+static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid,
+ struct folio *parent, int start, enum node_type ntype)
+{
+ struct folio *folio;
int err;
if (!nid)
@@ -1460,75 +1524,76 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
if (f2fs_check_nid_range(sbi, nid))
return ERR_PTR(-EINVAL);
repeat:
- page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false);
+ if (IS_ERR(folio))
+ return folio;
- err = read_node_page(page, 0);
- if (err < 0) {
+ err = read_node_folio(folio, 0);
+ if (err < 0)
goto out_put_err;
- } else if (err == LOCKED_PAGE) {
- err = 0;
+ if (err == LOCKED_PAGE)
goto page_hit;
- }
if (parent)
f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE);
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
- f2fs_put_page(page, 1);
+ if (unlikely(!is_node_folio(folio))) {
+ f2fs_folio_put(folio, true);
goto repeat;
}
- if (unlikely(!PageUptodate(page))) {
+ if (unlikely(!folio_test_uptodate(folio))) {
err = -EIO;
goto out_err;
}
- if (!f2fs_inode_chksum_verify(sbi, page)) {
+ if (!f2fs_inode_chksum_verify(sbi, folio)) {
err = -EFSBADCRC;
goto out_err;
}
page_hit:
- if (likely(nid == nid_of_node(page)))
- return page;
-
- f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
- nid, nid_of_node(page), ino_of_node(page),
- ofs_of_node(page), cpver_of_node(page),
- next_blkaddr_of_node(page));
- set_sbi_flag(sbi, SBI_NEED_FSCK);
- f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
- err = -EFSCORRUPTED;
+ err = sanity_check_node_footer(sbi, folio, nid, ntype);
+ if (!err)
+ return folio;
out_err:
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
out_put_err:
- /* ENOENT comes from read_node_page which is not an error. */
+ /* ENOENT comes from read_node_folio which is not an error. */
if (err != -ENOENT)
- f2fs_handle_page_eio(sbi, page_folio(page), NODE);
- f2fs_put_page(page, 1);
+ f2fs_handle_page_eio(sbi, folio, NODE);
+ f2fs_folio_put(folio, true);
return ERR_PTR(err);
}
-struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+ return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR);
+}
+
+struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino)
{
- return __get_node_page(sbi, nid, NULL, 0);
+ return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE);
}
-struct page *f2fs_get_node_page_ra(struct page *parent, int start)
+struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+ return __get_node_folio(sbi, xnid, NULL, 0, NODE_TYPE_XATTR);
+}
+
+static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start)
+{
+ struct f2fs_sb_info *sbi = F2FS_F_SB(parent);
nid_t nid = get_nid(parent, start, false);
- return __get_node_page(sbi, nid, parent, start);
+ return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR);
}
static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode;
- struct page *page;
+ struct folio *folio;
int ret;
/* should flush inline_data before evict_inode */
@@ -1536,36 +1601,36 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
if (!inode)
return;
- page = f2fs_pagecache_get_page(inode->i_mapping, 0,
+ folio = f2fs_filemap_get_folio(inode->i_mapping, 0,
FGP_LOCK|FGP_NOWAIT, 0);
- if (!page)
+ if (IS_ERR(folio))
goto iput_out;
- if (!PageUptodate(page))
- goto page_out;
+ if (!folio_test_uptodate(folio))
+ goto folio_out;
- if (!PageDirty(page))
- goto page_out;
+ if (!folio_test_dirty(folio))
+ goto folio_out;
- if (!clear_page_dirty_for_io(page))
- goto page_out;
+ if (!folio_clear_dirty_for_io(folio))
+ goto folio_out;
- ret = f2fs_write_inline_data(inode, page_folio(page));
+ ret = f2fs_write_inline_data(inode, folio);
inode_dec_dirty_pages(inode);
f2fs_remove_dirty_inode(inode);
if (ret)
- set_page_dirty(page);
-page_out:
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+folio_out:
+ f2fs_folio_put(folio, true);
iput_out:
iput(inode);
}
-static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
+static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
{
pgoff_t index;
struct folio_batch fbatch;
- struct page *last_page = NULL;
+ struct folio *last_folio = NULL;
int nr_folios;
folio_batch_init(&fbatch);
@@ -1577,62 +1642,61 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
if (unlikely(f2fs_cp_error(sbi))) {
- f2fs_put_page(last_page, 0);
+ f2fs_folio_put(last_folio, false);
folio_batch_release(&fbatch);
return ERR_PTR(-EIO);
}
- if (!IS_DNODE(page) || !is_cold_node(page))
+ if (!IS_DNODE(folio) || !is_cold_node(folio))
continue;
- if (ino_of_node(page) != ino)
+ if (ino_of_node(folio) != ino)
continue;
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+ if (unlikely(!is_node_folio(folio))) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (ino_of_node(page) != ino)
+ if (ino_of_node(folio) != ino)
goto continue_unlock;
- if (!PageDirty(page)) {
+ if (!folio_test_dirty(folio)) {
/* someone wrote it for us */
goto continue_unlock;
}
- if (last_page)
- f2fs_put_page(last_page, 0);
+ if (last_folio)
+ f2fs_folio_put(last_folio, false);
- get_page(page);
- last_page = page;
- unlock_page(page);
+ folio_get(folio);
+ last_folio = folio;
+ folio_unlock(folio);
}
folio_batch_release(&fbatch);
cond_resched();
}
- return last_page;
+ return last_folio;
}
-static int __write_node_page(struct page *page, bool atomic, bool *submitted,
+static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted,
struct writeback_control *wbc, bool do_balance,
enum iostat_type io_type, unsigned int *seq_id)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
- struct folio *folio = page_folio(page);
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
nid_t nid;
struct node_info ni;
struct f2fs_io_info fio = {
.sbi = sbi,
- .ino = ino_of_node(page),
+ .ino = ino_of_node(folio),
.type = NODE,
.op = REQ_OP_WRITE,
.op_flags = wbc_to_write_flags(wbc),
- .page = page,
+ .folio = folio,
.encrypted_page = NULL,
.submitted = 0,
.io_type = io_type,
@@ -1649,7 +1713,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
folio_clear_uptodate(folio);
dec_page_count(sbi, F2FS_DIRTY_NODES);
folio_unlock(folio);
- return 0;
+ return true;
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -1657,22 +1721,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
wbc->sync_mode == WB_SYNC_NONE &&
- IS_DNODE(page) && is_cold_node(page))
+ IS_DNODE(folio) && is_cold_node(folio))
goto redirty_out;
/* get old block addr of this node page */
- nid = nid_of_node(page);
+ nid = nid_of_node(folio);
f2fs_bug_on(sbi, folio->index != nid);
if (f2fs_get_node_info(sbi, nid, &ni, !do_balance))
goto redirty_out;
- if (wbc->for_reclaim) {
- if (!f2fs_down_read_trylock(&sbi->node_write))
- goto redirty_out;
- } else {
- f2fs_down_read(&sbi->node_write);
- }
+ f2fs_down_read(&sbi->node_write);
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
@@ -1680,7 +1739,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
dec_page_count(sbi, F2FS_DIRTY_NODES);
f2fs_up_read(&sbi->node_write);
folio_unlock(folio);
- return 0;
+ return true;
}
if (__is_valid_data_blkaddr(ni.blk_addr) &&
@@ -1694,8 +1753,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
/* should add to global list before clearing PAGECACHE status */
- if (f2fs_in_warm_node_list(sbi, page)) {
- seq = f2fs_add_fsync_node_entry(sbi, page);
+ if (f2fs_in_warm_node_list(sbi, folio)) {
+ seq = f2fs_add_fsync_node_entry(sbi, folio);
if (seq_id)
*seq_id = seq;
}
@@ -1704,15 +1763,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
- set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
+ set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(folio));
dec_page_count(sbi, F2FS_DIRTY_NODES);
f2fs_up_read(&sbi->node_write);
- if (wbc->for_reclaim) {
- f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE);
- submitted = NULL;
- }
-
folio_unlock(folio);
if (unlikely(f2fs_cp_error(sbi))) {
@@ -1724,14 +1778,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (do_balance)
f2fs_balance_fs(sbi, false);
- return 0;
+ return true;
redirty_out:
folio_redirty_for_writepage(wbc, folio);
- return AOP_WRITEPAGE_ACTIVATE;
+ folio_unlock(folio);
+ return false;
}
-int f2fs_move_node_page(struct page *node_page, int gc_type)
+int f2fs_move_node_folio(struct folio *node_folio, int gc_type)
{
int err = 0;
@@ -1739,43 +1794,33 @@ int f2fs_move_node_page(struct page *node_page, int gc_type)
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 1,
- .for_reclaim = 0,
};
- f2fs_wait_on_page_writeback(node_page, NODE, true, true);
+ f2fs_folio_wait_writeback(node_folio, NODE, true, true);
- set_page_dirty(node_page);
+ folio_mark_dirty(node_folio);
- if (!clear_page_dirty_for_io(node_page)) {
+ if (!folio_clear_dirty_for_io(node_folio)) {
err = -EAGAIN;
goto out_page;
}
- if (__write_node_page(node_page, false, NULL,
- &wbc, false, FS_GC_NODE_IO, NULL)) {
+ if (!__write_node_folio(node_folio, false, NULL,
+ &wbc, false, FS_GC_NODE_IO, NULL))
err = -EAGAIN;
- unlock_page(node_page);
- }
goto release_page;
} else {
/* set page dirty and write it */
- if (!folio_test_writeback(page_folio(node_page)))
- set_page_dirty(node_page);
+ if (!folio_test_writeback(node_folio))
+ folio_mark_dirty(node_folio);
}
out_page:
- unlock_page(node_page);
+ folio_unlock(node_folio);
release_page:
- f2fs_put_page(node_page, 0);
+ f2fs_folio_put(node_folio, false);
return err;
}
-static int f2fs_write_node_page(struct page *page,
- struct writeback_control *wbc)
-{
- return __write_node_page(page, false, NULL, wbc, false,
- FS_NODE_IO, NULL);
-}
-
int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
struct writeback_control *wbc, bool atomic,
unsigned int *seq_id)
@@ -1783,16 +1828,16 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
pgoff_t index;
struct folio_batch fbatch;
int ret = 0;
- struct page *last_page = NULL;
+ struct folio *last_folio = NULL;
bool marked = false;
nid_t ino = inode->i_ino;
int nr_folios;
int nwritten = 0;
if (atomic) {
- last_page = last_fsync_dnode(sbi, ino);
- if (IS_ERR_OR_NULL(last_page))
- return PTR_ERR_OR_ZERO(last_page);
+ last_folio = last_fsync_dnode(sbi, ino);
+ if (IS_ERR_OR_NULL(last_folio))
+ return PTR_ERR_OR_ZERO(last_folio);
}
retry:
folio_batch_init(&fbatch);
@@ -1804,96 +1849,94 @@ retry:
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
bool submitted = false;
if (unlikely(f2fs_cp_error(sbi))) {
- f2fs_put_page(last_page, 0);
+ f2fs_folio_put(last_folio, false);
folio_batch_release(&fbatch);
ret = -EIO;
goto out;
}
- if (!IS_DNODE(page) || !is_cold_node(page))
+ if (!IS_DNODE(folio) || !is_cold_node(folio))
continue;
- if (ino_of_node(page) != ino)
+ if (ino_of_node(folio) != ino)
continue;
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+ if (unlikely(!is_node_folio(folio))) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (ino_of_node(page) != ino)
+ if (ino_of_node(folio) != ino)
goto continue_unlock;
- if (!PageDirty(page) && page != last_page) {
+ if (!folio_test_dirty(folio) && folio != last_folio) {
/* someone wrote it for us */
goto continue_unlock;
}
- f2fs_wait_on_page_writeback(page, NODE, true, true);
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
+ set_fsync_mark(folio, 0);
+ set_dentry_mark(folio, 0);
- if (!atomic || page == last_page) {
- set_fsync_mark(page, 1);
+ if (!atomic || folio == last_folio) {
+ set_fsync_mark(folio, 1);
percpu_counter_inc(&sbi->rf_node_block_count);
- if (IS_INODE(page)) {
+ if (IS_INODE(folio)) {
if (is_inode_flag_set(inode,
FI_DIRTY_INODE))
- f2fs_update_inode(inode, page);
- set_dentry_mark(page,
+ f2fs_update_inode(inode, folio);
+ set_dentry_mark(folio,
f2fs_need_dentry_mark(sbi, ino));
}
/* may be written by other thread */
- if (!PageDirty(page))
- set_page_dirty(page);
+ if (!folio_test_dirty(folio))
+ folio_mark_dirty(folio);
}
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
- ret = __write_node_page(page, atomic &&
- page == last_page,
+ if (!__write_node_folio(folio, atomic &&
+ folio == last_folio,
&submitted, wbc, true,
- FS_NODE_IO, seq_id);
- if (ret) {
- unlock_page(page);
- f2fs_put_page(last_page, 0);
- break;
- } else if (submitted) {
- nwritten++;
+ FS_NODE_IO, seq_id)) {
+ f2fs_folio_put(last_folio, false);
+ folio_batch_release(&fbatch);
+ ret = -EIO;
+ goto out;
}
+ if (submitted)
+ nwritten++;
- if (page == last_page) {
- f2fs_put_page(page, 0);
+ if (folio == last_folio) {
+ f2fs_folio_put(folio, false);
+ folio_batch_release(&fbatch);
marked = true;
- break;
+ goto out;
}
}
folio_batch_release(&fbatch);
cond_resched();
-
- if (ret || marked)
- break;
}
- if (!ret && atomic && !marked) {
+ if (atomic && !marked) {
f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx",
- ino, page_folio(last_page)->index);
- lock_page(last_page);
- f2fs_wait_on_page_writeback(last_page, NODE, true, true);
- set_page_dirty(last_page);
- unlock_page(last_page);
+ ino, last_folio->index);
+ folio_lock(last_folio);
+ f2fs_folio_wait_writeback(last_folio, NODE, true, true);
+ folio_mark_dirty(last_folio);
+ folio_unlock(last_folio);
goto retry;
}
out:
if (nwritten)
f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE);
- return ret ? -EIO : 0;
+ return ret;
}
static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
@@ -1920,18 +1963,18 @@ static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data)
return 1;
}
-static bool flush_dirty_inode(struct page *page)
+static bool flush_dirty_inode(struct folio *folio)
{
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
struct inode *inode;
- nid_t ino = ino_of_node(page);
+ nid_t ino = ino_of_node(folio);
inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL);
if (!inode)
return false;
- f2fs_update_inode(inode, page);
- unlock_page(page);
+ f2fs_update_inode(inode, folio);
+ folio_unlock(folio);
iput(inode);
return true;
@@ -1951,32 +1994,27 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
- if (!IS_INODE(page))
+ if (!IS_INODE(folio))
continue;
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
-continue_unlock:
- unlock_page(page);
- continue;
- }
-
- if (!PageDirty(page)) {
- /* someone wrote it for us */
- goto continue_unlock;
- }
+ if (unlikely(!is_node_folio(folio)))
+ goto unlock;
+ if (!folio_test_dirty(folio))
+ goto unlock;
/* flush inline_data, if it's async context. */
- if (page_private_inline(page)) {
- clear_page_private_inline(page);
- unlock_page(page);
- flush_inline_data(sbi, ino_of_node(page));
+ if (folio_test_f2fs_inline(folio)) {
+ folio_clear_f2fs_inline(folio);
+ folio_unlock(folio);
+ flush_inline_data(sbi, ino_of_node(folio));
continue;
}
- unlock_page(page);
+unlock:
+ folio_unlock(folio);
}
folio_batch_release(&fbatch);
cond_resched();
@@ -2005,7 +2043,7 @@ next_step:
int i;
for (i = 0; i < nr_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
bool submitted = false;
/* give a priority to WB_SYNC threads */
@@ -2021,27 +2059,27 @@ next_step:
* 1. dentry dnodes
* 2. file dnodes
*/
- if (step == 0 && IS_DNODE(page))
+ if (step == 0 && IS_DNODE(folio))
continue;
- if (step == 1 && (!IS_DNODE(page) ||
- is_cold_node(page)))
+ if (step == 1 && (!IS_DNODE(folio) ||
+ is_cold_node(folio)))
continue;
- if (step == 2 && (!IS_DNODE(page) ||
- !is_cold_node(page)))
+ if (step == 2 && (!IS_DNODE(folio) ||
+ !is_cold_node(folio)))
continue;
lock_node:
if (wbc->sync_mode == WB_SYNC_ALL)
- lock_page(page);
- else if (!trylock_page(page))
+ folio_lock(folio);
+ else if (!folio_trylock(folio))
continue;
- if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
+ if (unlikely(!is_node_folio(folio))) {
continue_unlock:
- unlock_page(page);
+ folio_unlock(folio);
continue;
}
- if (!PageDirty(page)) {
+ if (!folio_test_dirty(folio)) {
/* someone wrote it for us */
goto continue_unlock;
}
@@ -2051,30 +2089,32 @@ continue_unlock:
goto write_node;
/* flush inline_data */
- if (page_private_inline(page)) {
- clear_page_private_inline(page);
- unlock_page(page);
- flush_inline_data(sbi, ino_of_node(page));
+ if (folio_test_f2fs_inline(folio)) {
+ folio_clear_f2fs_inline(folio);
+ folio_unlock(folio);
+ flush_inline_data(sbi, ino_of_node(folio));
goto lock_node;
}
/* flush dirty inode */
- if (IS_INODE(page) && flush_dirty_inode(page))
+ if (IS_INODE(folio) && flush_dirty_inode(folio))
goto lock_node;
write_node:
- f2fs_wait_on_page_writeback(page, NODE, true, true);
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
goto continue_unlock;
- set_fsync_mark(page, 0);
- set_dentry_mark(page, 0);
+ set_fsync_mark(folio, 0);
+ set_dentry_mark(folio, 0);
- ret = __write_node_page(page, false, &submitted,
- wbc, do_balance, io_type, NULL);
- if (ret)
- unlock_page(page);
- else if (submitted)
+ if (!__write_node_folio(folio, false, &submitted,
+ wbc, do_balance, io_type, NULL)) {
+ folio_batch_release(&fbatch);
+ ret = -EIO;
+ goto out;
+ }
+ if (submitted)
nwritten++;
if (--wbc->nr_to_write == 0)
@@ -2109,12 +2149,13 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
unsigned int seq_id)
{
struct fsync_node_entry *fn;
- struct page *page;
struct list_head *head = &sbi->fsync_node_list;
unsigned long flags;
unsigned int cur_seq_id = 0;
while (seq_id && cur_seq_id < seq_id) {
+ struct folio *folio;
+
spin_lock_irqsave(&sbi->fsync_node_lock, flags);
if (list_empty(head)) {
spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
@@ -2126,13 +2167,13 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi,
break;
}
cur_seq_id = fn->seq_id;
- page = fn->page;
- get_page(page);
+ folio = fn->folio;
+ folio_get(folio);
spin_unlock_irqrestore(&sbi->fsync_node_lock, flags);
- f2fs_wait_on_page_writeback(page, NODE, true, false);
+ f2fs_folio_wait_writeback(folio, NODE, true, false);
- put_page(page);
+ folio_put(folio);
}
return filemap_check_errors(NODE_MAPPING(sbi));
@@ -2192,12 +2233,12 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
if (!folio_test_uptodate(folio))
folio_mark_uptodate(folio);
#ifdef CONFIG_F2FS_CHECK_FS
- if (IS_INODE(&folio->page))
- f2fs_inode_chksum_set(F2FS_M_SB(mapping), &folio->page);
+ if (IS_INODE(folio))
+ f2fs_inode_chksum_set(F2FS_M_SB(mapping), folio);
#endif
if (filemap_dirty_folio(mapping, folio)) {
inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
- set_page_private_reference(&folio->page);
+ folio_set_f2fs_reference(folio);
return true;
}
return false;
@@ -2207,7 +2248,6 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping,
* Structure of the f2fs node operations
*/
const struct address_space_operations f2fs_node_aops = {
- .writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
.dirty_folio = f2fs_dirty_node_folio,
.invalidate_folio = f2fs_invalidate_folio,
@@ -2269,24 +2309,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
}
}
-bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- unsigned int i;
- bool ret = true;
-
- f2fs_down_read(&nm_i->nat_tree_lock);
- for (i = 0; i < nm_i->nat_blocks; i++) {
- if (!test_bit_le(i, nm_i->nat_block_bitmap)) {
- ret = false;
- break;
- }
- }
- f2fs_up_read(&nm_i->nat_tree_lock);
-
- return ret;
-}
-
static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
bool set, bool build)
{
@@ -2318,7 +2340,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i, *e;
struct nat_entry *ne;
- int err = -EINVAL;
+ int err;
bool ret = false;
/* 0 nid should not be used */
@@ -2332,7 +2354,10 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
i->nid = nid;
i->state = FREE_NID;
- radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
+ err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
+ f2fs_bug_on(sbi, err);
+
+ err = -EINVAL;
spin_lock(&nm_i->nid_list_lock);
@@ -2351,14 +2376,14 @@ static bool add_free_nid(struct f2fs_sb_info *sbi,
* - __lookup_nat_cache
* - f2fs_add_link
* - f2fs_init_inode_metadata
- * - f2fs_new_inode_page
- * - f2fs_new_node_page
+ * - f2fs_new_inode_folio
+ * - f2fs_new_node_folio
* - set_node_addr
* - f2fs_alloc_nid_done
* - __remove_nid_from_list(PREALLOC_NID)
* - __insert_nid_to_list(FREE_NID)
*/
- ne = __lookup_nat_cache(nm_i, nid);
+ ne = __lookup_nat_cache(nm_i, nid, false);
if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
nat_get_blkaddr(ne) != NULL_ADDR))
goto err_out;
@@ -2405,10 +2430,9 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
}
static int scan_nat_page(struct f2fs_sb_info *sbi,
- struct page *nat_page, nid_t start_nid)
+ struct f2fs_nat_block *nat_blk, nid_t start_nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct f2fs_nat_block *nat_blk = page_address(nat_page);
block_t blk_addr;
unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
int i;
@@ -2528,13 +2552,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi,
while (1) {
if (!test_bit_le(NAT_BLOCK_OFFSET(nid),
nm_i->nat_block_bitmap)) {
- struct page *page = get_current_nat_page(sbi, nid);
+ struct folio *folio = get_current_nat_folio(sbi, nid);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
} else {
- ret = scan_nat_page(sbi, page, nid);
- f2fs_put_page(page, 1);
+ ret = scan_nat_page(sbi, folio_address(folio),
+ nid);
+ f2fs_folio_put(folio, true);
}
if (ret) {
@@ -2710,18 +2735,18 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
return nr - nr_shrink;
}
-int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
+int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio)
{
void *src_addr, *dst_addr;
size_t inline_size;
- struct page *ipage;
+ struct folio *ifolio;
struct f2fs_inode *ri;
- ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino);
- if (IS_ERR(ipage))
- return PTR_ERR(ipage);
+ ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ifolio))
+ return PTR_ERR(ifolio);
- ri = F2FS_INODE(page);
+ ri = F2FS_INODE(folio);
if (ri->i_inline & F2FS_INLINE_XATTR) {
if (!f2fs_has_inline_xattr(inode)) {
set_inode_flag(inode, FI_INLINE_XATTR);
@@ -2735,26 +2760,26 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page)
goto update_inode;
}
- dst_addr = inline_xattr_addr(inode, ipage);
- src_addr = inline_xattr_addr(inode, page);
+ dst_addr = inline_xattr_addr(inode, ifolio);
+ src_addr = inline_xattr_addr(inode, folio);
inline_size = inline_xattr_size(inode);
- f2fs_wait_on_page_writeback(ipage, NODE, true, true);
+ f2fs_folio_wait_writeback(ifolio, NODE, true, true);
memcpy(dst_addr, src_addr, inline_size);
update_inode:
- f2fs_update_inode(inode, ipage);
- f2fs_put_page(ipage, 1);
+ f2fs_update_inode(inode, ifolio);
+ f2fs_folio_put(ifolio, true);
return 0;
}
-int f2fs_recover_xattr_data(struct inode *inode, struct page *page)
+int f2fs_recover_xattr_data(struct inode *inode, struct folio *folio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
nid_t new_xnid;
struct dnode_of_data dn;
struct node_info ni;
- struct page *xpage;
+ struct folio *xfolio;
int err;
if (!prev_xnid)
@@ -2775,32 +2800,32 @@ recover_xnid:
return -ENOSPC;
set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
- xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
- if (IS_ERR(xpage)) {
+ xfolio = f2fs_new_node_folio(&dn, XATTR_NODE_OFFSET);
+ if (IS_ERR(xfolio)) {
f2fs_alloc_nid_failed(sbi, new_xnid);
- return PTR_ERR(xpage);
+ return PTR_ERR(xfolio);
}
f2fs_alloc_nid_done(sbi, new_xnid);
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- if (page) {
- memcpy(F2FS_NODE(xpage), F2FS_NODE(page),
+ if (folio) {
+ memcpy(F2FS_NODE(xfolio), F2FS_NODE(folio),
VALID_XATTR_BLOCK_SIZE);
- set_page_dirty(xpage);
+ folio_mark_dirty(xfolio);
}
- f2fs_put_page(xpage, 1);
+ f2fs_folio_put(xfolio, true);
return 0;
}
-int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
+int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct folio *folio)
{
struct f2fs_inode *src, *dst;
- nid_t ino = ino_of_node(page);
+ nid_t ino = ino_of_node(folio);
struct node_info old_ni, new_ni;
- struct page *ipage;
+ struct folio *ifolio;
int err;
err = f2fs_get_node_info(sbi, ino, &old_ni, false);
@@ -2810,8 +2835,8 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
if (unlikely(old_ni.blk_addr != NULL_ADDR))
return -EINVAL;
retry:
- ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
- if (!ipage) {
+ ifolio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), ino, false);
+ if (IS_ERR(ifolio)) {
memalloc_retry_wait(GFP_NOFS);
goto retry;
}
@@ -2819,13 +2844,13 @@ retry:
/* Should not use this inode from free nid list */
remove_free_nid(sbi, ino);
- if (!PageUptodate(ipage))
- SetPageUptodate(ipage);
- fill_node_footer(ipage, ino, ino, 0, true);
- set_cold_node(ipage, false);
+ if (!folio_test_uptodate(ifolio))
+ folio_mark_uptodate(ifolio);
+ fill_node_footer(ifolio, ino, ino, 0, true);
+ set_cold_node(ifolio, false);
- src = F2FS_INODE(page);
- dst = F2FS_INODE(ipage);
+ src = F2FS_INODE(folio);
+ dst = F2FS_INODE(ifolio);
memcpy(dst, src, offsetof(struct f2fs_inode, i_ext));
dst->i_size = 0;
@@ -2861,8 +2886,8 @@ retry:
WARN_ON(1);
set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
- set_page_dirty(ipage);
- f2fs_put_page(ipage, 1);
+ folio_mark_dirty(ifolio);
+ f2fs_folio_put(ifolio, true);
return 0;
}
@@ -2886,17 +2911,17 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true);
for (idx = addr; idx < addr + nrpages; idx++) {
- struct page *page = f2fs_get_tmp_page(sbi, idx);
+ struct folio *folio = f2fs_get_tmp_folio(sbi, idx);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- rn = F2FS_NODE(page);
+ rn = F2FS_NODE(folio);
sum_entry->nid = rn->footer.nid;
sum_entry->version = 0;
sum_entry->ofs_in_node = 0;
sum_entry++;
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
}
invalidate_mapping_pages(META_MAPPING(sbi), addr,
@@ -2911,6 +2936,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_journal *journal = curseg->journal;
int i;
+ bool init_dirty;
down_write(&curseg->journal_rwsem);
for (i = 0; i < nats_in_cursum(journal); i++) {
@@ -2921,12 +2947,15 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
if (f2fs_check_nid_range(sbi, nid))
continue;
+ init_dirty = false;
+
raw_ne = nat_in_journal(journal, i);
- ne = __lookup_nat_cache(nm_i, nid);
+ ne = __lookup_nat_cache(nm_i, nid, true);
if (!ne) {
+ init_dirty = true;
ne = __alloc_nat_entry(sbi, nid, true);
- __init_nat_entry(nm_i, ne, &raw_ne, true);
+ __init_nat_entry(nm_i, ne, &raw_ne, true, true);
}
/*
@@ -2941,7 +2970,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->nid_list_lock);
}
- __set_nat_cache_dirty(nm_i, ne);
+ __set_nat_cache_dirty(nm_i, ne, init_dirty);
}
update_nats_in_cursum(journal, -i);
up_write(&curseg->journal_rwsem);
@@ -2965,32 +2994,15 @@ add_out:
list_add_tail(&nes->set_list, head);
}
-static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs,
- unsigned int valid)
-{
- if (valid == 0) {
- __set_bit_le(nat_ofs, nm_i->empty_nat_bits);
- __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
- return;
- }
-
- __clear_bit_le(nat_ofs, nm_i->empty_nat_bits);
- if (valid == NAT_ENTRY_PER_BLOCK)
- __set_bit_le(nat_ofs, nm_i->full_nat_bits);
- else
- __clear_bit_le(nat_ofs, nm_i->full_nat_bits);
-}
-
-static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
- struct page *page)
+static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
+ const struct f2fs_nat_block *nat_blk)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
- struct f2fs_nat_block *nat_blk = page_address(page);
int valid = 0;
int i = 0;
- if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
+ if (!enabled_nat_bits(sbi, NULL))
return;
if (nat_index == 0) {
@@ -3001,36 +3013,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR)
valid++;
}
-
- __update_nat_bits(nm_i, nat_index, valid);
-}
-
-void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi)
-{
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- unsigned int nat_ofs;
-
- f2fs_down_read(&nm_i->nat_tree_lock);
-
- for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) {
- unsigned int valid = 0, nid_ofs = 0;
-
- /* handle nid zero due to it should never be used */
- if (unlikely(nat_ofs == 0)) {
- valid = 1;
- nid_ofs = 1;
- }
-
- for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) {
- if (!test_bit_le(nid_ofs,
- nm_i->free_nid_bitmap[nat_ofs]))
- valid++;
- }
-
- __update_nat_bits(nm_i, nat_ofs, valid);
+ if (valid == 0) {
+ __set_bit_le(nat_index, nm_i->empty_nat_bits);
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
+ return;
}
- f2fs_up_read(&nm_i->nat_tree_lock);
+ __clear_bit_le(nat_index, nm_i->empty_nat_bits);
+ if (valid == NAT_ENTRY_PER_BLOCK)
+ __set_bit_le(nat_index, nm_i->full_nat_bits);
+ else
+ __clear_bit_le(nat_index, nm_i->full_nat_bits);
}
static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
@@ -3042,25 +3035,25 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
bool to_journal = true;
struct f2fs_nat_block *nat_blk;
struct nat_entry *ne, *cur;
- struct page *page = NULL;
+ struct folio *folio = NULL;
/*
* there are two steps to flush nat entries:
* #1, flush nat entries to journal in current hot data summary block.
* #2, flush nat entries to nat page.
*/
- if ((cpc->reason & CP_UMOUNT) ||
+ if (enabled_nat_bits(sbi, cpc) ||
!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
to_journal = false;
if (to_journal) {
down_write(&curseg->journal_rwsem);
} else {
- page = get_next_nat_page(sbi, start_nid);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = get_next_nat_folio(sbi, start_nid);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- nat_blk = page_address(page);
+ nat_blk = folio_address(folio);
f2fs_bug_on(sbi, !nat_blk);
}
@@ -3096,8 +3089,8 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
if (to_journal) {
up_write(&curseg->journal_rwsem);
} else {
- update_nat_bits(sbi, start_nid, page);
- f2fs_put_page(page, 1);
+ __update_nat_bits(sbi, start_nid, nat_blk);
+ f2fs_folio_put(folio, true);
}
/* Allow dirty nats by node block allocation in write_begin */
@@ -3127,7 +3120,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* during unmount, let's flush nat_bits before checking
* nat_cnt[DIRTY_NAT].
*/
- if (cpc->reason & CP_UMOUNT) {
+ if (enabled_nat_bits(sbi, cpc)) {
f2fs_down_write(&nm_i->nat_tree_lock);
remove_nats_in_journal(sbi);
f2fs_up_write(&nm_i->nat_tree_lock);
@@ -3143,7 +3136,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* entries, remove all entries from journal and merge them
* into nat entry set.
*/
- if (cpc->reason & CP_UMOUNT ||
+ if (enabled_nat_bits(sbi, cpc) ||
!__has_cursum_space(journal,
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
@@ -3180,40 +3173,38 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
__u64 cp_ver = cur_cp_version(ckpt);
block_t nat_bits_addr;
+ if (!enabled_nat_bits(sbi, NULL))
+ return 0;
+
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
nm_i->nat_bits = f2fs_kvzalloc(sbi,
F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
- nm_i->full_nat_bits = nm_i->nat_bits + 8;
- nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
-
- if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
- return 0;
-
nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) -
nm_i->nat_bits_blocks;
for (i = 0; i < nm_i->nat_bits_blocks; i++) {
- struct page *page;
+ struct folio *folio;
- page = f2fs_get_meta_page(sbi, nat_bits_addr++);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = f2fs_get_meta_folio(sbi, nat_bits_addr++);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
memcpy(nm_i->nat_bits + F2FS_BLK_TO_BYTES(i),
- page_address(page), F2FS_BLKSIZE);
- f2fs_put_page(page, 1);
+ folio_address(folio), F2FS_BLKSIZE);
+ f2fs_folio_put(folio, true);
}
cp_ver |= (cur_cp_crc(ckpt) << 32);
if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
- clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
- f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)",
- cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits));
+ disable_nat_bits(sbi, true);
return 0;
}
+ nm_i->full_nat_bits = nm_i->nat_bits + 8;
+ nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
+
f2fs_notice(sbi, "Found nat_bits in checkpoint");
return 0;
}
@@ -3224,7 +3215,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
unsigned int i = 0;
nid_t nid, last_nid;
- if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
+ if (!enabled_nat_bits(sbi, NULL))
return;
for (i = 0; i < nm_i->nat_blocks; i++) {
@@ -3296,6 +3287,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
if (!nm_i->nat_bitmap)
return -ENOMEM;
+ if (!test_opt(sbi, NAT_BITS))
+ disable_nat_bits(sbi, true);
+
err = __get_nat_bitmaps(sbi);
if (err)
return err;
@@ -3436,10 +3430,10 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
}
kvfree(nm_i->free_nid_count);
- kvfree(nm_i->nat_bitmap);
+ kfree(nm_i->nat_bitmap);
kvfree(nm_i->nat_bits);
#ifdef CONFIG_F2FS_CHECK_FS
- kvfree(nm_i->nat_bitmap_mir);
+ kfree(nm_i->nat_bitmap_mir);
#endif
sbi->nm_info = NULL;
kfree(nm_i);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 6aea13024ac1..030390543b54 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -31,7 +31,7 @@
/* control total # of nats */
#define DEF_NAT_CACHE_THRESHOLD 100000
-/* control total # of node writes used for roll-fowrad recovery */
+/* control total # of node writes used for roll-forward recovery */
#define DEF_RF_NODE_BLOCKS 0
/* vector size for gang look-up from nat cache that consists of radix tree */
@@ -52,6 +52,13 @@ enum {
IS_PREALLOC, /* nat entry is preallocated */
};
+/* For node type in __get_node_folio() */
+enum node_type {
+ NODE_TYPE_REGULAR,
+ NODE_TYPE_INODE,
+ NODE_TYPE_XATTR,
+};
+
/*
* For node information
*/
@@ -236,41 +243,41 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
#endif
}
-static inline nid_t ino_of_node(struct page *node_page)
+static inline nid_t ino_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le32_to_cpu(rn->footer.ino);
}
-static inline nid_t nid_of_node(struct page *node_page)
+static inline nid_t nid_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le32_to_cpu(rn->footer.nid);
}
-static inline unsigned int ofs_of_node(struct page *node_page)
+static inline unsigned int ofs_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
unsigned flag = le32_to_cpu(rn->footer.flag);
return flag >> OFFSET_BIT_SHIFT;
}
-static inline __u64 cpver_of_node(struct page *node_page)
+static inline __u64 cpver_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le64_to_cpu(rn->footer.cp_ver);
}
-static inline block_t next_blkaddr_of_node(struct page *node_page)
+static inline block_t next_blkaddr_of_node(const struct folio *node_folio)
{
- struct f2fs_node *rn = F2FS_NODE(node_page);
+ struct f2fs_node *rn = F2FS_NODE(node_folio);
return le32_to_cpu(rn->footer.next_blkaddr);
}
-static inline void fill_node_footer(struct page *page, nid_t nid,
+static inline void fill_node_footer(const struct folio *folio, nid_t nid,
nid_t ino, unsigned int ofs, bool reset)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
unsigned int old_flag = 0;
if (reset)
@@ -286,17 +293,18 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
(old_flag & OFFSET_BIT_MASK));
}
-static inline void copy_node_footer(struct page *dst, struct page *src)
+static inline void copy_node_footer(const struct folio *dst,
+ const struct folio *src)
{
struct f2fs_node *src_rn = F2FS_NODE(src);
struct f2fs_node *dst_rn = F2FS_NODE(dst);
memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
}
-static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
+static inline void fill_node_footer_blkaddr(struct folio *folio, block_t blkaddr)
{
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio));
+ struct f2fs_node *rn = F2FS_NODE(folio);
__u64 cp_ver = cur_cp_version(ckpt);
if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
@@ -306,19 +314,19 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
}
-static inline bool is_recoverable_dnode(struct page *page)
+static inline bool is_recoverable_dnode(const struct folio *folio)
{
- struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
+ struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_F_SB(folio));
__u64 cp_ver = cur_cp_version(ckpt);
/* Don't care crc part, if fsck.f2fs sets it. */
if (__is_set_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG))
- return (cp_ver << 32) == (cpver_of_node(page) << 32);
+ return (cp_ver << 32) == (cpver_of_node(folio) << 32);
if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG))
cp_ver |= (cur_cp_crc(ckpt) << 32);
- return cp_ver == cpver_of_node(page);
+ return cp_ver == cpver_of_node(folio);
}
/*
@@ -342,9 +350,9 @@ static inline bool is_recoverable_dnode(struct page *page)
* `- indirect node ((6 + 2N) + (N - 1)(N + 1))
* `- direct node
*/
-static inline bool IS_DNODE(struct page *node_page)
+static inline bool IS_DNODE(const struct folio *node_folio)
{
- unsigned int ofs = ofs_of_node(node_page);
+ unsigned int ofs = ofs_of_node(node_folio);
if (f2fs_has_xattr_block(ofs))
return true;
@@ -360,22 +368,22 @@ static inline bool IS_DNODE(struct page *node_page)
return true;
}
-static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
+static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i)
{
- struct f2fs_node *rn = F2FS_NODE(p);
+ struct f2fs_node *rn = F2FS_NODE(folio);
- f2fs_wait_on_page_writeback(p, NODE, true, true);
+ f2fs_folio_wait_writeback(folio, NODE, true, true);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
else
rn->in.nid[off] = cpu_to_le32(nid);
- return set_page_dirty(p);
+ return folio_mark_dirty(folio);
}
-static inline nid_t get_nid(struct page *p, int off, bool i)
+static inline nid_t get_nid(const struct folio *folio, int off, bool i)
{
- struct f2fs_node *rn = F2FS_NODE(p);
+ struct f2fs_node *rn = F2FS_NODE(folio);
if (i)
return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
@@ -389,19 +397,19 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
* - Mark cold data pages in page cache
*/
-static inline int is_node(struct page *page, int type)
+static inline int is_node(const struct folio *folio, int type)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
return le32_to_cpu(rn->footer.flag) & BIT(type);
}
-#define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
-#define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT)
-#define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT)
+#define is_cold_node(folio) is_node(folio, COLD_BIT_SHIFT)
+#define is_fsync_dnode(folio) is_node(folio, FSYNC_BIT_SHIFT)
+#define is_dent_dnode(folio) is_node(folio, DENT_BIT_SHIFT)
-static inline void set_cold_node(struct page *page, bool is_dir)
+static inline void set_cold_node(const struct folio *folio, bool is_dir)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (is_dir)
@@ -411,9 +419,9 @@ static inline void set_cold_node(struct page *page, bool is_dir)
rn->footer.flag = cpu_to_le32(flag);
}
-static inline void set_mark(struct page *page, int mark, int type)
+static inline void set_mark(struct folio *folio, int mark, int type)
{
- struct f2fs_node *rn = F2FS_NODE(page);
+ struct f2fs_node *rn = F2FS_NODE(folio);
unsigned int flag = le32_to_cpu(rn->footer.flag);
if (mark)
flag |= BIT(type);
@@ -422,8 +430,8 @@ static inline void set_mark(struct page *page, int mark, int type)
rn->footer.flag = cpu_to_le32(flag);
#ifdef CONFIG_F2FS_CHECK_FS
- f2fs_inode_chksum_set(F2FS_P_SB(page), page);
+ f2fs_inode_chksum_set(F2FS_F_SB(folio), folio);
#endif
}
-#define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT)
-#define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
+#define set_dentry_mark(folio, mark) set_mark(folio, mark, DENT_BIT_SHIFT)
+#define set_fsync_mark(folio, mark) set_mark(folio, mark, FSYNC_BIT_SHIFT)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 69a2027e3ebc..4cb3a91801b4 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -157,15 +157,15 @@ static int init_recovered_filename(const struct inode *dir,
return 0;
}
-static int recover_dentry(struct inode *inode, struct page *ipage,
+static int recover_dentry(struct inode *inode, struct folio *ifolio,
struct list_head *dir_list)
{
- struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
+ struct f2fs_inode *raw_inode = F2FS_INODE(ifolio);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
struct f2fs_filename fname;
struct qstr usr_fname;
- struct page *page;
+ struct folio *folio;
struct inode *dir, *einode;
struct fsync_inode_entry *entry;
int err = 0;
@@ -187,7 +187,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage,
if (err)
goto out;
retry:
- de = __f2fs_find_entry(dir, &fname, &page);
+ de = __f2fs_find_entry(dir, &fname, &folio);
if (de && inode->i_ino == le32_to_cpu(de->ino))
goto out_put;
@@ -212,11 +212,11 @@ retry:
iput(einode);
goto out_put;
}
- f2fs_delete_entry(de, page, dir, einode);
+ f2fs_delete_entry(de, folio, dir, einode);
iput(einode);
goto retry;
- } else if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ } else if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
} else {
err = f2fs_add_dentry(dir, &fname, inode,
inode->i_ino, inode->i_mode);
@@ -226,21 +226,21 @@ retry:
goto out;
out_put:
- f2fs_put_page(page, 0);
+ f2fs_folio_put(folio, false);
out:
if (file_enc_name(inode))
name = "<encrypted>";
else
name = raw_inode->i_name;
f2fs_notice(F2FS_I_SB(inode), "%s: ino = %x, name = %s, dir = %lx, err = %d",
- __func__, ino_of_node(ipage), name,
+ __func__, ino_of_node(ifolio), name,
IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
-static int recover_quota_data(struct inode *inode, struct page *page)
+static int recover_quota_data(struct inode *inode, struct folio *folio)
{
- struct f2fs_inode *raw = F2FS_INODE(page);
+ struct f2fs_inode *raw = F2FS_INODE(folio);
struct iattr attr;
uid_t i_uid = le32_to_cpu(raw->i_uid);
gid_t i_gid = le32_to_cpu(raw->i_gid);
@@ -277,16 +277,16 @@ static void recover_inline_flags(struct inode *inode, struct f2fs_inode *ri)
clear_inode_flag(inode, FI_DATA_EXIST);
}
-static int recover_inode(struct inode *inode, struct page *page)
+static int recover_inode(struct inode *inode, struct folio *folio)
{
- struct f2fs_inode *raw = F2FS_INODE(page);
+ struct f2fs_inode *raw = F2FS_INODE(folio);
struct f2fs_inode_info *fi = F2FS_I(inode);
char *name;
int err;
inode->i_mode = le16_to_cpu(raw->i_mode);
- err = recover_quota_data(inode, page);
+ err = recover_quota_data(inode, folio);
if (err)
return err;
@@ -333,10 +333,10 @@ static int recover_inode(struct inode *inode, struct page *page)
if (file_enc_name(inode))
name = "<encrypted>";
else
- name = F2FS_INODE(page)->i_name;
+ name = F2FS_INODE(folio)->i_name;
f2fs_notice(F2FS_I_SB(inode), "recover_inode: ino = %x, name = %s, inline = %x",
- ino_of_node(page), name, raw->i_inline);
+ ino_of_node(folio), name, raw->i_inline);
return 0;
}
@@ -358,33 +358,34 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr,
block_t *blkaddr_fast, bool *is_detecting)
{
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
- struct page *page = NULL;
int i;
if (!*is_detecting)
return 0;
for (i = 0; i < 2; i++) {
+ struct folio *folio;
+
if (!f2fs_is_valid_blkaddr(sbi, *blkaddr_fast, META_POR)) {
*is_detecting = false;
return 0;
}
- page = f2fs_get_tmp_page(sbi, *blkaddr_fast);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = f2fs_get_tmp_folio(sbi, *blkaddr_fast);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- if (!is_recoverable_dnode(page)) {
- f2fs_put_page(page, 1);
+ if (!is_recoverable_dnode(folio)) {
+ f2fs_folio_put(folio, true);
*is_detecting = false;
return 0;
}
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, *blkaddr_fast,
- next_blkaddr_of_node(page));
+ next_blkaddr_of_node(folio));
- *blkaddr_fast = next_blkaddr_of_node(page);
- f2fs_put_page(page, 1);
+ *blkaddr_fast = next_blkaddr_of_node(folio);
+ f2fs_folio_put(folio, true);
f2fs_ra_meta_pages_cond(sbi, *blkaddr_fast, ra_blocks);
}
@@ -401,7 +402,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
bool check_only)
{
struct curseg_info *curseg;
- struct page *page = NULL;
block_t blkaddr, blkaddr_fast;
bool is_detecting = true;
int err = 0;
@@ -413,33 +413,35 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
while (1) {
struct fsync_inode_entry *entry;
+ struct folio *folio;
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
- page = f2fs_get_tmp_page(sbi, blkaddr);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_get_tmp_folio(sbi, blkaddr);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
break;
}
- if (!is_recoverable_dnode(page)) {
- f2fs_put_page(page, 1);
+ if (!is_recoverable_dnode(folio)) {
+ f2fs_folio_put(folio, true);
break;
}
- if (!is_fsync_dnode(page))
+ if (!is_fsync_dnode(folio))
goto next;
- entry = get_fsync_inode(head, ino_of_node(page));
+ entry = get_fsync_inode(head, ino_of_node(folio));
if (!entry) {
bool quota_inode = false;
if (!check_only &&
- IS_INODE(page) && is_dent_dnode(page)) {
- err = f2fs_recover_inode_page(sbi, page);
+ IS_INODE(folio) &&
+ is_dent_dnode(folio)) {
+ err = f2fs_recover_inode_page(sbi, folio);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
break;
}
quota_inode = true;
@@ -449,24 +451,24 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
* CP | dnode(F) | inode(DF)
* For this case, we should not give up now.
*/
- entry = add_fsync_inode(sbi, head, ino_of_node(page),
+ entry = add_fsync_inode(sbi, head, ino_of_node(folio),
quota_inode);
if (IS_ERR(entry)) {
err = PTR_ERR(entry);
if (err == -ENOENT)
goto next;
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
break;
}
}
entry->blkaddr = blkaddr;
- if (IS_INODE(page) && is_dent_dnode(page))
+ if (IS_INODE(folio) && is_dent_dnode(folio))
entry->last_dentry = blkaddr;
next:
/* check next segment */
- blkaddr = next_blkaddr_of_node(page);
- f2fs_put_page(page, 1);
+ blkaddr = next_blkaddr_of_node(folio);
+ f2fs_folio_put(folio, true);
err = sanity_check_node_chain(sbi, blkaddr, &blkaddr_fast,
&is_detecting);
@@ -492,7 +494,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
struct f2fs_summary_block *sum_node;
struct f2fs_summary sum;
- struct page *sum_page, *node_page;
+ struct folio *sum_folio, *node_folio;
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
@@ -514,18 +516,18 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
}
}
- sum_page = f2fs_get_sum_page(sbi, segno);
- if (IS_ERR(sum_page))
- return PTR_ERR(sum_page);
- sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ sum_folio = f2fs_get_sum_folio(sbi, segno);
+ if (IS_ERR(sum_folio))
+ return PTR_ERR(sum_folio);
+ sum_node = folio_address(sum_folio);
sum = sum_node->entries[blkoff];
- f2fs_put_page(sum_page, 1);
+ f2fs_folio_put(sum_folio, true);
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
ofs_in_node = le16_to_cpu(sum.ofs_in_node);
- max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode);
+ max_addrs = ADDRS_PER_PAGE(dn->node_folio, dn->inode);
if (ofs_in_node >= max_addrs) {
f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
ofs_in_node, dn->inode->i_ino, nid, max_addrs);
@@ -535,9 +537,9 @@ got_it:
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
- if (!dn->inode_page_locked)
- lock_page(dn->inode_page);
- tdn.node_page = dn->inode_page;
+ if (!dn->inode_folio_locked)
+ folio_lock(dn->inode_folio);
+ tdn.node_folio = dn->inode_folio;
tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
} else if (dn->nid == nid) {
@@ -546,13 +548,13 @@ got_it:
}
/* Get the node page */
- node_page = f2fs_get_node_page(sbi, nid);
- if (IS_ERR(node_page))
- return PTR_ERR(node_page);
+ node_folio = f2fs_get_node_folio(sbi, nid);
+ if (IS_ERR(node_folio))
+ return PTR_ERR(node_folio);
- offset = ofs_of_node(node_page);
- ino = ino_of_node(node_page);
- f2fs_put_page(node_page, 1);
+ offset = ofs_of_node(node_folio);
+ ino = ino_of_node(node_folio);
+ f2fs_folio_put(node_folio, true);
if (ino != dn->inode->i_ino) {
int ret;
@@ -578,8 +580,8 @@ got_it:
* if inode page is locked, unlock temporarily, but its reference
* count keeps alive.
*/
- if (ino == dn->inode->i_ino && dn->inode_page_locked)
- unlock_page(dn->inode_page);
+ if (ino == dn->inode->i_ino && dn->inode_folio_locked)
+ folio_unlock(dn->inode_folio);
set_new_dnode(&tdn, inode, NULL, NULL, 0);
if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
@@ -592,15 +594,15 @@ got_it:
out:
if (ino != dn->inode->i_ino)
iput(inode);
- else if (dn->inode_page_locked)
- lock_page(dn->inode_page);
+ else if (dn->inode_folio_locked)
+ folio_lock(dn->inode_folio);
return 0;
truncate_out:
if (f2fs_data_blkaddr(&tdn) == blkaddr)
f2fs_truncate_data_blocks_range(&tdn, 1);
- if (dn->inode->i_ino == nid && !dn->inode_page_locked)
- unlock_page(dn->inode_page);
+ if (dn->inode->i_ino == nid && !dn->inode_folio_locked)
+ folio_unlock(dn->inode_folio);
return 0;
}
@@ -618,27 +620,27 @@ static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn)
}
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
- struct page *page)
+ struct folio *folio)
{
struct dnode_of_data dn;
struct node_info ni;
- unsigned int start, end;
+ unsigned int start = 0, end = 0, index;
int err = 0, recovered = 0;
/* step 1: recover xattr */
- if (IS_INODE(page)) {
- err = f2fs_recover_inline_xattr(inode, page);
+ if (IS_INODE(folio)) {
+ err = f2fs_recover_inline_xattr(inode, folio);
if (err)
goto out;
- } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
- err = f2fs_recover_xattr_data(inode, page);
+ } else if (f2fs_has_xattr_block(ofs_of_node(folio))) {
+ err = f2fs_recover_xattr_data(inode, folio);
if (!err)
recovered++;
goto out;
}
/* step 2: recover inline data */
- err = f2fs_recover_inline_data(inode, page);
+ err = f2fs_recover_inline_data(inode, folio);
if (err) {
if (err == 1)
err = 0;
@@ -646,8 +648,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
}
/* step 3: recover data indices */
- start = f2fs_start_bidx_of_node(ofs_of_node(page), inode);
- end = start + ADDRS_PER_PAGE(page, inode);
+ start = f2fs_start_bidx_of_node(ofs_of_node(folio), inode);
+ end = start + ADDRS_PER_PAGE(folio, inode);
set_new_dnode(&dn, inode, NULL, NULL, 0);
retry_dn:
@@ -660,28 +662,28 @@ retry_dn:
goto out;
}
- f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true);
+ f2fs_folio_wait_writeback(dn.node_folio, NODE, true, true);
err = f2fs_get_node_info(sbi, dn.nid, &ni, false);
if (err)
goto err;
- f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
+ f2fs_bug_on(sbi, ni.ino != ino_of_node(folio));
- if (ofs_of_node(dn.node_page) != ofs_of_node(page)) {
+ if (ofs_of_node(dn.node_folio) != ofs_of_node(folio)) {
f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u",
- inode->i_ino, ofs_of_node(dn.node_page),
- ofs_of_node(page));
+ inode->i_ino, ofs_of_node(dn.node_folio),
+ ofs_of_node(folio));
err = -EFSCORRUPTED;
f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER);
goto err;
}
- for (; start < end; start++, dn.ofs_in_node++) {
+ for (index = start; index < end; index++, dn.ofs_in_node++) {
block_t src, dest;
src = f2fs_data_blkaddr(&dn);
- dest = data_blkaddr(dn.inode, page, dn.ofs_in_node);
+ dest = data_blkaddr(dn.inode, folio, dn.ofs_in_node);
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
@@ -706,9 +708,9 @@ retry_dn:
}
if (!file_keep_isize(inode) &&
- (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT)))
+ (i_size_read(inode) <= ((loff_t)index << PAGE_SHIFT)))
f2fs_i_size_write(inode,
- (loff_t)(start + 1) << PAGE_SHIFT);
+ (loff_t)(index + 1) << PAGE_SHIFT);
/*
* dest is reserved block, invalidate src block
@@ -756,16 +758,18 @@ retry_prev:
}
}
- copy_node_footer(dn.node_page, page);
- fill_node_footer(dn.node_page, dn.nid, ni.ino,
- ofs_of_node(page), false);
- set_page_dirty(dn.node_page);
+ copy_node_footer(dn.node_folio, folio);
+ fill_node_footer(dn.node_folio, dn.nid, ni.ino,
+ ofs_of_node(folio), false);
+ folio_mark_dirty(dn.node_folio);
err:
f2fs_put_dnode(&dn);
out:
- f2fs_notice(sbi, "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
- inode->i_ino, file_keep_isize(inode) ? "keep" : "recover",
- recovered, err);
+ f2fs_notice(sbi, "recover_data: ino = %lx, nid = %x (i_size: %s), "
+ "range (%u, %u), recovered = %d, err = %d",
+ inode->i_ino, nid_of_node(folio),
+ file_keep_isize(inode) ? "keep" : "recover",
+ start, end, recovered, err);
return err;
}
@@ -773,10 +777,17 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
struct list_head *tmp_inode_list, struct list_head *dir_list)
{
struct curseg_info *curseg;
- struct page *page = NULL;
int err = 0;
block_t blkaddr;
unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS;
+ unsigned int recoverable_dnode = 0;
+ unsigned int fsynced_dnode = 0;
+ unsigned int total_dnode = 0;
+ unsigned int recovered_inode = 0;
+ unsigned int recovered_dentry = 0;
+ unsigned int recovered_dnode = 0;
+
+ f2fs_notice(sbi, "do_recover_data: start to recover dnode");
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -784,63 +795,75 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
while (1) {
struct fsync_inode_entry *entry;
+ struct folio *folio;
if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR))
break;
- page = f2fs_get_tmp_page(sbi, blkaddr);
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ folio = f2fs_get_tmp_folio(sbi, blkaddr);
+ if (IS_ERR(folio)) {
+ err = PTR_ERR(folio);
break;
}
- if (!is_recoverable_dnode(page)) {
- f2fs_put_page(page, 1);
+ if (!is_recoverable_dnode(folio)) {
+ f2fs_folio_put(folio, true);
break;
}
+ recoverable_dnode++;
- entry = get_fsync_inode(inode_list, ino_of_node(page));
+ entry = get_fsync_inode(inode_list, ino_of_node(folio));
if (!entry)
goto next;
+ fsynced_dnode++;
/*
* inode(x) | CP | inode(x) | dnode(F)
* In this case, we can lose the latest inode(x).
* So, call recover_inode for the inode update.
*/
- if (IS_INODE(page)) {
- err = recover_inode(entry->inode, page);
+ if (IS_INODE(folio)) {
+ err = recover_inode(entry->inode, folio);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
break;
}
+ recovered_inode++;
}
if (entry->last_dentry == blkaddr) {
- err = recover_dentry(entry->inode, page, dir_list);
+ err = recover_dentry(entry->inode, folio, dir_list);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
break;
}
+ recovered_dentry++;
}
- err = do_recover_data(sbi, entry->inode, page);
+ err = do_recover_data(sbi, entry->inode, folio);
if (err) {
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
break;
}
+ recovered_dnode++;
if (entry->blkaddr == blkaddr)
list_move_tail(&entry->list, tmp_inode_list);
next:
ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr,
- next_blkaddr_of_node(page));
+ next_blkaddr_of_node(folio));
/* check next segment */
- blkaddr = next_blkaddr_of_node(page);
- f2fs_put_page(page, 1);
+ blkaddr = next_blkaddr_of_node(folio);
+ f2fs_folio_put(folio, true);
f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
+ total_dnode++;
}
if (!err)
err = f2fs_allocate_new_segments(sbi);
+
+ f2fs_notice(sbi, "do_recover_data: dnode: (recoverable: %u, fsynced: %u, "
+ "total: %u), recovered: (inode: %u, dentry: %u, dnode: %u), err: %d",
+ recoverable_dnode, fsynced_dnode, total_dnode, recovered_inode,
+ recovered_dentry, recovered_dnode, err);
return err;
}
@@ -853,6 +876,9 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
unsigned long s_flags = sbi->sb->s_flags;
bool need_writecp = false;
+ f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, "
+ "check_only: %d", check_only);
+
if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
f2fs_info(sbi, "recover fsync data on readonly fs");
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index c282e8a0a2ec..cc82d42ef14c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
goto next;
}
- blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode),
+ blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_folio, cow_inode),
len);
index = off;
for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) {
@@ -371,12 +371,21 @@ next:
}
out:
+ if (time_to_inject(sbi, FAULT_TIMEOUT))
+ f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT);
+
if (ret) {
sbi->revoked_atomic_block += fi->atomic_write_cnt;
} else {
sbi->committed_atomic_block += fi->atomic_write_cnt;
set_inode_flag(inode, FI_ATOMIC_COMMITTED);
+
+ /*
+ * inode may has no FI_ATOMIC_DIRTIED flag due to no write
+ * before commit.
+ */
if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) {
+ /* clear atomic dirty status and set vfs dirty status */
clear_inode_flag(inode, FI_ATOMIC_DIRTIED);
f2fs_mark_inode_dirty_sync(inode, true);
}
@@ -424,7 +433,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
if (need && excess_cached_nats(sbi))
f2fs_balance_fs_bg(sbi, false);
- if (!f2fs_is_checkpoint_ready(sbi))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
return;
/*
@@ -446,7 +455,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
} else {
struct f2fs_gc_control gc_control = {
.victim_segno = NULL_SEGNO,
- .init_gc_type = BG_GC,
+ .init_gc_type = f2fs_sb_has_blkzoned(sbi) ?
+ FG_GC : BG_GC,
.no_bg_gc = true,
.should_migrate_blocks = false,
.err_gc_skipped = false,
@@ -763,7 +773,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
/* need not be added */
- if (IS_CURSEG(sbi, segno))
+ if (is_curseg(sbi, segno))
return;
if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
@@ -790,7 +800,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
!valid_blocks) ||
valid_blocks == CAP_BLKS_PER_SEC(sbi));
- if (!IS_CURSEC(sbi, secno))
+ if (!is_cursec(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
}
}
@@ -829,7 +839,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
return;
}
- if (!IS_CURSEC(sbi, secno))
+ if (!is_cursec(sbi, secno))
set_bit(secno, dirty_i->dirty_secmap);
}
}
@@ -846,7 +856,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned short valid_blocks, ckpt_valid_blocks;
unsigned int usable_blocks;
- if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
+ if (segno == NULL_SEGNO || is_curseg(sbi, segno))
return;
usable_blocks = f2fs_usable_blks_in_seg(sbi, segno);
@@ -879,7 +889,7 @@ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi)
for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) {
if (get_valid_blocks(sbi, segno, false))
continue;
- if (IS_CURSEG(sbi, segno))
+ if (is_curseg(sbi, segno))
continue;
__locate_dirty_segment(sbi, segno, PRE);
__remove_dirty_segment(sbi, segno, DIRTY);
@@ -2096,7 +2106,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
return false;
if (!force) {
- if (!f2fs_realtime_discard_enable(sbi) || !se->valid_blocks ||
+ if (!f2fs_realtime_discard_enable(sbi) ||
+ (!se->valid_blocks &&
+ !is_curseg(sbi, cpc->trim_start)) ||
SM_I(sbi)->dcc_info->nr_discards >=
SM_I(sbi)->dcc_info->max_discards)
return false;
@@ -2224,7 +2236,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
next:
secno = GET_SEC_FROM_SEG(sbi, start);
start_segno = GET_SEG_FROM_SEC(sbi, secno);
- if (!IS_CURSEC(sbi, secno) &&
+ if (!is_cursec(sbi, secno) &&
!get_valid_blocks(sbi, start, true))
f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
BLKS_PER_SEC(sbi));
@@ -2320,10 +2332,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
- if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
+ if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT ||
+ F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
dcc->discard_granularity = BLKS_PER_SEG(sbi);
- else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
- dcc->discard_granularity = BLKS_PER_SEC(sbi);
INIT_LIST_HEAD(&dcc->entry_list);
for (i = 0; i < MAX_PLIST_NUM; i++)
@@ -2437,7 +2448,7 @@ static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr,
* that the consecutive input blocks belong to the same segment.
*/
static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se,
- block_t blkaddr, unsigned int offset, int del)
+ unsigned int segno, block_t blkaddr, unsigned int offset, int del)
{
bool exist;
#ifdef CONFIG_F2FS_CHECK_FS
@@ -2482,15 +2493,21 @@ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_ent
f2fs_test_and_clear_bit(offset + i, se->discard_map))
sbi->discard_blks++;
- if (!f2fs_test_bit(offset + i, se->ckpt_valid_map))
+ if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) {
se->ckpt_valid_blocks -= 1;
+ if (__is_large_section(sbi))
+ get_sec_entry(sbi, segno)->ckpt_valid_blocks -= 1;
+ }
}
+ if (__is_large_section(sbi))
+ sanity_check_valid_blocks(sbi, segno);
+
return del;
}
static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se,
- block_t blkaddr, unsigned int offset, int del)
+ unsigned int segno, block_t blkaddr, unsigned int offset, int del)
{
bool exist;
#ifdef CONFIG_F2FS_CHECK_FS
@@ -2523,12 +2540,21 @@ static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry
* or newly invalidated.
*/
if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
- if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map))
+ if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) {
se->ckpt_valid_blocks++;
+ if (__is_large_section(sbi))
+ get_sec_entry(sbi, segno)->ckpt_valid_blocks++;
+ }
}
- if (!f2fs_test_bit(offset, se->ckpt_valid_map))
+ if (!f2fs_test_bit(offset, se->ckpt_valid_map)) {
se->ckpt_valid_blocks += del;
+ if (__is_large_section(sbi))
+ get_sec_entry(sbi, segno)->ckpt_valid_blocks += del;
+ }
+
+ if (__is_large_section(sbi))
+ sanity_check_valid_blocks(sbi, segno);
return del;
}
@@ -2559,9 +2585,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
/* Update valid block bitmap */
if (del > 0) {
- del = update_sit_entry_for_alloc(sbi, se, blkaddr, offset, del);
+ del = update_sit_entry_for_alloc(sbi, se, segno, blkaddr, offset, del);
} else {
- del = update_sit_entry_for_release(sbi, se, blkaddr, offset, del);
+ del = update_sit_entry_for_release(sbi, se, segno, blkaddr, offset, del);
}
__mark_sit_entry_dirty(sbi, segno);
@@ -2674,23 +2700,23 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
}
/*
- * Caller should put this summary page
+ * Caller should put this summary folio
*/
-struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
+struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno)
{
if (unlikely(f2fs_cp_error(sbi)))
return ERR_PTR(-EIO);
- return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno));
+ return f2fs_get_meta_folio_retry(sbi, GET_SUM_BLOCK(sbi, segno));
}
void f2fs_update_meta_page(struct f2fs_sb_info *sbi,
void *src, block_t blk_addr)
{
- struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
+ struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);
- memcpy(page_address(page), src, PAGE_SIZE);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ memcpy(folio_address(folio), src, PAGE_SIZE);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
}
static void write_sum_page(struct f2fs_sb_info *sbi,
@@ -2703,11 +2729,11 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
int type, block_t blk_addr)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
- struct page *page = f2fs_grab_meta_page(sbi, blk_addr);
+ struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr);
struct f2fs_summary_block *src = curseg->sum_blk;
struct f2fs_summary_block *dst;
- dst = (struct f2fs_summary_block *)page_address(page);
+ dst = folio_address(folio);
memset(dst, 0, PAGE_SIZE);
mutex_lock(&curseg->curseg_mutex);
@@ -2721,8 +2747,8 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi,
mutex_unlock(&curseg->curseg_mutex);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
}
static int is_next_segment_free(struct f2fs_sb_info *sbi,
@@ -2776,7 +2802,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi,
if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning)
segno = 0;
else
- segno = max(sbi->first_zoned_segno, *newseg);
+ segno = max(sbi->first_seq_zone_segno, *newseg);
hint = GET_SEC_FROM_SEG(sbi, segno);
}
#endif
@@ -2788,7 +2814,7 @@ find_other_zone:
if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) {
/* Write only to sequential zones */
if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) {
- hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno);
+ hint = GET_SEC_FROM_SEG(sbi, sbi->first_seq_zone_segno);
secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
} else
secno = find_first_zero_bit(free_i->free_secmap,
@@ -2806,7 +2832,7 @@ find_other_zone:
MAIN_SECS(sbi));
if (secno >= MAIN_SECS(sbi)) {
ret = -ENOSPC;
- f2fs_bug_on(sbi, 1);
+ f2fs_bug_on(sbi, !pinning);
goto out_unlock;
}
}
@@ -2835,11 +2861,15 @@ find_other_zone:
}
got_it:
/* set it as dirty segment in free segmap */
- f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
+ if (test_bit(segno, free_i->free_segmap)) {
+ ret = -EFSCORRUPTED;
+ f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_FREE_BITMAP);
+ goto out_unlock;
+ }
- /* no free section in conventional zone */
+ /* no free section in conventional device or conventional zone */
if (new_sec && pinning &&
- !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) {
+ f2fs_is_sequential_zone_area(sbi, START_BLOCK(sbi, segno))) {
ret = -EAGAIN;
goto out_unlock;
}
@@ -2848,7 +2878,7 @@ got_it:
out_unlock:
spin_unlock(&free_i->segmap_lock);
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC && !pinning)
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT);
return ret;
}
@@ -2921,6 +2951,13 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
return curseg->segno;
}
+static void reset_curseg_fields(struct curseg_info *curseg)
+{
+ curseg->inited = false;
+ curseg->segno = NULL_SEGNO;
+ curseg->next_segno = 0;
+}
+
/*
* Allocate a current working segment.
* This function always allocates a free segment in LFS manner.
@@ -2939,7 +2976,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
ret = get_new_segment(sbi, &segno, new_sec, pinning);
if (ret) {
if (ret == -ENOSPC)
- curseg->segno = NULL_SEGNO;
+ reset_curseg_fields(curseg);
return ret;
}
@@ -2989,7 +3026,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned int new_segno = curseg->next_segno;
struct f2fs_summary_block *sum_node;
- struct page *sum_page;
+ struct folio *sum_folio;
if (curseg->inited)
write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno));
@@ -3005,15 +3042,15 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type)
curseg->alloc_type = SSR;
curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0);
- sum_page = f2fs_get_sum_page(sbi, new_segno);
- if (IS_ERR(sum_page)) {
+ sum_folio = f2fs_get_sum_folio(sbi, new_segno);
+ if (IS_ERR(sum_folio)) {
/* GC won't be able to use stale summary pages by cp_error */
memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
- return PTR_ERR(sum_page);
+ return PTR_ERR(sum_folio);
}
- sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ sum_node = folio_address(sum_folio);
memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
- f2fs_put_page(sum_page, 1);
+ f2fs_folio_put(sum_folio, true);
return 0;
}
@@ -3303,7 +3340,7 @@ retry:
if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) {
f2fs_down_write(&sbi->gc_lock);
- err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk),
+ err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1,
true, ZONED_PIN_SEC_REQUIRED_COUNT);
f2fs_up_write(&sbi->gc_lock);
@@ -3576,14 +3613,14 @@ static int __get_segment_type_2(struct f2fs_io_info *fio)
static int __get_segment_type_4(struct f2fs_io_info *fio)
{
if (fio->type == DATA) {
- struct inode *inode = fio->page->mapping->host;
+ struct inode *inode = fio_inode(fio);
if (S_ISDIR(inode->i_mode))
return CURSEG_HOT_DATA;
else
return CURSEG_COLD_DATA;
} else {
- if (IS_DNODE(fio->page) && is_cold_node(fio->page))
+ if (IS_DNODE(fio->folio) && is_cold_node(fio->folio))
return CURSEG_WARM_NODE;
else
return CURSEG_COLD_NODE;
@@ -3610,7 +3647,7 @@ static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs)
static int __get_segment_type_6(struct f2fs_io_info *fio)
{
if (fio->type == DATA) {
- struct inode *inode = fio->page->mapping->host;
+ struct inode *inode = fio_inode(fio);
int type;
if (is_inode_flag_set(inode, FI_ALIGNED_WRITE))
@@ -3629,8 +3666,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
if (file_is_cold(inode) || f2fs_need_compress_data(inode))
return CURSEG_COLD_DATA;
- type = __get_age_segment_type(inode,
- page_folio(fio->page)->index);
+ type = __get_age_segment_type(inode, fio->folio->index);
if (type != NO_CHECK_TYPE)
return type;
@@ -3641,8 +3677,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode),
inode->i_write_hint);
} else {
- if (IS_DNODE(fio->page))
- return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
+ if (IS_DNODE(fio->folio))
+ return is_cold_node(fio->folio) ? CURSEG_WARM_NODE :
CURSEG_HOT_NODE;
return CURSEG_COLD_NODE;
}
@@ -3710,14 +3746,7 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
get_random_u32_inclusive(1, sbi->max_fragment_hole);
}
-static void reset_curseg_fields(struct curseg_info *curseg)
-{
- curseg->inited = false;
- curseg->segno = NULL_SEGNO;
- curseg->next_segno = 0;
-}
-
-int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio)
@@ -3821,10 +3850,10 @@ skip_new_segment:
up_write(&sit_i->sentry_lock);
- if (page && IS_NODESEG(curseg->seg_type)) {
- fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
+ if (folio && IS_NODESEG(curseg->seg_type)) {
+ fill_node_footer_blkaddr(folio, NEXT_FREE_BLKADDR(sbi, curseg));
- f2fs_inode_chksum_set(sbi, page);
+ f2fs_inode_chksum_set(sbi, folio);
}
if (fio) {
@@ -3902,6 +3931,7 @@ static int log_type_to_seg_type(enum log_type type)
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
+ struct folio *folio = fio->folio;
enum log_type type = __get_segment_type(fio);
int seg_type = log_type_to_seg_type(type);
bool keep_order = (f2fs_lfs_mode(fio->sbi) &&
@@ -3910,15 +3940,21 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
- if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+ if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio)) {
- if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
+ if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
- end_page_writeback(fio->page);
- if (f2fs_in_warm_node_list(fio->sbi, fio->page))
- f2fs_del_fsync_node_entry(fio->sbi, fio->page);
+ folio_end_writeback(folio);
+ if (f2fs_in_warm_node_list(fio->sbi, folio))
+ f2fs_del_fsync_node_entry(fio->sbi, folio);
+ f2fs_bug_on(fio->sbi, !is_set_ckpt_flags(fio->sbi,
+ CP_ERROR_FLAG));
goto out;
}
+
+ f2fs_bug_on(fio->sbi, !f2fs_is_valid_blkaddr_raw(fio->sbi,
+ fio->new_blkaddr, DATA_GENERIC_ENHANCE));
+
if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1);
@@ -3942,7 +3978,7 @@ void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio,
.op_flags = REQ_SYNC | REQ_META | REQ_PRIO,
.old_blkaddr = folio->index,
.new_blkaddr = folio->index,
- .page = folio_page(folio, 0),
+ .folio = folio,
.encrypted_page = NULL,
.in_list = 0,
};
@@ -4021,7 +4057,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
if (!err) {
f2fs_update_device_state(fio->sbi, fio->ino,
fio->new_blkaddr, 1);
- f2fs_update_iostat(fio->sbi, fio->page->mapping->host,
+ f2fs_update_iostat(fio->sbi, fio_inode(fio),
fio->io_type, F2FS_BLKSIZE);
}
@@ -4070,14 +4106,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
if (!recover_curseg) {
/* for recovery flow */
- if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
+ if (se->valid_blocks == 0 && !is_curseg(sbi, segno)) {
if (old_blkaddr == NULL_ADDR)
type = CURSEG_COLD_DATA;
else
type = CURSEG_WARM_DATA;
}
} else {
- if (IS_CURSEG(sbi, segno)) {
+ if (is_curseg(sbi, segno)) {
/* se->type is volatile as SSR allocation */
type = __f2fs_get_curseg(sbi, segno);
f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
@@ -4154,22 +4190,21 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
f2fs_update_data_blkaddr(dn, new_addr);
}
-void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool ordered, bool locked)
+void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type,
+ bool ordered, bool locked)
{
- if (folio_test_writeback(page_folio(page))) {
- struct f2fs_sb_info *sbi = F2FS_P_SB(page);
+ if (folio_test_writeback(folio)) {
+ struct f2fs_sb_info *sbi = F2FS_F_SB(folio);
/* submit cached LFS IO */
- f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type);
+ f2fs_submit_merged_write_cond(sbi, NULL, folio, 0, type);
/* submit cached IPU IO */
- f2fs_submit_merged_ipu_write(sbi, NULL, page);
+ f2fs_submit_merged_ipu_write(sbi, NULL, folio);
if (ordered) {
- wait_on_page_writeback(page);
- f2fs_bug_on(sbi, locked &&
- folio_test_writeback(page_folio(page)));
+ folio_wait_writeback(folio);
+ f2fs_bug_on(sbi, locked && folio_test_writeback(folio));
} else {
- wait_for_stable_page(page);
+ folio_wait_stable(folio);
}
}
}
@@ -4177,7 +4212,7 @@ void f2fs_wait_on_page_writeback(struct page *page,
void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct page *cpage;
+ struct folio *cfolio;
if (!f2fs_meta_inode_gc_required(inode))
return;
@@ -4185,10 +4220,10 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr)
if (!__is_valid_data_blkaddr(blkaddr))
return;
- cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
- if (cpage) {
- f2fs_wait_on_page_writeback(cpage, DATA, true, true);
- f2fs_put_page(cpage, 1);
+ cfolio = filemap_lock_folio(META_MAPPING(sbi), blkaddr);
+ if (!IS_ERR(cfolio)) {
+ f2fs_folio_wait_writeback(cfolio, DATA, true, true);
+ f2fs_folio_put(cfolio, true);
}
}
@@ -4212,16 +4247,16 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct curseg_info *seg_i;
unsigned char *kaddr;
- struct page *page;
+ struct folio *folio;
block_t start;
int i, j, offset;
start = start_sum_block(sbi);
- page = f2fs_get_meta_page(sbi, start++);
- if (IS_ERR(page))
- return PTR_ERR(page);
- kaddr = (unsigned char *)page_address(page);
+ folio = f2fs_get_meta_folio(sbi, start++);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ kaddr = folio_address(folio);
/* Step 1: restore nat cache */
seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -4258,17 +4293,16 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
SUM_FOOTER_SIZE)
continue;
- f2fs_put_page(page, 1);
- page = NULL;
+ f2fs_folio_put(folio, true);
- page = f2fs_get_meta_page(sbi, start++);
- if (IS_ERR(page))
- return PTR_ERR(page);
- kaddr = (unsigned char *)page_address(page);
+ folio = f2fs_get_meta_folio(sbi, start++);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ kaddr = folio_address(folio);
offset = 0;
}
}
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return 0;
}
@@ -4277,7 +4311,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
struct f2fs_summary_block *sum;
struct curseg_info *curseg;
- struct page *new;
+ struct folio *new;
unsigned short blk_off;
unsigned int segno = 0;
block_t blk_addr = 0;
@@ -4304,10 +4338,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
blk_addr = GET_SUM_BLOCK(sbi, segno);
}
- new = f2fs_get_meta_page(sbi, blk_addr);
+ new = f2fs_get_meta_folio(sbi, blk_addr);
if (IS_ERR(new))
return PTR_ERR(new);
- sum = (struct f2fs_summary_block *)page_address(new);
+ sum = folio_address(new);
if (IS_NODESEG(type)) {
if (__exist_node_summaries(sbi)) {
@@ -4342,7 +4376,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
curseg->next_blkoff = blk_off;
mutex_unlock(&curseg->curseg_mutex);
out:
- f2fs_put_page(new, 1);
+ f2fs_folio_put(new, true);
return err;
}
@@ -4391,15 +4425,15 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
{
- struct page *page;
+ struct folio *folio;
unsigned char *kaddr;
struct f2fs_summary *summary;
struct curseg_info *seg_i;
int written_size = 0;
int i, j;
- page = f2fs_grab_meta_page(sbi, blkaddr++);
- kaddr = (unsigned char *)page_address(page);
+ folio = f2fs_grab_meta_folio(sbi, blkaddr++);
+ kaddr = folio_address(folio);
memset(kaddr, 0, PAGE_SIZE);
/* Step 1: write nat cache */
@@ -4416,9 +4450,9 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
seg_i = CURSEG_I(sbi, i);
for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) {
- if (!page) {
- page = f2fs_grab_meta_page(sbi, blkaddr++);
- kaddr = (unsigned char *)page_address(page);
+ if (!folio) {
+ folio = f2fs_grab_meta_folio(sbi, blkaddr++);
+ kaddr = folio_address(folio);
memset(kaddr, 0, PAGE_SIZE);
written_size = 0;
}
@@ -4430,14 +4464,14 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
SUM_FOOTER_SIZE)
continue;
- set_page_dirty(page);
- f2fs_put_page(page, 1);
- page = NULL;
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
+ folio = NULL;
}
}
- if (page) {
- set_page_dirty(page);
- f2fs_put_page(page, 1);
+ if (folio) {
+ folio_mark_dirty(folio);
+ f2fs_folio_put(folio, true);
}
}
@@ -4490,29 +4524,29 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type,
return -1;
}
-static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
+static struct folio *get_current_sit_folio(struct f2fs_sb_info *sbi,
unsigned int segno)
{
- return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno));
+ return f2fs_get_meta_folio(sbi, current_sit_addr(sbi, segno));
}
-static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
+static struct folio *get_next_sit_folio(struct f2fs_sb_info *sbi,
unsigned int start)
{
struct sit_info *sit_i = SIT_I(sbi);
- struct page *page;
+ struct folio *folio;
pgoff_t src_off, dst_off;
src_off = current_sit_addr(sbi, start);
dst_off = next_sit_addr(sbi, src_off);
- page = f2fs_grab_meta_page(sbi, dst_off);
- seg_info_to_sit_page(sbi, page, start);
+ folio = f2fs_grab_meta_folio(sbi, dst_off);
+ seg_info_to_sit_folio(sbi, folio, start);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
set_to_next_sit(sit_i, start);
- return page;
+ return folio;
}
static struct sit_entry_set *grab_sit_entry_set(void)
@@ -4642,7 +4676,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
* #2, flush sit entries to sit page.
*/
list_for_each_entry_safe(ses, tmp, head, set_list) {
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct f2fs_sit_block *raw_sit = NULL;
unsigned int start_segno = ses->start_segno;
unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
@@ -4656,8 +4690,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (to_journal) {
down_write(&curseg->journal_rwsem);
} else {
- page = get_next_sit_page(sbi, start_segno);
- raw_sit = page_address(page);
+ folio = get_next_sit_folio(sbi, start_segno);
+ raw_sit = folio_address(folio);
}
/* flush dirty sit entries in region of current sit set */
@@ -4695,6 +4729,12 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
&raw_sit->entries[sit_offset]);
}
+ /* update ckpt_valid_block */
+ if (__is_large_section(sbi)) {
+ set_ckpt_valid_blocks(sbi, segno);
+ sanity_check_valid_blocks(sbi, segno);
+ }
+
__clear_bit(segno, bitmap);
sit_i->dirty_sentries--;
ses->entry_cnt--;
@@ -4703,7 +4743,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (to_journal)
up_write(&curseg->journal_rwsem);
else
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
f2fs_bug_on(sbi, ses->entry_cnt);
release_sit_entry_set(ses);
@@ -4915,15 +4955,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
for (; start < end && start < MAIN_SEGS(sbi); start++) {
struct f2fs_sit_block *sit_blk;
- struct page *page;
+ struct folio *folio;
se = &sit_i->sentries[start];
- page = get_current_sit_page(sbi, start);
- if (IS_ERR(page))
- return PTR_ERR(page);
- sit_blk = (struct f2fs_sit_block *)page_address(page);
+ folio = get_current_sit_folio(sbi, start);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
+ sit_blk = folio_address(folio);
sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
err = check_block_count(sbi, start, &sit);
if (err)
@@ -5016,6 +5056,16 @@ init_discard_map_done:
}
up_read(&curseg->journal_rwsem);
+ /* update ckpt_valid_block */
+ if (__is_large_section(sbi)) {
+ unsigned int segno;
+
+ for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
+ set_ckpt_valid_blocks(sbi, segno);
+ sanity_check_valid_blocks(sbi, segno);
+ }
+ }
+
if (err)
return err;
@@ -5099,7 +5149,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi))
continue;
- if (IS_CURSEC(sbi, secno))
+ if (is_cursec(sbi, secno))
continue;
set_bit(secno, dirty_i->dirty_secmap);
}
@@ -5235,7 +5285,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
* Get # of valid block of the zone.
*/
valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
- if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
+ if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
zone_segno, valid_block_cnt,
blk_zone_cond_str(zone->cond));
@@ -5762,9 +5812,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
- kvfree(sit_i->sit_bitmap);
+ kfree(sit_i->sit_bitmap);
#ifdef CONFIG_F2FS_CHECK_FS
- kvfree(sit_i->sit_bitmap_mir);
+ kfree(sit_i->sit_bitmap_mir);
kvfree(sit_i->invalid_segmap);
#endif
kfree(sit_i);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 943be4f1d6d2..5e2ee5c686b1 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -34,34 +34,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
f2fs_bug_on(sbi, seg_type >= NR_PERSISTENT_LOG);
}
-#define IS_CURSEG(sbi, seg) \
- (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno) || \
- ((seg) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno))
-
-#define IS_CURSEC(sbi, secno) \
- (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno / \
- SEGS_PER_SEC(sbi)) || \
- ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno / \
- SEGS_PER_SEC(sbi)))
-
#define MAIN_BLKADDR(sbi) \
(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : \
le32_to_cpu(F2FS_RAW_SUPER(sbi)->main_blkaddr))
@@ -102,6 +74,8 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
#define CAP_SEGS_PER_SEC(sbi) \
(SEGS_PER_SEC(sbi) - \
BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec))
+#define GET_START_SEG_FROM_SEC(sbi, segno) \
+ (rounddown(segno, SEGS_PER_SEC(sbi)))
#define GET_SEC_FROM_SEG(sbi, segno) \
(((segno) == -1) ? -1 : (segno) / SEGS_PER_SEC(sbi))
#define GET_SEG_FROM_SEC(sbi, secno) \
@@ -209,6 +183,7 @@ struct seg_entry {
struct sec_entry {
unsigned int valid_blocks; /* # of valid blocks in a section */
+ unsigned int ckpt_valid_blocks; /* # of valid blocks last cp in a section */
};
#define MAX_SKIP_GC_COUNT 16
@@ -315,6 +290,28 @@ static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
}
+static inline bool is_curseg(struct f2fs_sb_info *sbi, unsigned int segno)
+{
+ int i;
+
+ for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+ if (segno == CURSEG_I(sbi, i)->segno)
+ return true;
+ }
+ return false;
+}
+
+static inline bool is_cursec(struct f2fs_sb_info *sbi, unsigned int secno)
+{
+ int i;
+
+ for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+ if (secno == GET_SEC_FROM_SEG(sbi, CURSEG_I(sbi, i)->segno))
+ return true;
+ }
+ return false;
+}
+
static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
unsigned int segno)
{
@@ -345,22 +342,57 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
unsigned int segno, bool use_section)
{
- if (use_section && __is_large_section(sbi)) {
- unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
- unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
- unsigned int blocks = 0;
- int i;
+ if (use_section && __is_large_section(sbi))
+ return get_sec_entry(sbi, segno)->ckpt_valid_blocks;
+ else
+ return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+}
+
+static inline void set_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
+ unsigned int blocks = 0;
+ int i;
- for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) {
- struct seg_entry *se = get_seg_entry(sbi, start_segno);
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) {
+ struct seg_entry *se = get_seg_entry(sbi, start_segno);
- blocks += se->ckpt_valid_blocks;
- }
- return blocks;
+ blocks += se->ckpt_valid_blocks;
}
- return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+ get_sec_entry(sbi, segno)->ckpt_valid_blocks = blocks;
}
+#ifdef CONFIG_F2FS_CHECK_FS
+static inline void sanity_check_valid_blocks(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+ unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
+ unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
+ unsigned int blocks = 0;
+ int i;
+
+ for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) {
+ struct seg_entry *se = get_seg_entry(sbi, start_segno);
+
+ blocks += se->ckpt_valid_blocks;
+ }
+
+ if (blocks != get_sec_entry(sbi, segno)->ckpt_valid_blocks) {
+ f2fs_err(sbi,
+ "Inconsistent ckpt valid blocks: "
+ "seg entry(%d) vs sec entry(%d) at secno %d",
+ blocks, get_sec_entry(sbi, segno)->ckpt_valid_blocks, secno);
+ f2fs_bug_on(sbi, 1);
+ }
+}
+#else
+static inline void sanity_check_valid_blocks(struct f2fs_sb_info *sbi,
+ unsigned int segno)
+{
+}
+#endif
static inline void seg_info_from_raw_sit(struct seg_entry *se,
struct f2fs_sit_entry *rs)
{
@@ -385,8 +417,8 @@ static inline void __seg_info_to_raw_sit(struct seg_entry *se,
rs->mtime = cpu_to_le64(se->mtime);
}
-static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi,
- struct page *page, unsigned int start)
+static inline void seg_info_to_sit_folio(struct f2fs_sb_info *sbi,
+ struct folio *folio, unsigned int start)
{
struct f2fs_sit_block *raw_sit;
struct seg_entry *se;
@@ -395,7 +427,7 @@ static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi,
(unsigned long)MAIN_SEGS(sbi));
int i;
- raw_sit = (struct f2fs_sit_block *)page_address(page);
+ raw_sit = folio_address(folio);
memset(raw_sit, 0, PAGE_SIZE);
for (i = 0; i < end - start; i++) {
rs = &raw_sit->entries[i];
@@ -429,7 +461,6 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
- unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi);
spin_lock(&free_i->segmap_lock);
clear_bit(segno, free_i->free_segmap);
@@ -437,7 +468,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
next = find_next_bit(free_i->free_segmap,
start_segno + SEGS_PER_SEC(sbi), start_segno);
- if (next >= start_segno + usable_segs) {
+ if (next >= start_segno + f2fs_usable_segs_in_sec(sbi)) {
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
}
@@ -463,22 +494,36 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno);
unsigned int next;
- unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi);
+ bool ret;
spin_lock(&free_i->segmap_lock);
- if (test_and_clear_bit(segno, free_i->free_segmap)) {
- free_i->free_segments++;
-
- if (!inmem && IS_CURSEC(sbi, secno))
- goto skip_free;
- next = find_next_bit(free_i->free_segmap,
- start_segno + SEGS_PER_SEC(sbi), start_segno);
- if (next >= start_segno + usable_segs) {
- if (test_and_clear_bit(secno, free_i->free_secmap))
- free_i->free_sections++;
- }
- }
-skip_free:
+ ret = test_and_clear_bit(segno, free_i->free_segmap);
+ if (!ret)
+ goto unlock_out;
+
+ free_i->free_segments++;
+
+ if (!inmem && is_cursec(sbi, secno))
+ goto unlock_out;
+
+ /* check large section */
+ next = find_next_bit(free_i->free_segmap,
+ start_segno + SEGS_PER_SEC(sbi), start_segno);
+ if (next < start_segno + f2fs_usable_segs_in_sec(sbi))
+ goto unlock_out;
+
+ ret = test_and_clear_bit(secno, free_i->free_secmap);
+ if (!ret)
+ goto unlock_out;
+
+ free_i->free_sections++;
+
+ if (GET_SEC_FROM_SEG(sbi, sbi->next_victim_seg[BG_GC]) == secno)
+ sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
+ if (GET_SEC_FROM_SEG(sbi, sbi->next_victim_seg[FG_GC]) == secno)
+ sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
+
+unlock_out:
spin_unlock(&free_i->segmap_lock);
}
@@ -559,15 +604,24 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
unsigned int node_blocks, unsigned int data_blocks,
unsigned int dent_blocks)
{
-
unsigned int segno, left_blocks, blocks;
int i;
/* check current data/node sections in the worst case. */
for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) {
segno = CURSEG_I(sbi, i)->segno;
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- get_ckpt_valid_blocks(sbi, segno, true);
+
+ if (unlikely(segno == NULL_SEGNO))
+ return false;
+
+ if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) {
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
+ CURSEG_I(sbi, i)->next_blkoff;
+ } else {
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ get_ckpt_valid_blocks(sbi, segno, true);
+ }
blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks;
if (blocks > left_blocks)
@@ -576,8 +630,19 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
/* check current data section for dentry blocks. */
segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
- left_blocks = CAP_BLKS_PER_SEC(sbi) -
- get_ckpt_valid_blocks(sbi, segno, true);
+
+ if (unlikely(segno == NULL_SEGNO))
+ return false;
+
+ if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) {
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) -
+ CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff;
+ } else {
+ left_blocks = CAP_BLKS_PER_SEC(sbi) -
+ get_ckpt_valid_blocks(sbi, segno, true);
+ }
+
if (dent_blocks > left_blocks)
return false;
return true;
@@ -603,8 +668,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi);
unsigned int data_blocks = 0;
- if (f2fs_lfs_mode(sbi) &&
- unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+ if (f2fs_lfs_mode(sbi)) {
total_data_blocks = get_pages(sbi, F2FS_DIRTY_DATA);
data_secs = total_data_blocks / CAP_BLKS_PER_SEC(sbi);
data_blocks = total_data_blocks % CAP_BLKS_PER_SEC(sbi);
@@ -613,7 +677,7 @@ static inline void __get_secs_required(struct f2fs_sb_info *sbi,
if (lower_p)
*lower_p = node_secs + dent_secs + data_secs;
if (upper_p)
- *upper_p = node_secs + dent_secs +
+ *upper_p = node_secs + dent_secs + data_secs +
(node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0) +
(data_blocks ? 1 : 0);
if (curseg_p)
@@ -915,7 +979,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno)
{
- if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno))
+ if (is_cursec(sbi, secno) || (sbi->cur_victim_sec == secno))
return true;
return false;
}
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index 83d6fb97dcae..b88babcf6ab4 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -73,7 +73,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
mutex_unlock(&sbi->umount_mutex);
}
spin_unlock(&f2fs_list_lock);
- return count;
+ return count ?: SHRINK_EMPTY;
}
unsigned long f2fs_shrink_scan(struct shrinker *shrink,
@@ -130,6 +130,103 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink,
return freed;
}
+unsigned int f2fs_donate_files(void)
+{
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+ unsigned int donate_files = 0;
+
+ spin_lock(&f2fs_list_lock);
+ p = f2fs_list.next;
+ while (p != &f2fs_list) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ donate_files += sbi->donate_files;
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ mutex_unlock(&sbi->umount_mutex);
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ return donate_files;
+}
+
+static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi,
+ unsigned int reclaim_caches_kb)
+{
+ struct inode *inode;
+ struct f2fs_inode_info *fi;
+ unsigned int nfiles = sbi->donate_files;
+ pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10);
+
+ while (npages && nfiles--) {
+ pgoff_t len;
+
+ spin_lock(&sbi->inode_lock[DONATE_INODE]);
+ if (list_empty(&sbi->inode_list[DONATE_INODE])) {
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+ break;
+ }
+ fi = list_first_entry(&sbi->inode_list[DONATE_INODE],
+ struct f2fs_inode_info, gdonate_list);
+ list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]);
+ inode = igrab(&fi->vfs_inode);
+ spin_unlock(&sbi->inode_lock[DONATE_INODE]);
+
+ if (!inode)
+ continue;
+
+ inode_lock(inode);
+ if (!is_inode_flag_set(inode, FI_DONATE_FINISHED)) {
+ len = fi->donate_end - fi->donate_start + 1;
+ npages = npages < len ? 0 : npages - len;
+
+ invalidate_inode_pages2_range(inode->i_mapping,
+ fi->donate_start, fi->donate_end);
+ set_inode_flag(inode, FI_DONATE_FINISHED);
+ }
+ inode_unlock(inode);
+
+ iput(inode);
+ cond_resched();
+ }
+ return npages << (PAGE_SHIFT - 10);
+}
+
+void f2fs_reclaim_caches(unsigned int reclaim_caches_kb)
+{
+ struct f2fs_sb_info *sbi;
+ struct list_head *p;
+
+ spin_lock(&f2fs_list_lock);
+ p = f2fs_list.next;
+ while (p != &f2fs_list && reclaim_caches_kb) {
+ sbi = list_entry(p, struct f2fs_sb_info, s_list);
+
+ /* stop f2fs_put_super */
+ if (!mutex_trylock(&sbi->umount_mutex)) {
+ p = p->next;
+ continue;
+ }
+ spin_unlock(&f2fs_list_lock);
+
+ reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb);
+
+ spin_lock(&f2fs_list_lock);
+ p = p->next;
+ mutex_unlock(&sbi->umount_mutex);
+ }
+ spin_unlock(&f2fs_list_lock);
+}
+
void f2fs_join_shrinker(struct f2fs_sb_info *sbi)
{
spin_lock(&f2fs_list_lock);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 19b67828ae32..e16c4e2830c2 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -27,6 +27,8 @@
#include <linux/part_stat.h>
#include <linux/zstd.h>
#include <linux/lz4.h>
+#include <linux/ctype.h>
+#include <linux/fs_parser.h>
#include "f2fs.h"
#include "node.h"
@@ -47,6 +49,7 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_KVMALLOC] = "kvmalloc",
[FAULT_PAGE_ALLOC] = "page alloc",
[FAULT_PAGE_GET] = "page get",
+ [FAULT_ALLOC_BIO] = "alloc bio(obsolete)",
[FAULT_ALLOC_NID] = "alloc nid",
[FAULT_ORPHAN] = "orphan",
[FAULT_BLOCK] = "no more block",
@@ -63,32 +66,36 @@ const char *f2fs_fault_name[FAULT_MAX] = {
[FAULT_BLKADDR_VALIDITY] = "invalid blkaddr",
[FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr",
[FAULT_NO_SEGMENT] = "no free segment",
+ [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer",
+ [FAULT_TIMEOUT] = "timeout",
+ [FAULT_VMALLOC] = "vmalloc",
};
int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate,
- unsigned long type)
+ unsigned long type, enum fault_option fo)
{
struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
- if (rate) {
+ if (fo & FAULT_ALL) {
+ memset(ffi, 0, sizeof(struct f2fs_fault_info));
+ return 0;
+ }
+
+ if (fo & FAULT_RATE) {
if (rate > INT_MAX)
return -EINVAL;
atomic_set(&ffi->inject_ops, 0);
ffi->inject_rate = (int)rate;
+ f2fs_info(sbi, "build fault injection rate: %lu", rate);
}
- if (type) {
+ if (fo & FAULT_TYPE) {
if (type >= BIT(FAULT_MAX))
return -EINVAL;
ffi->inject_type = (unsigned int)type;
+ f2fs_info(sbi, "build fault injection type: 0x%lx", type);
}
- if (!rate && !type)
- memset(ffi, 0, sizeof(struct f2fs_fault_info));
- else
- f2fs_info(sbi,
- "build fault injection attr: rate: %lu, type: 0x%lx",
- rate, type);
return 0;
}
#endif
@@ -120,29 +127,20 @@ enum {
Opt_disable_roll_forward,
Opt_norecovery,
Opt_discard,
- Opt_nodiscard,
Opt_noheap,
Opt_heap,
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_active_logs,
Opt_disable_ext_identify,
Opt_inline_xattr,
- Opt_noinline_xattr,
Opt_inline_xattr_size,
Opt_inline_data,
Opt_inline_dentry,
- Opt_noinline_dentry,
Opt_flush_merge,
- Opt_noflush_merge,
Opt_barrier,
- Opt_nobarrier,
Opt_fastboot,
Opt_extent_cache,
- Opt_noextent_cache,
- Opt_noinline_data,
Opt_data_flush,
Opt_reserve_root,
Opt_resgid,
@@ -151,21 +149,13 @@ enum {
Opt_fault_injection,
Opt_fault_type,
Opt_lazytime,
- Opt_nolazytime,
Opt_quota,
- Opt_noquota,
Opt_usrquota,
Opt_grpquota,
Opt_prjquota,
Opt_usrjquota,
Opt_grpjquota,
Opt_prjjquota,
- Opt_offusrjquota,
- Opt_offgrpjquota,
- Opt_offprjjquota,
- Opt_jqfmt_vfsold,
- Opt_jqfmt_vfsv0,
- Opt_jqfmt_vfsv1,
Opt_alloc,
Opt_fsync,
Opt_test_dummy_encryption,
@@ -175,105 +165,209 @@ enum {
Opt_checkpoint_disable_cap_perc,
Opt_checkpoint_enable,
Opt_checkpoint_merge,
- Opt_nocheckpoint_merge,
Opt_compress_algorithm,
Opt_compress_log_size,
- Opt_compress_extension,
Opt_nocompress_extension,
+ Opt_compress_extension,
Opt_compress_chksum,
Opt_compress_mode,
Opt_compress_cache,
Opt_atgc,
Opt_gc_merge,
- Opt_nogc_merge,
Opt_discard_unit,
Opt_memory_mode,
Opt_age_extent_cache,
Opt_errors,
+ Opt_nat_bits,
+ Opt_jqfmt,
+ Opt_checkpoint,
Opt_err,
};
-static match_table_t f2fs_tokens = {
- {Opt_gc_background, "background_gc=%s"},
- {Opt_disable_roll_forward, "disable_roll_forward"},
- {Opt_norecovery, "norecovery"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_noheap, "no_heap"},
- {Opt_heap, "heap"},
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_active_logs, "active_logs=%u"},
- {Opt_disable_ext_identify, "disable_ext_identify"},
- {Opt_inline_xattr, "inline_xattr"},
- {Opt_noinline_xattr, "noinline_xattr"},
- {Opt_inline_xattr_size, "inline_xattr_size=%u"},
- {Opt_inline_data, "inline_data"},
- {Opt_inline_dentry, "inline_dentry"},
- {Opt_noinline_dentry, "noinline_dentry"},
- {Opt_flush_merge, "flush_merge"},
- {Opt_noflush_merge, "noflush_merge"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_fastboot, "fastboot"},
- {Opt_extent_cache, "extent_cache"},
- {Opt_noextent_cache, "noextent_cache"},
- {Opt_noinline_data, "noinline_data"},
- {Opt_data_flush, "data_flush"},
- {Opt_reserve_root, "reserve_root=%u"},
- {Opt_resgid, "resgid=%u"},
- {Opt_resuid, "resuid=%u"},
- {Opt_mode, "mode=%s"},
- {Opt_fault_injection, "fault_injection=%u"},
- {Opt_fault_type, "fault_type=%u"},
- {Opt_lazytime, "lazytime"},
- {Opt_nolazytime, "nolazytime"},
- {Opt_quota, "quota"},
- {Opt_noquota, "noquota"},
- {Opt_usrquota, "usrquota"},
- {Opt_grpquota, "grpquota"},
- {Opt_prjquota, "prjquota"},
- {Opt_usrjquota, "usrjquota=%s"},
- {Opt_grpjquota, "grpjquota=%s"},
- {Opt_prjjquota, "prjjquota=%s"},
- {Opt_offusrjquota, "usrjquota="},
- {Opt_offgrpjquota, "grpjquota="},
- {Opt_offprjjquota, "prjjquota="},
- {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
- {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
- {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"},
- {Opt_alloc, "alloc_mode=%s"},
- {Opt_fsync, "fsync_mode=%s"},
- {Opt_test_dummy_encryption, "test_dummy_encryption=%s"},
- {Opt_test_dummy_encryption, "test_dummy_encryption"},
- {Opt_inlinecrypt, "inlinecrypt"},
- {Opt_checkpoint_disable, "checkpoint=disable"},
- {Opt_checkpoint_disable_cap, "checkpoint=disable:%u"},
- {Opt_checkpoint_disable_cap_perc, "checkpoint=disable:%u%%"},
- {Opt_checkpoint_enable, "checkpoint=enable"},
- {Opt_checkpoint_merge, "checkpoint_merge"},
- {Opt_nocheckpoint_merge, "nocheckpoint_merge"},
- {Opt_compress_algorithm, "compress_algorithm=%s"},
- {Opt_compress_log_size, "compress_log_size=%u"},
- {Opt_compress_extension, "compress_extension=%s"},
- {Opt_nocompress_extension, "nocompress_extension=%s"},
- {Opt_compress_chksum, "compress_chksum"},
- {Opt_compress_mode, "compress_mode=%s"},
- {Opt_compress_cache, "compress_cache"},
- {Opt_atgc, "atgc"},
- {Opt_gc_merge, "gc_merge"},
- {Opt_nogc_merge, "nogc_merge"},
- {Opt_discard_unit, "discard_unit=%s"},
- {Opt_memory_mode, "memory=%s"},
- {Opt_age_extent_cache, "age_extent_cache"},
- {Opt_errors, "errors=%s"},
+static const struct constant_table f2fs_param_background_gc[] = {
+ {"on", BGGC_MODE_ON},
+ {"off", BGGC_MODE_OFF},
+ {"sync", BGGC_MODE_SYNC},
+ {}
+};
+
+static const struct constant_table f2fs_param_mode[] = {
+ {"adaptive", FS_MODE_ADAPTIVE},
+ {"lfs", FS_MODE_LFS},
+ {"fragment:segment", FS_MODE_FRAGMENT_SEG},
+ {"fragment:block", FS_MODE_FRAGMENT_BLK},
+ {}
+};
+
+static const struct constant_table f2fs_param_jqfmt[] = {
+ {"vfsold", QFMT_VFS_OLD},
+ {"vfsv0", QFMT_VFS_V0},
+ {"vfsv1", QFMT_VFS_V1},
+ {}
+};
+
+static const struct constant_table f2fs_param_alloc_mode[] = {
+ {"default", ALLOC_MODE_DEFAULT},
+ {"reuse", ALLOC_MODE_REUSE},
+ {}
+};
+static const struct constant_table f2fs_param_fsync_mode[] = {
+ {"posix", FSYNC_MODE_POSIX},
+ {"strict", FSYNC_MODE_STRICT},
+ {"nobarrier", FSYNC_MODE_NOBARRIER},
+ {}
+};
+
+static const struct constant_table f2fs_param_compress_mode[] = {
+ {"fs", COMPR_MODE_FS},
+ {"user", COMPR_MODE_USER},
+ {}
+};
+
+static const struct constant_table f2fs_param_discard_unit[] = {
+ {"block", DISCARD_UNIT_BLOCK},
+ {"segment", DISCARD_UNIT_SEGMENT},
+ {"section", DISCARD_UNIT_SECTION},
+ {}
+};
+
+static const struct constant_table f2fs_param_memory_mode[] = {
+ {"normal", MEMORY_MODE_NORMAL},
+ {"low", MEMORY_MODE_LOW},
+ {}
+};
+
+static const struct constant_table f2fs_param_errors[] = {
+ {"remount-ro", MOUNT_ERRORS_READONLY},
+ {"continue", MOUNT_ERRORS_CONTINUE},
+ {"panic", MOUNT_ERRORS_PANIC},
+ {}
+};
+
+static const struct fs_parameter_spec f2fs_param_specs[] = {
+ fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc),
+ fsparam_flag("disable_roll_forward", Opt_disable_roll_forward),
+ fsparam_flag("norecovery", Opt_norecovery),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_flag("no_heap", Opt_noheap),
+ fsparam_flag("heap", Opt_heap),
+ fsparam_flag_no("user_xattr", Opt_user_xattr),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_s32("active_logs", Opt_active_logs),
+ fsparam_flag("disable_ext_identify", Opt_disable_ext_identify),
+ fsparam_flag_no("inline_xattr", Opt_inline_xattr),
+ fsparam_s32("inline_xattr_size", Opt_inline_xattr_size),
+ fsparam_flag_no("inline_data", Opt_inline_data),
+ fsparam_flag_no("inline_dentry", Opt_inline_dentry),
+ fsparam_flag_no("flush_merge", Opt_flush_merge),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("fastboot", Opt_fastboot),
+ fsparam_flag_no("extent_cache", Opt_extent_cache),
+ fsparam_flag("data_flush", Opt_data_flush),
+ fsparam_u32("reserve_root", Opt_reserve_root),
+ fsparam_gid("resgid", Opt_resgid),
+ fsparam_uid("resuid", Opt_resuid),
+ fsparam_enum("mode", Opt_mode, f2fs_param_mode),
+ fsparam_s32("fault_injection", Opt_fault_injection),
+ fsparam_u32("fault_type", Opt_fault_type),
+ fsparam_flag_no("lazytime", Opt_lazytime),
+ fsparam_flag_no("quota", Opt_quota),
+ fsparam_flag("usrquota", Opt_usrquota),
+ fsparam_flag("grpquota", Opt_grpquota),
+ fsparam_flag("prjquota", Opt_prjquota),
+ fsparam_string_empty("usrjquota", Opt_usrjquota),
+ fsparam_string_empty("grpjquota", Opt_grpjquota),
+ fsparam_string_empty("prjjquota", Opt_prjjquota),
+ fsparam_flag("nat_bits", Opt_nat_bits),
+ fsparam_enum("jqfmt", Opt_jqfmt, f2fs_param_jqfmt),
+ fsparam_enum("alloc_mode", Opt_alloc, f2fs_param_alloc_mode),
+ fsparam_enum("fsync_mode", Opt_fsync, f2fs_param_fsync_mode),
+ fsparam_string("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_flag("test_dummy_encryption", Opt_test_dummy_encryption),
+ fsparam_flag("inlinecrypt", Opt_inlinecrypt),
+ fsparam_string("checkpoint", Opt_checkpoint),
+ fsparam_flag_no("checkpoint_merge", Opt_checkpoint_merge),
+ fsparam_string("compress_algorithm", Opt_compress_algorithm),
+ fsparam_u32("compress_log_size", Opt_compress_log_size),
+ fsparam_string("compress_extension", Opt_compress_extension),
+ fsparam_string("nocompress_extension", Opt_nocompress_extension),
+ fsparam_flag("compress_chksum", Opt_compress_chksum),
+ fsparam_enum("compress_mode", Opt_compress_mode, f2fs_param_compress_mode),
+ fsparam_flag("compress_cache", Opt_compress_cache),
+ fsparam_flag("atgc", Opt_atgc),
+ fsparam_flag_no("gc_merge", Opt_gc_merge),
+ fsparam_enum("discard_unit", Opt_discard_unit, f2fs_param_discard_unit),
+ fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode),
+ fsparam_flag("age_extent_cache", Opt_age_extent_cache),
+ fsparam_enum("errors", Opt_errors, f2fs_param_errors),
+ {}
+};
+
+/* Resort to a match_table for this interestingly formatted option */
+static match_table_t f2fs_checkpoint_tokens = {
+ {Opt_checkpoint_disable, "disable"},
+ {Opt_checkpoint_disable_cap, "disable:%u"},
+ {Opt_checkpoint_disable_cap_perc, "disable:%u%%"},
+ {Opt_checkpoint_enable, "enable"},
{Opt_err, NULL},
};
+#define F2FS_SPEC_background_gc (1 << 0)
+#define F2FS_SPEC_inline_xattr_size (1 << 1)
+#define F2FS_SPEC_active_logs (1 << 2)
+#define F2FS_SPEC_reserve_root (1 << 3)
+#define F2FS_SPEC_resgid (1 << 4)
+#define F2FS_SPEC_resuid (1 << 5)
+#define F2FS_SPEC_mode (1 << 6)
+#define F2FS_SPEC_fault_injection (1 << 7)
+#define F2FS_SPEC_fault_type (1 << 8)
+#define F2FS_SPEC_jqfmt (1 << 9)
+#define F2FS_SPEC_alloc_mode (1 << 10)
+#define F2FS_SPEC_fsync_mode (1 << 11)
+#define F2FS_SPEC_checkpoint_disable_cap (1 << 12)
+#define F2FS_SPEC_checkpoint_disable_cap_perc (1 << 13)
+#define F2FS_SPEC_compress_level (1 << 14)
+#define F2FS_SPEC_compress_algorithm (1 << 15)
+#define F2FS_SPEC_compress_log_size (1 << 16)
+#define F2FS_SPEC_compress_extension (1 << 17)
+#define F2FS_SPEC_nocompress_extension (1 << 18)
+#define F2FS_SPEC_compress_chksum (1 << 19)
+#define F2FS_SPEC_compress_mode (1 << 20)
+#define F2FS_SPEC_discard_unit (1 << 21)
+#define F2FS_SPEC_memory_mode (1 << 22)
+#define F2FS_SPEC_errors (1 << 23)
+
+struct f2fs_fs_context {
+ struct f2fs_mount_info info;
+ unsigned int opt_mask; /* Bits changed */
+ unsigned int spec_mask;
+ unsigned short qname_mask;
+};
+
+#define F2FS_CTX_INFO(ctx) ((ctx)->info)
+
+static inline void ctx_set_opt(struct f2fs_fs_context *ctx,
+ unsigned int flag)
+{
+ ctx->info.opt |= flag;
+ ctx->opt_mask |= flag;
+}
+
+static inline void ctx_clear_opt(struct f2fs_fs_context *ctx,
+ unsigned int flag)
+{
+ ctx->info.opt &= ~flag;
+ ctx->opt_mask |= flag;
+}
+
+static inline bool ctx_test_opt(struct f2fs_fs_context *ctx,
+ unsigned int flag)
+{
+ return ctx->info.opt & flag;
+}
+
void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
- const char *fmt, ...)
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -285,11 +379,19 @@ void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
vaf.fmt = printk_skip_level(fmt);
vaf.va = &args;
if (limit_rate)
- printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
- KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ if (sbi)
+ printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ else
+ printk_ratelimited("%c%cF2FS-fs: %pV\n",
+ KERN_SOH_ASCII, level, &vaf);
else
- printk("%c%cF2FS-fs (%s): %pV\n",
- KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ if (sbi)
+ printk("%c%cF2FS-fs (%s): %pV\n",
+ KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+ else
+ printk("%c%cF2FS-fs: %pV\n",
+ KERN_SOH_ASCII, level, &vaf);
va_end(args);
}
@@ -383,160 +485,90 @@ static void init_once(void *foo)
#ifdef CONFIG_QUOTA
static const char * const quotatypes[] = INITQFNAMES;
#define QTYPE2NAME(t) (quotatypes[t])
-static int f2fs_set_qf_name(struct super_block *sb, int qtype,
- substring_t *args)
+/*
+ * Note the name of the specified quota file.
+ */
+static int f2fs_note_qf_name(struct fs_context *fc, int qtype,
+ struct fs_parameter *param)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_fs_context *ctx = fc->fs_private;
char *qname;
- int ret = -EINVAL;
- if (sb_any_quota_loaded(sb) && !F2FS_OPTION(sbi).s_qf_names[qtype]) {
- f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
+ if (param->size < 1) {
+ f2fs_err(NULL, "Missing quota name");
return -EINVAL;
}
- if (f2fs_sb_has_quota_ino(sbi)) {
- f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name");
+ if (strchr(param->string, '/')) {
+ f2fs_err(NULL, "quotafile must be on filesystem root");
+ return -EINVAL;
+ }
+ if (ctx->info.s_qf_names[qtype]) {
+ if (strcmp(ctx->info.s_qf_names[qtype], param->string) != 0) {
+ f2fs_err(NULL, "Quota file already specified");
+ return -EINVAL;
+ }
return 0;
}
- qname = match_strdup(args);
+ qname = kmemdup_nul(param->string, param->size, GFP_KERNEL);
if (!qname) {
- f2fs_err(sbi, "Not enough memory for storing quotafile name");
+ f2fs_err(NULL, "Not enough memory for storing quotafile name");
return -ENOMEM;
}
- if (F2FS_OPTION(sbi).s_qf_names[qtype]) {
- if (strcmp(F2FS_OPTION(sbi).s_qf_names[qtype], qname) == 0)
- ret = 0;
- else
- f2fs_err(sbi, "%s quota file already specified",
- QTYPE2NAME(qtype));
- goto errout;
- }
- if (strchr(qname, '/')) {
- f2fs_err(sbi, "quotafile must be on filesystem root");
- goto errout;
- }
- F2FS_OPTION(sbi).s_qf_names[qtype] = qname;
- set_opt(sbi, QUOTA);
+ F2FS_CTX_INFO(ctx).s_qf_names[qtype] = qname;
+ ctx->qname_mask |= 1 << qtype;
return 0;
-errout:
- kfree(qname);
- return ret;
}
-static int f2fs_clear_qf_name(struct super_block *sb, int qtype)
+/*
+ * Clear the name of the specified quota file.
+ */
+static int f2fs_unnote_qf_name(struct fs_context *fc, int qtype)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_fs_context *ctx = fc->fs_private;
- if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) {
- f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
- return -EINVAL;
- }
- kfree(F2FS_OPTION(sbi).s_qf_names[qtype]);
- F2FS_OPTION(sbi).s_qf_names[qtype] = NULL;
+ kfree(ctx->info.s_qf_names[qtype]);
+ ctx->info.s_qf_names[qtype] = NULL;
+ ctx->qname_mask |= 1 << qtype;
return 0;
}
-static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
+static void f2fs_unnote_qf_name_all(struct fs_context *fc)
{
- /*
- * We do the test below only for project quotas. 'usrquota' and
- * 'grpquota' mount options are allowed even without quota feature
- * to support legacy quotas in quota files.
- */
- if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi)) {
- f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement.");
- return -1;
- }
- if (F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
- F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
- F2FS_OPTION(sbi).s_qf_names[PRJQUOTA]) {
- if (test_opt(sbi, USRQUOTA) &&
- F2FS_OPTION(sbi).s_qf_names[USRQUOTA])
- clear_opt(sbi, USRQUOTA);
-
- if (test_opt(sbi, GRPQUOTA) &&
- F2FS_OPTION(sbi).s_qf_names[GRPQUOTA])
- clear_opt(sbi, GRPQUOTA);
-
- if (test_opt(sbi, PRJQUOTA) &&
- F2FS_OPTION(sbi).s_qf_names[PRJQUOTA])
- clear_opt(sbi, PRJQUOTA);
-
- if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) ||
- test_opt(sbi, PRJQUOTA)) {
- f2fs_err(sbi, "old and new quota format mixing");
- return -1;
- }
-
- if (!F2FS_OPTION(sbi).s_jquota_fmt) {
- f2fs_err(sbi, "journaled quota format not specified");
- return -1;
- }
- }
+ int i;
- if (f2fs_sb_has_quota_ino(sbi) && F2FS_OPTION(sbi).s_jquota_fmt) {
- f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt");
- F2FS_OPTION(sbi).s_jquota_fmt = 0;
- }
- return 0;
+ for (i = 0; i < MAXQUOTAS; i++)
+ f2fs_unnote_qf_name(fc, i);
}
#endif
-static int f2fs_set_test_dummy_encryption(struct super_block *sb,
- const char *opt,
- const substring_t *arg,
- bool is_remount)
+static int f2fs_parse_test_dummy_encryption(const struct fs_parameter *param,
+ struct f2fs_fs_context *ctx)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- struct fs_parameter param = {
- .type = fs_value_is_string,
- .string = arg->from ? arg->from : "",
- };
- struct fscrypt_dummy_policy *policy =
- &F2FS_OPTION(sbi).dummy_enc_policy;
int err;
if (!IS_ENABLED(CONFIG_FS_ENCRYPTION)) {
- f2fs_warn(sbi, "test_dummy_encryption option not supported");
- return -EINVAL;
- }
-
- if (!f2fs_sb_has_encrypt(sbi)) {
- f2fs_err(sbi, "Encrypt feature is off");
+ f2fs_warn(NULL, "test_dummy_encryption option not supported");
return -EINVAL;
}
-
- /*
- * This mount option is just for testing, and it's not worthwhile to
- * implement the extra complexity (e.g. RCU protection) that would be
- * needed to allow it to be set or changed during remount. We do allow
- * it to be specified during remount, but only if there is no change.
- */
- if (is_remount && !fscrypt_is_dummy_policy_set(policy)) {
- f2fs_warn(sbi, "Can't set test_dummy_encryption on remount");
- return -EINVAL;
- }
-
- err = fscrypt_parse_test_dummy_encryption(&param, policy);
+ err = fscrypt_parse_test_dummy_encryption(param,
+ &ctx->info.dummy_enc_policy);
if (err) {
- if (err == -EEXIST)
- f2fs_warn(sbi,
- "Can't change test_dummy_encryption on remount");
- else if (err == -EINVAL)
- f2fs_warn(sbi, "Value of option \"%s\" is unrecognized",
- opt);
+ if (err == -EINVAL)
+ f2fs_warn(NULL, "Value of option \"%s\" is unrecognized",
+ param->key);
+ else if (err == -EEXIST)
+ f2fs_warn(NULL, "Conflicting test_dummy_encryption options");
else
- f2fs_warn(sbi, "Error processing option \"%s\" [%d]",
- opt, err);
+ f2fs_warn(NULL, "Error processing option \"%s\" [%d]",
+ param->key, err);
return -EINVAL;
}
- f2fs_warn(sbi, "Test dummy encryption mode enabled");
return 0;
}
#ifdef CONFIG_F2FS_FS_COMPRESSION
-static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
+static bool is_compress_extension_exist(struct f2fs_mount_info *info,
const char *new_ext, bool is_ext)
{
unsigned char (*ext)[F2FS_EXTENSION_LEN];
@@ -544,11 +576,11 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
int i;
if (is_ext) {
- ext = F2FS_OPTION(sbi).extensions;
- ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ ext = info->extensions;
+ ext_cnt = info->compress_ext_cnt;
} else {
- ext = F2FS_OPTION(sbi).noextensions;
- ext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ ext = info->noextensions;
+ ext_cnt = info->nocompress_ext_cnt;
}
for (i = 0; i < ext_cnt; i++) {
@@ -566,28 +598,28 @@ static bool is_compress_extension_exist(struct f2fs_sb_info *sbi,
* extension will be treated as special cases and will not be compressed.
* 3. Don't allow the non-compress extension specifies all files.
*/
-static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi)
+static int f2fs_test_compress_extension(unsigned char (*noext)[F2FS_EXTENSION_LEN],
+ int noext_cnt,
+ unsigned char (*ext)[F2FS_EXTENSION_LEN],
+ int ext_cnt)
{
- unsigned char (*ext)[F2FS_EXTENSION_LEN];
- unsigned char (*noext)[F2FS_EXTENSION_LEN];
- int ext_cnt, noext_cnt, index = 0, no_index = 0;
-
- ext = F2FS_OPTION(sbi).extensions;
- ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
- noext = F2FS_OPTION(sbi).noextensions;
- noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ int index = 0, no_index = 0;
if (!noext_cnt)
return 0;
for (no_index = 0; no_index < noext_cnt; no_index++) {
+ if (strlen(noext[no_index]) == 0)
+ continue;
if (!strcasecmp("*", noext[no_index])) {
- f2fs_info(sbi, "Don't allow the nocompress extension specifies all files");
+ f2fs_info(NULL, "Don't allow the nocompress extension specifies all files");
return -EINVAL;
}
for (index = 0; index < ext_cnt; index++) {
+ if (strlen(ext[index]) == 0)
+ continue;
if (!strcasecmp(ext[index], noext[no_index])) {
- f2fs_info(sbi, "Don't allow the same extension %s appear in both compress and nocompress extension",
+ f2fs_info(NULL, "Don't allow the same extension %s appear in both compress and nocompress extension",
ext[index]);
return -EINVAL;
}
@@ -597,58 +629,62 @@ static int f2fs_test_compress_extension(struct f2fs_sb_info *sbi)
}
#ifdef CONFIG_F2FS_FS_LZ4
-static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+static int f2fs_set_lz4hc_level(struct f2fs_fs_context *ctx, const char *str)
{
#ifdef CONFIG_F2FS_FS_LZ4HC
unsigned int level;
if (strlen(str) == 3) {
- F2FS_OPTION(sbi).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
str += 3;
if (str[0] != ':') {
- f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+ f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>");
return -EINVAL;
}
if (kstrtouint(str + 1, 10, &level))
return -EINVAL;
if (!f2fs_is_compress_level_valid(COMPRESS_LZ4, level)) {
- f2fs_info(sbi, "invalid lz4hc compress level: %d", level);
+ f2fs_info(NULL, "invalid lz4hc compress level: %d", level);
return -EINVAL;
}
- F2FS_OPTION(sbi).compress_level = level;
+ F2FS_CTX_INFO(ctx).compress_level = level;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
#else
if (strlen(str) == 3) {
- F2FS_OPTION(sbi).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
- f2fs_info(sbi, "kernel doesn't support lz4hc compression");
+ f2fs_info(NULL, "kernel doesn't support lz4hc compression");
return -EINVAL;
#endif
}
#endif
#ifdef CONFIG_F2FS_FS_ZSTD
-static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+static int f2fs_set_zstd_level(struct f2fs_fs_context *ctx, const char *str)
{
int level;
int len = 4;
if (strlen(str) == len) {
- F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ F2FS_CTX_INFO(ctx).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
str += len;
if (str[0] != ':') {
- f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
+ f2fs_info(NULL, "wrong format, e.g. <alg_name>:<compr_level>");
return -EINVAL;
}
if (kstrtoint(str + 1, 10, &level))
@@ -656,692 +692,750 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
/* f2fs does not support negative compress level now */
if (level < 0) {
- f2fs_info(sbi, "do not support negative compress level: %d", level);
+ f2fs_info(NULL, "do not support negative compress level: %d", level);
return -ERANGE;
}
if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) {
- f2fs_info(sbi, "invalid zstd compress level: %d", level);
+ f2fs_info(NULL, "invalid zstd compress level: %d", level);
return -EINVAL;
}
- F2FS_OPTION(sbi).compress_level = level;
+ F2FS_CTX_INFO(ctx).compress_level = level;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
return 0;
}
#endif
#endif
-static int parse_options(struct super_block *sb, char *options, bool is_remount)
+static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
- substring_t args[MAX_OPT_ARGS];
+ struct f2fs_fs_context *ctx = fc->fs_private;
#ifdef CONFIG_F2FS_FS_COMPRESSION
unsigned char (*ext)[F2FS_EXTENSION_LEN];
unsigned char (*noext)[F2FS_EXTENSION_LEN];
int ext_cnt, noext_cnt;
+ char *name;
#endif
- char *p, *name;
- int arg = 0;
- kuid_t uid;
- kgid_t gid;
- int ret;
-
- if (!options)
- goto default_check;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
- /*
- * Initialize args struct so we know whether arg was
- * found; some options take optional arguments.
- */
- args[0].to = args[0].from = NULL;
- token = match_token(p, f2fs_tokens, args);
+ substring_t args[MAX_OPT_ARGS];
+ struct fs_parse_result result;
+ int token, ret, arg;
- switch (token) {
- case Opt_gc_background:
- name = match_strdup(&args[0]);
+ token = fs_parse(fc, f2fs_param_specs, param, &result);
+ if (token < 0)
+ return token;
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "on")) {
- F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
- } else if (!strcmp(name, "off")) {
- if (f2fs_sb_has_blkzoned(sbi)) {
- f2fs_warn(sbi, "zoned devices need bggc");
- kfree(name);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
- } else if (!strcmp(name, "sync")) {
- F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_disable_roll_forward:
- set_opt(sbi, DISABLE_ROLL_FORWARD);
- break;
- case Opt_norecovery:
- /* this option mounts f2fs with ro */
- set_opt(sbi, NORECOVERY);
- if (!f2fs_readonly(sb))
- return -EINVAL;
- break;
- case Opt_discard:
- if (!f2fs_hw_support_discard(sbi)) {
- f2fs_warn(sbi, "device does not support discard");
- break;
- }
- set_opt(sbi, DISCARD);
- break;
- case Opt_nodiscard:
- if (f2fs_hw_should_discard(sbi)) {
- f2fs_warn(sbi, "discard is required for zoned block devices");
- return -EINVAL;
- }
- clear_opt(sbi, DISCARD);
- break;
- case Opt_noheap:
- case Opt_heap:
- f2fs_warn(sbi, "heap/no_heap options were deprecated");
- break;
+ switch (token) {
+ case Opt_gc_background:
+ F2FS_CTX_INFO(ctx).bggc_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_background_gc;
+ break;
+ case Opt_disable_roll_forward:
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_ROLL_FORWARD);
+ break;
+ case Opt_norecovery:
+ /* requires ro mount, checked in f2fs_validate_options */
+ ctx_set_opt(ctx, F2FS_MOUNT_NORECOVERY);
+ break;
+ case Opt_discard:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_DISCARD);
+ break;
+ case Opt_noheap:
+ case Opt_heap:
+ f2fs_warn(NULL, "heap/no_heap options were deprecated");
+ break;
#ifdef CONFIG_F2FS_FS_XATTR
- case Opt_user_xattr:
- set_opt(sbi, XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt(sbi, XATTR_USER);
- break;
- case Opt_inline_xattr:
- set_opt(sbi, INLINE_XATTR);
- break;
- case Opt_noinline_xattr:
- clear_opt(sbi, INLINE_XATTR);
- break;
- case Opt_inline_xattr_size:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- set_opt(sbi, INLINE_XATTR_SIZE);
- F2FS_OPTION(sbi).inline_xattr_size = arg;
- break;
+ case Opt_user_xattr:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_XATTR_USER);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_XATTR_USER);
+ break;
+ case Opt_inline_xattr:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_XATTR);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR);
+ break;
+ case Opt_inline_xattr_size:
+ if (result.int_32 < MIN_INLINE_XATTR_SIZE ||
+ result.int_32 > MAX_INLINE_XATTR_SIZE) {
+ f2fs_err(NULL, "inline xattr size is out of range: %u ~ %u",
+ (u32)MIN_INLINE_XATTR_SIZE, (u32)MAX_INLINE_XATTR_SIZE);
+ return -EINVAL;
+ }
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE);
+ F2FS_CTX_INFO(ctx).inline_xattr_size = result.int_32;
+ ctx->spec_mask |= F2FS_SPEC_inline_xattr_size;
+ break;
#else
- case Opt_user_xattr:
- f2fs_info(sbi, "user_xattr options not supported");
- break;
- case Opt_nouser_xattr:
- f2fs_info(sbi, "nouser_xattr options not supported");
- break;
- case Opt_inline_xattr:
- f2fs_info(sbi, "inline_xattr options not supported");
- break;
- case Opt_noinline_xattr:
- f2fs_info(sbi, "noinline_xattr options not supported");
- break;
+ case Opt_user_xattr:
+ case Opt_inline_xattr:
+ case Opt_inline_xattr_size:
+ f2fs_info(NULL, "%s options not supported", param->key);
+ break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
- case Opt_acl:
- set_opt(sbi, POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(sbi, POSIX_ACL);
- break;
+ case Opt_acl:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_POSIX_ACL);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_POSIX_ACL);
+ break;
#else
- case Opt_acl:
- f2fs_info(sbi, "acl options not supported");
- break;
- case Opt_noacl:
- f2fs_info(sbi, "noacl options not supported");
- break;
+ case Opt_acl:
+ f2fs_info(NULL, "%s options not supported", param->key);
+ break;
#endif
- case Opt_active_logs:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg != 2 && arg != 4 &&
- arg != NR_CURSEG_PERSIST_TYPE)
- return -EINVAL;
- F2FS_OPTION(sbi).active_logs = arg;
- break;
- case Opt_disable_ext_identify:
- set_opt(sbi, DISABLE_EXT_IDENTIFY);
- break;
- case Opt_inline_data:
- set_opt(sbi, INLINE_DATA);
- break;
- case Opt_inline_dentry:
- set_opt(sbi, INLINE_DENTRY);
- break;
- case Opt_noinline_dentry:
- clear_opt(sbi, INLINE_DENTRY);
- break;
- case Opt_flush_merge:
- set_opt(sbi, FLUSH_MERGE);
- break;
- case Opt_noflush_merge:
- clear_opt(sbi, FLUSH_MERGE);
- break;
- case Opt_nobarrier:
- set_opt(sbi, NOBARRIER);
- break;
- case Opt_barrier:
- clear_opt(sbi, NOBARRIER);
- break;
- case Opt_fastboot:
- set_opt(sbi, FASTBOOT);
- break;
- case Opt_extent_cache:
- set_opt(sbi, READ_EXTENT_CACHE);
- break;
- case Opt_noextent_cache:
- if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) {
- f2fs_err(sbi, "device aliasing requires extent cache");
- return -EINVAL;
- }
- clear_opt(sbi, READ_EXTENT_CACHE);
- break;
- case Opt_noinline_data:
- clear_opt(sbi, INLINE_DATA);
- break;
- case Opt_data_flush:
- set_opt(sbi, DATA_FLUSH);
- break;
- case Opt_reserve_root:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (test_opt(sbi, RESERVE_ROOT)) {
- f2fs_info(sbi, "Preserve previous reserve_root=%u",
- F2FS_OPTION(sbi).root_reserved_blocks);
- } else {
- F2FS_OPTION(sbi).root_reserved_blocks = arg;
- set_opt(sbi, RESERVE_ROOT);
- }
- break;
- case Opt_resuid:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- uid = make_kuid(current_user_ns(), arg);
- if (!uid_valid(uid)) {
- f2fs_err(sbi, "Invalid uid value %d", arg);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).s_resuid = uid;
- break;
- case Opt_resgid:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- gid = make_kgid(current_user_ns(), arg);
- if (!gid_valid(gid)) {
- f2fs_err(sbi, "Invalid gid value %d", arg);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).s_resgid = gid;
- break;
- case Opt_mode:
- name = match_strdup(&args[0]);
-
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "adaptive")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
- } else if (!strcmp(name, "lfs")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
- } else if (!strcmp(name, "fragment:segment")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG;
- } else if (!strcmp(name, "fragment:block")) {
- F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
+ case Opt_active_logs:
+ if (result.int_32 != 2 && result.int_32 != 4 &&
+ result.int_32 != NR_CURSEG_PERSIST_TYPE)
+ return -EINVAL;
+ ctx->spec_mask |= F2FS_SPEC_active_logs;
+ F2FS_CTX_INFO(ctx).active_logs = result.int_32;
+ break;
+ case Opt_disable_ext_identify:
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_EXT_IDENTIFY);
+ break;
+ case Opt_inline_data:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DATA);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DATA);
+ break;
+ case Opt_inline_dentry:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_INLINE_DENTRY);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINE_DENTRY);
+ break;
+ case Opt_flush_merge:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_FLUSH_MERGE);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_FLUSH_MERGE);
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ ctx_set_opt(ctx, F2FS_MOUNT_NOBARRIER);
+ else
+ ctx_clear_opt(ctx, F2FS_MOUNT_NOBARRIER);
+ break;
+ case Opt_fastboot:
+ ctx_set_opt(ctx, F2FS_MOUNT_FASTBOOT);
+ break;
+ case Opt_extent_cache:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE);
+ break;
+ case Opt_data_flush:
+ ctx_set_opt(ctx, F2FS_MOUNT_DATA_FLUSH);
+ break;
+ case Opt_reserve_root:
+ ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
+ F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_reserve_root;
+ break;
+ case Opt_resuid:
+ F2FS_CTX_INFO(ctx).s_resuid = result.uid;
+ ctx->spec_mask |= F2FS_SPEC_resuid;
+ break;
+ case Opt_resgid:
+ F2FS_CTX_INFO(ctx).s_resgid = result.gid;
+ ctx->spec_mask |= F2FS_SPEC_resgid;
+ break;
+ case Opt_mode:
+ F2FS_CTX_INFO(ctx).fs_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_mode;
+ break;
#ifdef CONFIG_F2FS_FAULT_INJECTION
- case Opt_fault_injection:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (f2fs_build_fault_attr(sbi, arg,
- F2FS_ALL_FAULT_TYPE))
- return -EINVAL;
- set_opt(sbi, FAULT_INJECTION);
- break;
+ case Opt_fault_injection:
+ F2FS_CTX_INFO(ctx).fault_info.inject_rate = result.int_32;
+ ctx->spec_mask |= F2FS_SPEC_fault_injection;
+ ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION);
+ break;
- case Opt_fault_type:
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (f2fs_build_fault_attr(sbi, 0, arg))
- return -EINVAL;
- set_opt(sbi, FAULT_INJECTION);
- break;
+ case Opt_fault_type:
+ if (result.uint_32 > BIT(FAULT_MAX))
+ return -EINVAL;
+ F2FS_CTX_INFO(ctx).fault_info.inject_type = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_fault_type;
+ ctx_set_opt(ctx, F2FS_MOUNT_FAULT_INJECTION);
+ break;
#else
- case Opt_fault_injection:
- f2fs_info(sbi, "fault_injection options not supported");
- break;
-
- case Opt_fault_type:
- f2fs_info(sbi, "fault_type options not supported");
- break;
+ case Opt_fault_injection:
+ case Opt_fault_type:
+ f2fs_info(NULL, "%s options not supported", param->key);
+ break;
#endif
- case Opt_lazytime:
- sb->s_flags |= SB_LAZYTIME;
- break;
- case Opt_nolazytime:
- sb->s_flags &= ~SB_LAZYTIME;
- break;
+ case Opt_lazytime:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_LAZYTIME);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_LAZYTIME);
+ break;
#ifdef CONFIG_QUOTA
- case Opt_quota:
- case Opt_usrquota:
- set_opt(sbi, USRQUOTA);
- break;
- case Opt_grpquota:
- set_opt(sbi, GRPQUOTA);
- break;
- case Opt_prjquota:
- set_opt(sbi, PRJQUOTA);
- break;
- case Opt_usrjquota:
- ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]);
- if (ret)
- return ret;
- break;
- case Opt_grpjquota:
- ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]);
- if (ret)
- return ret;
- break;
- case Opt_prjjquota:
- ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]);
- if (ret)
- return ret;
- break;
- case Opt_offusrjquota:
- ret = f2fs_clear_qf_name(sb, USRQUOTA);
- if (ret)
- return ret;
- break;
- case Opt_offgrpjquota:
- ret = f2fs_clear_qf_name(sb, GRPQUOTA);
- if (ret)
- return ret;
- break;
- case Opt_offprjjquota:
- ret = f2fs_clear_qf_name(sb, PRJQUOTA);
- if (ret)
- return ret;
- break;
- case Opt_jqfmt_vfsold:
- F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_OLD;
- break;
- case Opt_jqfmt_vfsv0:
- F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V0;
- break;
- case Opt_jqfmt_vfsv1:
- F2FS_OPTION(sbi).s_jquota_fmt = QFMT_VFS_V1;
- break;
- case Opt_noquota:
- clear_opt(sbi, QUOTA);
- clear_opt(sbi, USRQUOTA);
- clear_opt(sbi, GRPQUOTA);
- clear_opt(sbi, PRJQUOTA);
- break;
+ case Opt_quota:
+ if (result.negated) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_QUOTA);
+ ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+ } else
+ ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ break;
+ case Opt_usrquota:
+ ctx_set_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ break;
+ case Opt_grpquota:
+ ctx_set_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ break;
+ case Opt_prjquota:
+ ctx_set_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+ break;
+ case Opt_usrjquota:
+ if (!*param->string)
+ ret = f2fs_unnote_qf_name(fc, USRQUOTA);
+ else
+ ret = f2fs_note_qf_name(fc, USRQUOTA, param);
+ if (ret)
+ return ret;
+ break;
+ case Opt_grpjquota:
+ if (!*param->string)
+ ret = f2fs_unnote_qf_name(fc, GRPQUOTA);
+ else
+ ret = f2fs_note_qf_name(fc, GRPQUOTA, param);
+ if (ret)
+ return ret;
+ break;
+ case Opt_prjjquota:
+ if (!*param->string)
+ ret = f2fs_unnote_qf_name(fc, PRJQUOTA);
+ else
+ ret = f2fs_note_qf_name(fc, PRJQUOTA, param);
+ if (ret)
+ return ret;
+ break;
+ case Opt_jqfmt:
+ F2FS_CTX_INFO(ctx).s_jquota_fmt = result.int_32;
+ ctx->spec_mask |= F2FS_SPEC_jqfmt;
+ break;
#else
- case Opt_quota:
- case Opt_usrquota:
- case Opt_grpquota:
- case Opt_prjquota:
- case Opt_usrjquota:
- case Opt_grpjquota:
- case Opt_prjjquota:
- case Opt_offusrjquota:
- case Opt_offgrpjquota:
- case Opt_offprjjquota:
- case Opt_jqfmt_vfsold:
- case Opt_jqfmt_vfsv0:
- case Opt_jqfmt_vfsv1:
- case Opt_noquota:
- f2fs_info(sbi, "quota operations not supported");
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_prjquota:
+ case Opt_usrjquota:
+ case Opt_grpjquota:
+ case Opt_prjjquota:
+ f2fs_info(NULL, "quota operations not supported");
+ break;
#endif
- case Opt_alloc:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
-
- if (!strcmp(name, "default")) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_DEFAULT;
- } else if (!strcmp(name, "reuse")) {
- F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_fsync:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "posix")) {
- F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_POSIX;
- } else if (!strcmp(name, "strict")) {
- F2FS_OPTION(sbi).fsync_mode = FSYNC_MODE_STRICT;
- } else if (!strcmp(name, "nobarrier")) {
- F2FS_OPTION(sbi).fsync_mode =
- FSYNC_MODE_NOBARRIER;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_test_dummy_encryption:
- ret = f2fs_set_test_dummy_encryption(sb, p, &args[0],
- is_remount);
- if (ret)
- return ret;
- break;
- case Opt_inlinecrypt:
+ case Opt_alloc:
+ F2FS_CTX_INFO(ctx).alloc_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_alloc_mode;
+ break;
+ case Opt_fsync:
+ F2FS_CTX_INFO(ctx).fsync_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_fsync_mode;
+ break;
+ case Opt_test_dummy_encryption:
+ ret = f2fs_parse_test_dummy_encryption(param, ctx);
+ if (ret)
+ return ret;
+ break;
+ case Opt_inlinecrypt:
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
- sb->s_flags |= SB_INLINECRYPT;
+ ctx_set_opt(ctx, F2FS_MOUNT_INLINECRYPT);
#else
- f2fs_info(sbi, "inline encryption not supported");
+ f2fs_info(NULL, "inline encryption not supported");
#endif
- break;
+ break;
+ case Opt_checkpoint:
+ /*
+ * Initialize args struct so we know whether arg was
+ * found; some options take optional arguments.
+ */
+ args[0].from = args[0].to = NULL;
+ arg = 0;
+
+ /* revert to match_table for checkpoint= options */
+ token = match_token(param->string, f2fs_checkpoint_tokens, args);
+ switch (token) {
case Opt_checkpoint_disable_cap_perc:
if (args->from && match_int(args, &arg))
return -EINVAL;
if (arg < 0 || arg > 100)
return -EINVAL;
- F2FS_OPTION(sbi).unusable_cap_perc = arg;
- set_opt(sbi, DISABLE_CHECKPOINT);
+ F2FS_CTX_INFO(ctx).unusable_cap_perc = arg;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc;
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_disable_cap:
if (args->from && match_int(args, &arg))
return -EINVAL;
- F2FS_OPTION(sbi).unusable_cap = arg;
- set_opt(sbi, DISABLE_CHECKPOINT);
+ F2FS_CTX_INFO(ctx).unusable_cap = arg;
+ ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap;
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_disable:
- set_opt(sbi, DISABLE_CHECKPOINT);
+ ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_enable:
- clear_opt(sbi, DISABLE_CHECKPOINT);
- break;
- case Opt_checkpoint_merge:
- set_opt(sbi, MERGE_CHECKPOINT);
- break;
- case Opt_nocheckpoint_merge:
- clear_opt(sbi, MERGE_CHECKPOINT);
+ ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT);
break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case Opt_checkpoint_merge:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_MERGE_CHECKPOINT);
+ break;
#ifdef CONFIG_F2FS_FS_COMPRESSION
- case Opt_compress_algorithm:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "lzo")) {
+ case Opt_compress_algorithm:
+ name = param->string;
+ if (!strcmp(name, "lzo")) {
#ifdef CONFIG_F2FS_FS_LZO
- F2FS_OPTION(sbi).compress_level = 0;
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_LZO;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZO;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support lzo compression");
+ f2fs_info(NULL, "kernel doesn't support lzo compression");
#endif
- } else if (!strncmp(name, "lz4", 3)) {
+ } else if (!strncmp(name, "lz4", 3)) {
#ifdef CONFIG_F2FS_FS_LZ4
- ret = f2fs_set_lz4hc_level(sbi, name);
- if (ret) {
- kfree(name);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_LZ4;
+ ret = f2fs_set_lz4hc_level(ctx, name);
+ if (ret)
+ return -EINVAL;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZ4;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support lz4 compression");
+ f2fs_info(NULL, "kernel doesn't support lz4 compression");
#endif
- } else if (!strncmp(name, "zstd", 4)) {
+ } else if (!strncmp(name, "zstd", 4)) {
#ifdef CONFIG_F2FS_FS_ZSTD
- ret = f2fs_set_zstd_level(sbi, name);
- if (ret) {
- kfree(name);
- return -EINVAL;
- }
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_ZSTD;
+ ret = f2fs_set_zstd_level(ctx, name);
+ if (ret)
+ return -EINVAL;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_ZSTD;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support zstd compression");
+ f2fs_info(NULL, "kernel doesn't support zstd compression");
#endif
- } else if (!strcmp(name, "lzo-rle")) {
+ } else if (!strcmp(name, "lzo-rle")) {
#ifdef CONFIG_F2FS_FS_LZORLE
- F2FS_OPTION(sbi).compress_level = 0;
- F2FS_OPTION(sbi).compress_algorithm =
- COMPRESS_LZORLE;
+ F2FS_CTX_INFO(ctx).compress_level = 0;
+ F2FS_CTX_INFO(ctx).compress_algorithm = COMPRESS_LZORLE;
+ ctx->spec_mask |= F2FS_SPEC_compress_level;
+ ctx->spec_mask |= F2FS_SPEC_compress_algorithm;
#else
- f2fs_info(sbi, "kernel doesn't support lzorle compression");
+ f2fs_info(NULL, "kernel doesn't support lzorle compression");
#endif
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
+ } else
+ return -EINVAL;
+ break;
+ case Opt_compress_log_size:
+ if (result.uint_32 < MIN_COMPRESS_LOG_SIZE ||
+ result.uint_32 > MAX_COMPRESS_LOG_SIZE) {
+ f2fs_err(NULL,
+ "Compress cluster log size is out of range");
+ return -EINVAL;
+ }
+ F2FS_CTX_INFO(ctx).compress_log_size = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_compress_log_size;
+ break;
+ case Opt_compress_extension:
+ name = param->string;
+ ext = F2FS_CTX_INFO(ctx).extensions;
+ ext_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt;
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN ||
+ ext_cnt >= COMPRESS_EXT_NUM) {
+ f2fs_err(NULL, "invalid extension length/number");
+ return -EINVAL;
+ }
+
+ if (is_compress_extension_exist(&ctx->info, name, true))
break;
- case Opt_compress_log_size:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- if (args->from && match_int(args, &arg))
- return -EINVAL;
- if (arg < MIN_COMPRESS_LOG_SIZE ||
- arg > MAX_COMPRESS_LOG_SIZE) {
- f2fs_err(sbi,
- "Compress cluster log size is out of range");
- return -EINVAL;
- }
- F2FS_OPTION(sbi).compress_log_size = arg;
+
+ ret = strscpy(ext[ext_cnt], name, F2FS_EXTENSION_LEN);
+ if (ret < 0)
+ return ret;
+ F2FS_CTX_INFO(ctx).compress_ext_cnt++;
+ ctx->spec_mask |= F2FS_SPEC_compress_extension;
+ break;
+ case Opt_nocompress_extension:
+ name = param->string;
+ noext = F2FS_CTX_INFO(ctx).noextensions;
+ noext_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt;
+
+ if (strlen(name) >= F2FS_EXTENSION_LEN ||
+ noext_cnt >= COMPRESS_EXT_NUM) {
+ f2fs_err(NULL, "invalid extension length/number");
+ return -EINVAL;
+ }
+
+ if (is_compress_extension_exist(&ctx->info, name, false))
break;
- case Opt_compress_extension:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- ext = F2FS_OPTION(sbi).extensions;
- ext_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ ret = strscpy(noext[noext_cnt], name, F2FS_EXTENSION_LEN);
+ if (ret < 0)
+ return ret;
+ F2FS_CTX_INFO(ctx).nocompress_ext_cnt++;
+ ctx->spec_mask |= F2FS_SPEC_nocompress_extension;
+ break;
+ case Opt_compress_chksum:
+ F2FS_CTX_INFO(ctx).compress_chksum = true;
+ ctx->spec_mask |= F2FS_SPEC_compress_chksum;
+ break;
+ case Opt_compress_mode:
+ F2FS_CTX_INFO(ctx).compress_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_compress_mode;
+ break;
+ case Opt_compress_cache:
+ ctx_set_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE);
+ break;
+#else
+ case Opt_compress_algorithm:
+ case Opt_compress_log_size:
+ case Opt_compress_extension:
+ case Opt_nocompress_extension:
+ case Opt_compress_chksum:
+ case Opt_compress_mode:
+ case Opt_compress_cache:
+ f2fs_info(NULL, "compression options not supported");
+ break;
+#endif
+ case Opt_atgc:
+ ctx_set_opt(ctx, F2FS_MOUNT_ATGC);
+ break;
+ case Opt_gc_merge:
+ if (result.negated)
+ ctx_clear_opt(ctx, F2FS_MOUNT_GC_MERGE);
+ else
+ ctx_set_opt(ctx, F2FS_MOUNT_GC_MERGE);
+ break;
+ case Opt_discard_unit:
+ F2FS_CTX_INFO(ctx).discard_unit = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_discard_unit;
+ break;
+ case Opt_memory_mode:
+ F2FS_CTX_INFO(ctx).memory_mode = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_memory_mode;
+ break;
+ case Opt_age_extent_cache:
+ ctx_set_opt(ctx, F2FS_MOUNT_AGE_EXTENT_CACHE);
+ break;
+ case Opt_errors:
+ F2FS_CTX_INFO(ctx).errors = result.uint_32;
+ ctx->spec_mask |= F2FS_SPEC_errors;
+ break;
+ case Opt_nat_bits:
+ ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS);
+ break;
+ }
+ return 0;
+}
- if (strlen(name) >= F2FS_EXTENSION_LEN ||
- ext_cnt >= COMPRESS_EXT_NUM) {
- f2fs_err(sbi,
- "invalid extension length/number");
- kfree(name);
- return -EINVAL;
- }
+/*
+ * Check quota settings consistency.
+ */
+static int f2fs_check_quota_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ #ifdef CONFIG_QUOTA
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ bool quota_feature = f2fs_sb_has_quota_ino(sbi);
+ bool quota_turnon = sb_any_quota_loaded(sb);
+ char *old_qname, *new_qname;
+ bool usr_qf_name, grp_qf_name, prj_qf_name, usrquota, grpquota, prjquota;
+ int i;
- if (is_compress_extension_exist(sbi, name, true)) {
- kfree(name);
- break;
- }
+ /*
+ * We do the test below only for project quotas. 'usrquota' and
+ * 'grpquota' mount options are allowed even without quota feature
+ * to support legacy quotas in quota files.
+ */
+ if (ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA) &&
+ !f2fs_sb_has_project_quota(sbi)) {
+ f2fs_err(sbi, "Project quota feature not enabled. Cannot enable project quota enforcement.");
+ return -EINVAL;
+ }
- ret = strscpy(ext[ext_cnt], name);
- if (ret < 0) {
- kfree(name);
- return ret;
- }
- F2FS_OPTION(sbi).compress_ext_cnt++;
- kfree(name);
- break;
- case Opt_nocompress_extension:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
+ if (ctx->qname_mask) {
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!(ctx->qname_mask & (1 << i)))
+ continue;
- noext = F2FS_OPTION(sbi).noextensions;
- noext_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ old_qname = F2FS_OPTION(sbi).s_qf_names[i];
+ new_qname = F2FS_CTX_INFO(ctx).s_qf_names[i];
+ if (quota_turnon &&
+ !!old_qname != !!new_qname)
+ goto err_jquota_change;
- if (strlen(name) >= F2FS_EXTENSION_LEN ||
- noext_cnt >= COMPRESS_EXT_NUM) {
- f2fs_err(sbi,
- "invalid extension length/number");
- kfree(name);
- return -EINVAL;
+ if (old_qname) {
+ if (strcmp(old_qname, new_qname) == 0) {
+ ctx->qname_mask &= ~(1 << i);
+ continue;
+ }
+ goto err_jquota_specified;
}
- if (is_compress_extension_exist(sbi, name, false)) {
- kfree(name);
- break;
+ if (quota_feature) {
+ f2fs_info(sbi, "QUOTA feature is enabled, so ignore qf_name");
+ ctx->qname_mask &= ~(1 << i);
+ kfree(F2FS_CTX_INFO(ctx).s_qf_names[i]);
+ F2FS_CTX_INFO(ctx).s_qf_names[i] = NULL;
}
+ }
+ }
+
+ /* Make sure we don't mix old and new quota format */
+ usr_qf_name = F2FS_OPTION(sbi).s_qf_names[USRQUOTA] ||
+ F2FS_CTX_INFO(ctx).s_qf_names[USRQUOTA];
+ grp_qf_name = F2FS_OPTION(sbi).s_qf_names[GRPQUOTA] ||
+ F2FS_CTX_INFO(ctx).s_qf_names[GRPQUOTA];
+ prj_qf_name = F2FS_OPTION(sbi).s_qf_names[PRJQUOTA] ||
+ F2FS_CTX_INFO(ctx).s_qf_names[PRJQUOTA];
+ usrquota = test_opt(sbi, USRQUOTA) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ grpquota = test_opt(sbi, GRPQUOTA) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ prjquota = test_opt(sbi, PRJQUOTA) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+
+ if (usr_qf_name) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_USRQUOTA);
+ usrquota = false;
+ }
+ if (grp_qf_name) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_GRPQUOTA);
+ grpquota = false;
+ }
+ if (prj_qf_name) {
+ ctx_clear_opt(ctx, F2FS_MOUNT_PRJQUOTA);
+ prjquota = false;
+ }
+ if (usr_qf_name || grp_qf_name || prj_qf_name) {
+ if (grpquota || usrquota || prjquota) {
+ f2fs_err(sbi, "old and new quota format mixing");
+ return -EINVAL;
+ }
+ if (!(ctx->spec_mask & F2FS_SPEC_jqfmt ||
+ F2FS_OPTION(sbi).s_jquota_fmt)) {
+ f2fs_err(sbi, "journaled quota format not specified");
+ return -EINVAL;
+ }
+ }
+ return 0;
+
+err_jquota_change:
+ f2fs_err(sbi, "Cannot change journaled quota options when quota turned on");
+ return -EINVAL;
+err_jquota_specified:
+ f2fs_err(sbi, "%s quota file already specified",
+ QTYPE2NAME(i));
+ return -EINVAL;
- ret = strscpy(noext[noext_cnt], name);
- if (ret < 0) {
- kfree(name);
- return ret;
- }
- F2FS_OPTION(sbi).nocompress_ext_cnt++;
- kfree(name);
- break;
- case Opt_compress_chksum:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- F2FS_OPTION(sbi).compress_chksum = true;
- break;
- case Opt_compress_mode:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "fs")) {
- F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS;
- } else if (!strcmp(name, "user")) {
- F2FS_OPTION(sbi).compress_mode = COMPR_MODE_USER;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_compress_cache:
- if (!f2fs_sb_has_compression(sbi)) {
- f2fs_info(sbi, "Image doesn't support compression");
- break;
- }
- set_opt(sbi, COMPRESS_CACHE);
- break;
#else
- case Opt_compress_algorithm:
- case Opt_compress_log_size:
- case Opt_compress_extension:
- case Opt_nocompress_extension:
- case Opt_compress_chksum:
- case Opt_compress_mode:
- case Opt_compress_cache:
- f2fs_info(sbi, "compression options not supported");
- break;
+ if (f2fs_readonly(sbi->sb))
+ return 0;
+ if (f2fs_sb_has_quota_ino(sbi)) {
+ f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+ return -EINVAL;
+ }
+ if (f2fs_sb_has_project_quota(sbi)) {
+ f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+ return -EINVAL;
+ }
+
+ return 0;
#endif
- case Opt_atgc:
- set_opt(sbi, ATGC);
- break;
- case Opt_gc_merge:
- set_opt(sbi, GC_MERGE);
- break;
- case Opt_nogc_merge:
- clear_opt(sbi, GC_MERGE);
- break;
- case Opt_discard_unit:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "block")) {
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_BLOCK;
- } else if (!strcmp(name, "segment")) {
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_SEGMENT;
- } else if (!strcmp(name, "section")) {
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_SECTION;
- } else {
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- break;
- case Opt_memory_mode:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "normal")) {
- F2FS_OPTION(sbi).memory_mode =
- MEMORY_MODE_NORMAL;
- } else if (!strcmp(name, "low")) {
- F2FS_OPTION(sbi).memory_mode =
- MEMORY_MODE_LOW;
- } else {
- kfree(name);
- return -EINVAL;
+}
+
+static int f2fs_check_test_dummy_encryption(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy))
+ return 0;
+
+ if (!f2fs_sb_has_encrypt(sbi)) {
+ f2fs_err(sbi, "Encrypt feature is off");
+ return -EINVAL;
+ }
+
+ /*
+ * This mount option is just for testing, and it's not worthwhile to
+ * implement the extra complexity (e.g. RCU protection) that would be
+ * needed to allow it to be set or changed during remount. We do allow
+ * it to be specified during remount, but only if there is no change.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ if (fscrypt_dummy_policies_equal(&F2FS_OPTION(sbi).dummy_enc_policy,
+ &F2FS_CTX_INFO(ctx).dummy_enc_policy))
+ return 0;
+ f2fs_warn(sbi, "Can't set or change test_dummy_encryption on remount");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static inline bool test_compression_spec(unsigned int mask)
+{
+ return mask & (F2FS_SPEC_compress_algorithm
+ | F2FS_SPEC_compress_log_size
+ | F2FS_SPEC_compress_extension
+ | F2FS_SPEC_nocompress_extension
+ | F2FS_SPEC_compress_chksum
+ | F2FS_SPEC_compress_mode);
+}
+
+static inline void clear_compression_spec(struct f2fs_fs_context *ctx)
+{
+ ctx->spec_mask &= ~(F2FS_SPEC_compress_algorithm
+ | F2FS_SPEC_compress_log_size
+ | F2FS_SPEC_compress_extension
+ | F2FS_SPEC_nocompress_extension
+ | F2FS_SPEC_compress_chksum
+ | F2FS_SPEC_compress_mode);
+}
+
+static int f2fs_check_compression(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int i, cnt;
+
+ if (!f2fs_sb_has_compression(sbi)) {
+ if (test_compression_spec(ctx->spec_mask) ||
+ ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE))
+ f2fs_info(sbi, "Image doesn't support compression");
+ clear_compression_spec(ctx);
+ ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE;
+ return 0;
+ }
+ if (ctx->spec_mask & F2FS_SPEC_compress_extension) {
+ cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt;
+ for (i = 0; i < F2FS_CTX_INFO(ctx).compress_ext_cnt; i++) {
+ if (is_compress_extension_exist(&F2FS_OPTION(sbi),
+ F2FS_CTX_INFO(ctx).extensions[i], true)) {
+ F2FS_CTX_INFO(ctx).extensions[i][0] = '\0';
+ cnt--;
}
- kfree(name);
- break;
- case Opt_age_extent_cache:
- set_opt(sbi, AGE_EXTENT_CACHE);
- break;
- case Opt_errors:
- name = match_strdup(&args[0]);
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "remount-ro")) {
- F2FS_OPTION(sbi).errors =
- MOUNT_ERRORS_READONLY;
- } else if (!strcmp(name, "continue")) {
- F2FS_OPTION(sbi).errors =
- MOUNT_ERRORS_CONTINUE;
- } else if (!strcmp(name, "panic")) {
- F2FS_OPTION(sbi).errors =
- MOUNT_ERRORS_PANIC;
- } else {
- kfree(name);
- return -EINVAL;
+ }
+ if (F2FS_OPTION(sbi).compress_ext_cnt + cnt > COMPRESS_EXT_NUM) {
+ f2fs_err(sbi, "invalid extension length/number");
+ return -EINVAL;
+ }
+ }
+ if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) {
+ cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt;
+ for (i = 0; i < F2FS_CTX_INFO(ctx).nocompress_ext_cnt; i++) {
+ if (is_compress_extension_exist(&F2FS_OPTION(sbi),
+ F2FS_CTX_INFO(ctx).noextensions[i], false)) {
+ F2FS_CTX_INFO(ctx).noextensions[i][0] = '\0';
+ cnt--;
}
- kfree(name);
- break;
- default:
- f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value",
- p);
+ }
+ if (F2FS_OPTION(sbi).nocompress_ext_cnt + cnt > COMPRESS_EXT_NUM) {
+ f2fs_err(sbi, "invalid noextension length/number");
return -EINVAL;
}
}
-default_check:
-#ifdef CONFIG_QUOTA
- if (f2fs_check_quota_options(sbi))
+
+ if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions,
+ F2FS_CTX_INFO(ctx).nocompress_ext_cnt,
+ F2FS_CTX_INFO(ctx).extensions,
+ F2FS_CTX_INFO(ctx).compress_ext_cnt)) {
+ f2fs_err(sbi, "new noextensions conflicts with new extensions");
return -EINVAL;
-#else
- if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sbi->sb)) {
- f2fs_info(sbi, "Filesystem with quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+ }
+ if (f2fs_test_compress_extension(F2FS_CTX_INFO(ctx).noextensions,
+ F2FS_CTX_INFO(ctx).nocompress_ext_cnt,
+ F2FS_OPTION(sbi).extensions,
+ F2FS_OPTION(sbi).compress_ext_cnt)) {
+ f2fs_err(sbi, "new noextensions conflicts with old extensions");
return -EINVAL;
}
- if (f2fs_sb_has_project_quota(sbi) && !f2fs_readonly(sbi->sb)) {
- f2fs_err(sbi, "Filesystem with project quota feature cannot be mounted RDWR without CONFIG_QUOTA");
+ if (f2fs_test_compress_extension(F2FS_OPTION(sbi).noextensions,
+ F2FS_OPTION(sbi).nocompress_ext_cnt,
+ F2FS_CTX_INFO(ctx).extensions,
+ F2FS_CTX_INFO(ctx).compress_ext_cnt)) {
+ f2fs_err(sbi, "new extensions conflicts with old noextensions");
return -EINVAL;
}
#endif
+ return 0;
+}
+
+static int f2fs_check_opt_consistency(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ int err;
+
+ if (ctx_test_opt(ctx, F2FS_MOUNT_NORECOVERY) && !f2fs_readonly(sb))
+ return -EINVAL;
+
+ if (f2fs_hw_should_discard(sbi) &&
+ (ctx->opt_mask & F2FS_MOUNT_DISCARD) &&
+ !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) {
+ f2fs_warn(sbi, "discard is required for zoned block devices");
+ return -EINVAL;
+ }
+
+ if (!f2fs_hw_support_discard(sbi) &&
+ (ctx->opt_mask & F2FS_MOUNT_DISCARD) &&
+ ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) {
+ f2fs_warn(sbi, "device does not support discard");
+ ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD);
+ ctx->opt_mask &= ~F2FS_MOUNT_DISCARD;
+ }
+
+ if (f2fs_sb_has_device_alias(sbi) &&
+ (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) &&
+ !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) {
+ f2fs_err(sbi, "device aliasing requires extent cache");
+ return -EINVAL;
+ }
+
+ if (test_opt(sbi, RESERVE_ROOT) &&
+ (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) &&
+ ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) {
+ f2fs_info(sbi, "Preserve previous reserve_root=%u",
+ F2FS_OPTION(sbi).root_reserved_blocks);
+ ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT);
+ ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT;
+ }
+
+ err = f2fs_check_test_dummy_encryption(fc, sb);
+ if (err)
+ return err;
+
+ err = f2fs_check_compression(fc, sb);
+ if (err)
+ return err;
+
+ err = f2fs_check_quota_consistency(fc, sb);
+ if (err)
+ return err;
if (!IS_ENABLED(CONFIG_UNICODE) && f2fs_sb_has_casefold(sbi)) {
f2fs_err(sbi,
@@ -1355,15 +1449,19 @@ default_check:
* devices, but mandatory for host-managed zoned block devices.
*/
if (f2fs_sb_has_blkzoned(sbi)) {
+ if (F2FS_CTX_INFO(ctx).bggc_mode == BGGC_MODE_OFF) {
+ f2fs_warn(sbi, "zoned devices need bggc");
+ return -EINVAL;
+ }
#ifdef CONFIG_BLK_DEV_ZONED
- if (F2FS_OPTION(sbi).discard_unit !=
- DISCARD_UNIT_SECTION) {
+ if ((ctx->spec_mask & F2FS_SPEC_discard_unit) &&
+ F2FS_CTX_INFO(ctx).discard_unit != DISCARD_UNIT_SECTION) {
f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default");
- F2FS_OPTION(sbi).discard_unit =
- DISCARD_UNIT_SECTION;
+ F2FS_CTX_INFO(ctx).discard_unit = DISCARD_UNIT_SECTION;
}
- if (F2FS_OPTION(sbi).fs_mode != FS_MODE_LFS) {
+ if ((ctx->spec_mask & F2FS_SPEC_mode) &&
+ F2FS_CTX_INFO(ctx).fs_mode != FS_MODE_LFS) {
f2fs_info(sbi, "Only lfs mode is allowed with zoned block device feature");
return -EINVAL;
}
@@ -1373,43 +1471,25 @@ default_check:
#endif
}
-#ifdef CONFIG_F2FS_FS_COMPRESSION
- if (f2fs_test_compress_extension(sbi)) {
- f2fs_err(sbi, "invalid compress or nocompress extension");
- return -EINVAL;
- }
-#endif
-
- if (test_opt(sbi, INLINE_XATTR_SIZE)) {
- int min_size, max_size;
-
+ if (ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR_SIZE)) {
if (!f2fs_sb_has_extra_attr(sbi) ||
!f2fs_sb_has_flexible_inline_xattr(sbi)) {
f2fs_err(sbi, "extra_attr or flexible_inline_xattr feature is off");
return -EINVAL;
}
- if (!test_opt(sbi, INLINE_XATTR)) {
+ if (!ctx_test_opt(ctx, F2FS_MOUNT_INLINE_XATTR) && !test_opt(sbi, INLINE_XATTR)) {
f2fs_err(sbi, "inline_xattr_size option should be set with inline_xattr option");
return -EINVAL;
}
-
- min_size = MIN_INLINE_XATTR_SIZE;
- max_size = MAX_INLINE_XATTR_SIZE;
-
- if (F2FS_OPTION(sbi).inline_xattr_size < min_size ||
- F2FS_OPTION(sbi).inline_xattr_size > max_size) {
- f2fs_err(sbi, "inline xattr size is out of range: %d ~ %d",
- min_size, max_size);
- return -EINVAL;
- }
}
- if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
+ if (ctx_test_opt(ctx, F2FS_MOUNT_ATGC) &&
+ F2FS_CTX_INFO(ctx).fs_mode == FS_MODE_LFS) {
f2fs_err(sbi, "LFS is not compatible with ATGC");
return -EINVAL;
}
- if (f2fs_is_readonly(sbi) && test_opt(sbi, FLUSH_MERGE)) {
+ if (f2fs_is_readonly(sbi) && ctx_test_opt(ctx, F2FS_MOUNT_FLUSH_MERGE)) {
f2fs_err(sbi, "FLUSH_MERGE not compatible with readonly mode");
return -EINVAL;
}
@@ -1421,6 +1501,190 @@ default_check:
return 0;
}
+static void f2fs_apply_quota_options(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_QUOTA
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ bool quota_feature = f2fs_sb_has_quota_ino(sbi);
+ char *qname;
+ int i;
+
+ if (quota_feature)
+ return;
+
+ for (i = 0; i < MAXQUOTAS; i++) {
+ if (!(ctx->qname_mask & (1 << i)))
+ continue;
+
+ qname = F2FS_CTX_INFO(ctx).s_qf_names[i];
+ if (qname) {
+ qname = kstrdup(F2FS_CTX_INFO(ctx).s_qf_names[i],
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_opt(sbi, QUOTA);
+ }
+ F2FS_OPTION(sbi).s_qf_names[i] = qname;
+ }
+
+ if (ctx->spec_mask & F2FS_SPEC_jqfmt)
+ F2FS_OPTION(sbi).s_jquota_fmt = F2FS_CTX_INFO(ctx).s_jquota_fmt;
+
+ if (quota_feature && F2FS_OPTION(sbi).s_jquota_fmt) {
+ f2fs_info(sbi, "QUOTA feature is enabled, so ignore jquota_fmt");
+ F2FS_OPTION(sbi).s_jquota_fmt = 0;
+ }
+#endif
+}
+
+static void f2fs_apply_test_dummy_encryption(struct fs_context *fc,
+ struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ if (!fscrypt_is_dummy_policy_set(&F2FS_CTX_INFO(ctx).dummy_enc_policy) ||
+ /* if already set, it was already verified to be the same */
+ fscrypt_is_dummy_policy_set(&F2FS_OPTION(sbi).dummy_enc_policy))
+ return;
+ swap(F2FS_OPTION(sbi).dummy_enc_policy, F2FS_CTX_INFO(ctx).dummy_enc_policy);
+ f2fs_warn(sbi, "Test dummy encryption mode enabled");
+}
+
+static void f2fs_apply_compression(struct fs_context *fc,
+ struct super_block *sb)
+{
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ unsigned char (*ctx_ext)[F2FS_EXTENSION_LEN];
+ unsigned char (*sbi_ext)[F2FS_EXTENSION_LEN];
+ int ctx_cnt, sbi_cnt, i;
+
+ if (ctx->spec_mask & F2FS_SPEC_compress_level)
+ F2FS_OPTION(sbi).compress_level =
+ F2FS_CTX_INFO(ctx).compress_level;
+ if (ctx->spec_mask & F2FS_SPEC_compress_algorithm)
+ F2FS_OPTION(sbi).compress_algorithm =
+ F2FS_CTX_INFO(ctx).compress_algorithm;
+ if (ctx->spec_mask & F2FS_SPEC_compress_log_size)
+ F2FS_OPTION(sbi).compress_log_size =
+ F2FS_CTX_INFO(ctx).compress_log_size;
+ if (ctx->spec_mask & F2FS_SPEC_compress_chksum)
+ F2FS_OPTION(sbi).compress_chksum =
+ F2FS_CTX_INFO(ctx).compress_chksum;
+ if (ctx->spec_mask & F2FS_SPEC_compress_mode)
+ F2FS_OPTION(sbi).compress_mode =
+ F2FS_CTX_INFO(ctx).compress_mode;
+ if (ctx->spec_mask & F2FS_SPEC_compress_extension) {
+ ctx_ext = F2FS_CTX_INFO(ctx).extensions;
+ ctx_cnt = F2FS_CTX_INFO(ctx).compress_ext_cnt;
+ sbi_ext = F2FS_OPTION(sbi).extensions;
+ sbi_cnt = F2FS_OPTION(sbi).compress_ext_cnt;
+ for (i = 0; i < ctx_cnt; i++) {
+ if (strlen(ctx_ext[i]) == 0)
+ continue;
+ strscpy(sbi_ext[sbi_cnt], ctx_ext[i]);
+ sbi_cnt++;
+ }
+ F2FS_OPTION(sbi).compress_ext_cnt = sbi_cnt;
+ }
+ if (ctx->spec_mask & F2FS_SPEC_nocompress_extension) {
+ ctx_ext = F2FS_CTX_INFO(ctx).noextensions;
+ ctx_cnt = F2FS_CTX_INFO(ctx).nocompress_ext_cnt;
+ sbi_ext = F2FS_OPTION(sbi).noextensions;
+ sbi_cnt = F2FS_OPTION(sbi).nocompress_ext_cnt;
+ for (i = 0; i < ctx_cnt; i++) {
+ if (strlen(ctx_ext[i]) == 0)
+ continue;
+ strscpy(sbi_ext[sbi_cnt], ctx_ext[i]);
+ sbi_cnt++;
+ }
+ F2FS_OPTION(sbi).nocompress_ext_cnt = sbi_cnt;
+ }
+#endif
+}
+
+static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
+ F2FS_OPTION(sbi).opt &= ~ctx->opt_mask;
+ F2FS_OPTION(sbi).opt |= F2FS_CTX_INFO(ctx).opt;
+
+ if (ctx->spec_mask & F2FS_SPEC_background_gc)
+ F2FS_OPTION(sbi).bggc_mode = F2FS_CTX_INFO(ctx).bggc_mode;
+ if (ctx->spec_mask & F2FS_SPEC_inline_xattr_size)
+ F2FS_OPTION(sbi).inline_xattr_size =
+ F2FS_CTX_INFO(ctx).inline_xattr_size;
+ if (ctx->spec_mask & F2FS_SPEC_active_logs)
+ F2FS_OPTION(sbi).active_logs = F2FS_CTX_INFO(ctx).active_logs;
+ if (ctx->spec_mask & F2FS_SPEC_reserve_root)
+ F2FS_OPTION(sbi).root_reserved_blocks =
+ F2FS_CTX_INFO(ctx).root_reserved_blocks;
+ if (ctx->spec_mask & F2FS_SPEC_resgid)
+ F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid;
+ if (ctx->spec_mask & F2FS_SPEC_resuid)
+ F2FS_OPTION(sbi).s_resuid = F2FS_CTX_INFO(ctx).s_resuid;
+ if (ctx->spec_mask & F2FS_SPEC_mode)
+ F2FS_OPTION(sbi).fs_mode = F2FS_CTX_INFO(ctx).fs_mode;
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (ctx->spec_mask & F2FS_SPEC_fault_injection)
+ (void)f2fs_build_fault_attr(sbi,
+ F2FS_CTX_INFO(ctx).fault_info.inject_rate, 0, FAULT_RATE);
+ if (ctx->spec_mask & F2FS_SPEC_fault_type)
+ (void)f2fs_build_fault_attr(sbi, 0,
+ F2FS_CTX_INFO(ctx).fault_info.inject_type, FAULT_TYPE);
+#endif
+ if (ctx->spec_mask & F2FS_SPEC_alloc_mode)
+ F2FS_OPTION(sbi).alloc_mode = F2FS_CTX_INFO(ctx).alloc_mode;
+ if (ctx->spec_mask & F2FS_SPEC_fsync_mode)
+ F2FS_OPTION(sbi).fsync_mode = F2FS_CTX_INFO(ctx).fsync_mode;
+ if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap)
+ F2FS_OPTION(sbi).unusable_cap = F2FS_CTX_INFO(ctx).unusable_cap;
+ if (ctx->spec_mask & F2FS_SPEC_checkpoint_disable_cap_perc)
+ F2FS_OPTION(sbi).unusable_cap_perc =
+ F2FS_CTX_INFO(ctx).unusable_cap_perc;
+ if (ctx->spec_mask & F2FS_SPEC_discard_unit)
+ F2FS_OPTION(sbi).discard_unit = F2FS_CTX_INFO(ctx).discard_unit;
+ if (ctx->spec_mask & F2FS_SPEC_memory_mode)
+ F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode;
+ if (ctx->spec_mask & F2FS_SPEC_errors)
+ F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors;
+
+ f2fs_apply_compression(fc, sb);
+ f2fs_apply_test_dummy_encryption(fc, sb);
+ f2fs_apply_quota_options(fc, sb);
+}
+
+static int f2fs_sanity_check_options(struct f2fs_sb_info *sbi, bool remount)
+{
+ if (f2fs_sb_has_device_alias(sbi) &&
+ !test_opt(sbi, READ_EXTENT_CACHE)) {
+ f2fs_err(sbi, "device aliasing requires extent cache");
+ return -EINVAL;
+ }
+
+ if (!remount)
+ return 0;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (f2fs_sb_has_blkzoned(sbi) &&
+ sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
+ f2fs_err(sbi,
+ "zoned: max open zones %u is too small, need at least %u open zones",
+ sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
+ return -EINVAL;
+ }
+#endif
+ if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) {
+ f2fs_warn(sbi, "LFS is not compatible with IPU");
+ return -EINVAL;
+ }
+ return 0;
+}
+
static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
@@ -1437,10 +1701,12 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
atomic_set(&fi->i_compr_blocks, 0);
+ atomic_set(&fi->open_count, 0);
init_f2fs_rwsem(&fi->i_sem);
spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
+ INIT_LIST_HEAD(&fi->gdonate_list);
init_f2fs_rwsem(&fi->i_gc_rwsem[READ]);
init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]);
init_f2fs_rwsem(&fi->i_xattr_sem);
@@ -1527,6 +1793,12 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync)
inc_page_count(sbi, F2FS_DIRTY_IMETA);
}
spin_unlock(&sbi->inode_lock[DIRTY_META]);
+
+ /* if atomic write is not committed, set inode w/ atomic dirty */
+ if (!ret && f2fs_is_atomic_file(inode) &&
+ !is_inode_flag_set(inode, FI_ATOMIC_COMMITTED))
+ set_inode_flag(inode, FI_ATOMIC_DIRTIED);
+
return ret;
}
@@ -1706,7 +1978,7 @@ static void f2fs_put_super(struct super_block *sb)
destroy_percpu_info(sbi);
f2fs_destroy_iostat(sbi);
for (i = 0; i < NR_PAGE_TYPE; i++)
- kvfree(sbi->write_io[i]);
+ kfree(sbi->write_io[i]);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
@@ -1737,22 +2009,28 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
static int f2fs_freeze(struct super_block *sb)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+
if (f2fs_readonly(sb))
return 0;
/* IO error happened before */
- if (unlikely(f2fs_cp_error(F2FS_SB(sb))))
+ if (unlikely(f2fs_cp_error(sbi)))
return -EIO;
/* must be clean, since sync_filesystem() was already called */
- if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY))
+ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY))
return -EINVAL;
+ sbi->umount_lock_holder = current;
+
/* Let's flush checkpoints and stop the thread. */
- f2fs_flush_ckpt_thread(F2FS_SB(sb));
+ f2fs_flush_ckpt_thread(sbi);
+
+ sbi->umount_lock_holder = NULL;
/* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */
- set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING);
+ set_sbi_flag(sbi, SBI_IS_FREEZING);
return 0;
}
@@ -1791,26 +2069,32 @@ static int f2fs_statfs_project(struct super_block *sb,
limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
dquot->dq_dqb.dqb_bhardlimit);
- if (limit)
- limit >>= sb->s_blocksize_bits;
+ limit >>= sb->s_blocksize_bits;
+
+ if (limit) {
+ uint64_t remaining = 0;
- if (limit && buf->f_blocks > limit) {
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
- buf->f_blocks = limit;
- buf->f_bfree = buf->f_bavail =
- (buf->f_blocks > curblock) ?
- (buf->f_blocks - curblock) : 0;
+ if (limit > curblock)
+ remaining = limit - curblock;
+
+ buf->f_blocks = min(buf->f_blocks, limit);
+ buf->f_bfree = min(buf->f_bfree, remaining);
+ buf->f_bavail = min(buf->f_bavail, remaining);
}
limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
dquot->dq_dqb.dqb_ihardlimit);
- if (limit && buf->f_files > limit) {
- buf->f_files = limit;
- buf->f_ffree =
- (buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
- (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+ if (limit) {
+ uint64_t remaining = 0;
+
+ if (limit > dquot->dq_dqb.dqb_curinodes)
+ remaining = limit - dquot->dq_dqb.dqb_curinodes;
+
+ buf->f_files = min(buf->f_files, limit);
+ buf->f_ffree = min(buf->f_ffree, remaining);
}
spin_unlock(&dquot->dq_dqb_lock);
@@ -1836,7 +2120,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_blocks = total_count - start_count;
spin_lock(&sbi->stat_lock);
-
+ if (sbi->carve_out)
+ buf->f_blocks -= sbi->current_reserved_blocks;
user_block_count = sbi->user_block_count;
total_valid_node_count = valid_node_count(sbi);
avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
@@ -1868,9 +2153,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid = u64_to_fsid(id);
#ifdef CONFIG_QUOTA
- if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) &&
+ if (is_inode_flag_set(d_inode(dentry), FI_PROJ_INHERIT) &&
sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
- f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf);
+ f2fs_statfs_project(sb, F2FS_I(d_inode(dentry))->i_projid, buf);
}
#endif
return 0;
@@ -2128,6 +2413,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC)
seq_printf(seq, ",errors=%s", "panic");
+ if (test_opt(sbi, NAT_BITS))
+ seq_puts(seq, ",nat_bits");
+
return 0;
}
@@ -2175,8 +2463,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
set_opt(sbi, MERGE_CHECKPOINT);
+ set_opt(sbi, LAZYTIME);
F2FS_OPTION(sbi).unusable_cap = 0;
- sbi->sb->s_flags |= SB_LAZYTIME;
if (!f2fs_is_readonly(sbi))
set_opt(sbi, FLUSH_MERGE);
if (f2fs_sb_has_blkzoned(sbi))
@@ -2191,7 +2479,7 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
set_opt(sbi, POSIX_ACL);
#endif
- f2fs_build_fault_attr(sbi, 0, 0);
+ f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL);
}
#ifdef CONFIG_QUOTA
@@ -2301,11 +2589,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi)
f2fs_flush_ckpt_thread(sbi);
}
-static int f2fs_remount(struct super_block *sb, int *flags, char *data)
+static int __f2fs_remount(struct fs_context *fc, struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct f2fs_mount_info org_mount_opt;
unsigned long old_sb_flags;
+ unsigned int flags = fc->sb_flags;
int err;
bool need_restart_gc = false, need_stop_gc = false;
bool need_restart_flush = false, need_stop_flush = false;
@@ -2318,6 +2607,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
bool no_discard = !test_opt(sbi, DISCARD);
bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
bool block_unit_discard = f2fs_block_unit_discard(sbi);
+ bool no_nat_bits = !test_opt(sbi, NAT_BITS);
#ifdef CONFIG_QUOTA
int i, j;
#endif
@@ -2329,6 +2619,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
org_mount_opt = sbi->mount_opt;
old_sb_flags = sb->s_flags;
+ sbi->umount_lock_holder = current;
+
#ifdef CONFIG_QUOTA
org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) {
@@ -2348,7 +2640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
#endif
/* recover superblocks we couldn't write due to previous RO mount */
- if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
+ if (!(flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) {
err = f2fs_commit_super(sbi, false);
f2fs_info(sbi, "Try to recover all the superblocks, ret: %d",
err);
@@ -2358,21 +2650,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
default_options(sbi, true);
- /* parse mount options */
- err = parse_options(sb, data, true);
+ err = f2fs_check_opt_consistency(fc, sb);
if (err)
goto restore_opts;
-#ifdef CONFIG_BLK_DEV_ZONED
- if (f2fs_sb_has_blkzoned(sbi) &&
- sbi->max_open_zones < F2FS_OPTION(sbi).active_logs) {
- f2fs_err(sbi,
- "zoned: max open zones %u is too small, need at least %u open zones",
- sbi->max_open_zones, F2FS_OPTION(sbi).active_logs);
- err = -EINVAL;
+ f2fs_apply_options(fc, sb);
+
+ err = f2fs_sanity_check_options(sbi, true);
+ if (err)
goto restore_opts;
- }
-#endif
/* flush outstanding errors before changing fs state */
flush_work(&sbi->s_error_work);
@@ -2381,20 +2667,20 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* Previous and new state of filesystem is RO,
* so skip checking GC and FLUSH_MERGE conditions.
*/
- if (f2fs_readonly(sb) && (*flags & SB_RDONLY))
+ if (f2fs_readonly(sb) && (flags & SB_RDONLY))
goto skip;
- if (f2fs_dev_is_readonly(sbi) && !(*flags & SB_RDONLY)) {
+ if (f2fs_dev_is_readonly(sbi) && !(flags & SB_RDONLY)) {
err = -EROFS;
goto restore_opts;
}
#ifdef CONFIG_QUOTA
- if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) {
+ if (!f2fs_readonly(sb) && (flags & SB_RDONLY)) {
err = dquot_suspend(sb, -1);
if (err < 0)
goto restore_opts;
- } else if (f2fs_readonly(sb) && !(*flags & SB_RDONLY)) {
+ } else if (f2fs_readonly(sb) && !(flags & SB_RDONLY)) {
/* dquot_resume needs RW */
sb->s_flags &= ~SB_RDONLY;
if (sb_any_quota_suspended(sb)) {
@@ -2406,12 +2692,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
}
}
#endif
- if (f2fs_lfs_mode(sbi) && !IS_F2FS_IPU_DISABLE(sbi)) {
- err = -EINVAL;
- f2fs_warn(sbi, "LFS is not compatible with IPU");
- goto restore_opts;
- }
-
/* disallow enable atgc dynamically */
if (no_atgc == !!test_opt(sbi, ATGC)) {
err = -EINVAL;
@@ -2444,7 +2724,13 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
+ if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) {
+ err = -EINVAL;
+ f2fs_warn(sbi, "switch nat_bits option is not allowed");
+ goto restore_opts;
+ }
+
+ if ((flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) {
err = -EINVAL;
f2fs_warn(sbi, "disabling checkpoint not compatible with read-only");
goto restore_opts;
@@ -2455,7 +2741,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if background_gc = off is passed in mount
* option. Also sync the filesystem.
*/
- if ((*flags & SB_RDONLY) ||
+ if ((flags & SB_RDONLY) ||
(F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF &&
!test_opt(sbi, GC_MERGE))) {
if (sbi->gc_thread) {
@@ -2469,7 +2755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
need_stop_gc = true;
}
- if (*flags & SB_RDONLY) {
+ if (flags & SB_RDONLY) {
sync_inodes_sb(sb);
set_sbi_flag(sbi, SBI_IS_DIRTY);
@@ -2482,7 +2768,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* We stop issue flush thread if FS is mounted as RO
* or if flush_merge is not passed in mount option.
*/
- if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
+ if ((flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
clear_opt(sbi, FLUSH_MERGE);
f2fs_destroy_flush_cmd_control(sbi, false);
need_restart_flush = true;
@@ -2524,11 +2810,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* triggered while remount and we need to take care of it before
* returning from remount.
*/
- if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
+ if ((flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) ||
!test_opt(sbi, MERGE_CHECKPOINT)) {
f2fs_stop_ckpt_thread(sbi);
} else {
- /* Flush if the prevous checkpoint, if exists. */
+ /* Flush if the previous checkpoint, if exists. */
f2fs_flush_ckpt_thread(sbi);
err = f2fs_start_ckpt_thread(sbi);
@@ -2551,7 +2837,9 @@ skip:
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
limit_reserve_root(sbi);
- *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
+ fc->sb_flags = (flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
+
+ sbi->umount_lock_holder = NULL;
return 0;
restore_checkpoint:
if (need_enable_checkpoint) {
@@ -2592,6 +2880,8 @@ restore_opts:
#endif
sbi->mount_opt = org_mount_opt;
sb->s_flags = old_sb_flags;
+
+ sbi->umount_lock_holder = NULL;
return err;
}
@@ -2655,12 +2945,9 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
{
struct inode *inode = sb_dqopt(sb)->files[type];
struct address_space *mapping = inode->i_mapping;
- block_t blkidx = F2FS_BYTES_TO_BLK(off);
- int offset = off & (sb->s_blocksize - 1);
int tocopy;
size_t toread;
loff_t i_size = i_size_read(inode);
- struct page *page;
if (off > i_size)
return 0;
@@ -2669,37 +2956,42 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
len = i_size - off;
toread = len;
while (toread > 0) {
- tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
+ struct folio *folio;
+ size_t offset;
+
repeat:
- page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
- if (IS_ERR(page)) {
- if (PTR_ERR(page) == -ENOMEM) {
+ folio = mapping_read_folio_gfp(mapping, off >> PAGE_SHIFT,
+ GFP_NOFS);
+ if (IS_ERR(folio)) {
+ if (PTR_ERR(folio) == -ENOMEM) {
memalloc_retry_wait(GFP_NOFS);
goto repeat;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
- return PTR_ERR(page);
+ return PTR_ERR(folio);
}
+ offset = offset_in_folio(folio, off);
+ tocopy = min(folio_size(folio) - offset, toread);
- lock_page(page);
+ folio_lock(folio);
- if (unlikely(page->mapping != mapping)) {
- f2fs_put_page(page, 1);
+ if (unlikely(folio->mapping != mapping)) {
+ f2fs_folio_put(folio, true);
goto repeat;
}
- if (unlikely(!PageUptodate(page))) {
- f2fs_put_page(page, 1);
- set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
- return -EIO;
- }
- memcpy_from_page(data, page, offset, tocopy);
- f2fs_put_page(page, 1);
+ /*
+ * should never happen, just leave f2fs_bug_on() here to catch
+ * any potential bug.
+ */
+ f2fs_bug_on(F2FS_SB(sb), !folio_test_uptodate(folio));
+
+ memcpy_from_folio(data, folio, offset, tocopy);
+ f2fs_folio_put(folio, true);
- offset = 0;
toread -= tocopy;
data += tocopy;
- blkidx++;
+ off += tocopy;
}
return len;
}
@@ -2908,7 +3200,7 @@ out:
return ret;
}
-int f2fs_quota_sync(struct super_block *sb, int type)
+int f2fs_do_quota_sync(struct super_block *sb, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
@@ -2956,11 +3248,21 @@ int f2fs_quota_sync(struct super_block *sb, int type)
return ret;
}
+static int f2fs_quota_sync(struct super_block *sb, int type)
+{
+ int ret;
+
+ F2FS_SB(sb)->umount_lock_holder = current;
+ ret = f2fs_do_quota_sync(sb, type);
+ F2FS_SB(sb)->umount_lock_holder = NULL;
+ return ret;
+}
+
static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
const struct path *path)
{
struct inode *inode;
- int err;
+ int err = 0;
/* if quota sysfile exists, deny enabling quota with specific file */
if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) {
@@ -2971,31 +3273,34 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id,
if (path->dentry->d_sb != sb)
return -EXDEV;
- err = f2fs_quota_sync(sb, type);
+ F2FS_SB(sb)->umount_lock_holder = current;
+
+ err = f2fs_do_quota_sync(sb, type);
if (err)
- return err;
+ goto out;
inode = d_inode(path->dentry);
err = filemap_fdatawrite(inode->i_mapping);
if (err)
- return err;
+ goto out;
err = filemap_fdatawait(inode->i_mapping);
if (err)
- return err;
+ goto out;
err = dquot_quota_on(sb, type, format_id, path);
if (err)
- return err;
+ goto out;
inode_lock(inode);
F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL;
f2fs_set_inode_flags(inode);
inode_unlock(inode);
f2fs_mark_inode_dirty_sync(inode, false);
-
- return 0;
+out:
+ F2FS_SB(sb)->umount_lock_holder = NULL;
+ return err;
}
static int __f2fs_quota_off(struct super_block *sb, int type)
@@ -3006,7 +3311,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type)
if (!inode || !igrab(inode))
return dquot_quota_off(sb, type);
- err = f2fs_quota_sync(sb, type);
+ err = f2fs_do_quota_sync(sb, type);
if (err)
goto out_put;
@@ -3029,6 +3334,8 @@ static int f2fs_quota_off(struct super_block *sb, int type)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
int err;
+ F2FS_SB(sb)->umount_lock_holder = current;
+
err = __f2fs_quota_off(sb, type);
/*
@@ -3038,6 +3345,9 @@ static int f2fs_quota_off(struct super_block *sb, int type)
*/
if (is_journalled_quota(sbi))
set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+
+ F2FS_SB(sb)->umount_lock_holder = NULL;
+
return err;
}
@@ -3170,7 +3480,7 @@ int f2fs_dquot_initialize(struct inode *inode)
return 0;
}
-int f2fs_quota_sync(struct super_block *sb, int type)
+int f2fs_do_quota_sync(struct super_block *sb, int type)
{
return 0;
}
@@ -3198,7 +3508,6 @@ static const struct super_operations f2fs_sops = {
.freeze_fs = f2fs_freeze,
.unfreeze_fs = f2fs_unfreeze,
.statfs = f2fs_statfs,
- .remount_fs = f2fs_remount,
.shutdown = f2fs_shutdown,
};
@@ -3380,12 +3689,13 @@ static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio,
bio = bio_alloc(sbi->sb->s_bdev, 1, opf, GFP_NOFS);
/* it doesn't need to set crypto context for superblock update */
- bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(folio_index(folio));
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(folio->index);
if (!bio_add_folio(bio, folio, folio_size(folio), 0))
f2fs_bug_on(sbi, 1);
ret = submit_bio_wait(bio);
+ bio_put(bio);
folio_end_writeback(folio);
return ret;
@@ -3506,7 +3816,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
return -EFSCORRUPTED;
}
crc = le32_to_cpu(raw_super->crc);
- if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) {
+ if (crc != f2fs_crc32(raw_super, crc_offset)) {
f2fs_info(sbi, "Invalid SB checksum value: %u", crc);
return -EFSCORRUPTED;
}
@@ -3665,6 +3975,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
block_t user_block_count, valid_user_blocks;
block_t avail_node_count, valid_node_count;
unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks;
+ unsigned int sit_blk_cnt;
int i, j;
total = le32_to_cpu(raw_super->segment_count);
@@ -3776,6 +4087,13 @@ skip_cross:
return 1;
}
+ sit_blk_cnt = DIV_ROUND_UP(main_segs, SIT_ENTRY_PER_BLOCK);
+ if (sit_bitmap_size * 8 < sit_blk_cnt) {
+ f2fs_err(sbi, "Wrong bitmap size: sit: %u, sit_blk_cnt:%u",
+ sit_bitmap_size, sit_blk_cnt);
+ return 1;
+ }
+
cp_pack_start_sum = __start_sum_addr(sbi);
cp_payload = __cp_payload(sbi);
if (cp_pack_start_sum < cp_payload + 1 ||
@@ -4054,7 +4372,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
/* we should update superblock crc here */
if (!recover && f2fs_sb_has_sb_chksum(sbi)) {
- crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi),
+ crc = f2fs_crc32(F2FS_RAW_SUPER(sbi),
offsetof(struct f2fs_super_block, crc));
F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc);
}
@@ -4220,6 +4538,8 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason)
if (shutdown)
set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
+ else
+ dump_stack();
/*
* Continue filesystem operators if errors=continue. Should not set
@@ -4250,14 +4570,35 @@ static void f2fs_record_error_work(struct work_struct *work)
f2fs_record_stop_reason(sbi);
}
-static inline unsigned int get_first_zoned_segno(struct f2fs_sb_info *sbi)
+static inline unsigned int get_first_seq_zone_segno(struct f2fs_sb_info *sbi)
{
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int zoneno, total_zones;
int devi;
- for (devi = 0; devi < sbi->s_ndevs; devi++)
- if (bdev_is_zoned(FDEV(devi).bdev))
- return GET_SEGNO(sbi, FDEV(devi).start_blk);
- return 0;
+ if (!f2fs_sb_has_blkzoned(sbi))
+ return NULL_SEGNO;
+
+ for (devi = 0; devi < sbi->s_ndevs; devi++) {
+ if (!bdev_is_zoned(FDEV(devi).bdev))
+ continue;
+
+ total_zones = GET_ZONE_FROM_SEG(sbi, FDEV(devi).total_segments);
+
+ for (zoneno = 0; zoneno < total_zones; zoneno++) {
+ unsigned int segs, blks;
+
+ if (!f2fs_zone_is_seq(sbi, devi, zoneno))
+ continue;
+
+ segs = GET_SEG_FROM_SEC(sbi,
+ zoneno * sbi->secs_per_zone);
+ blks = SEGS_TO_BLKS(sbi, segs);
+ return GET_SEGNO(sbi, FDEV(devi).start_blk + blks);
+ }
+ }
+#endif
+ return NULL_SEGNO;
}
static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
@@ -4294,6 +4635,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
#endif
for (i = 0; i < max_devices; i++) {
+ if (max_devices == 1) {
+ FDEV(i).total_segments =
+ le32_to_cpu(raw_super->segment_count_main);
+ FDEV(i).start_blk = 0;
+ FDEV(i).end_blk = FDEV(i).total_segments *
+ BLKS_PER_SEG(sbi);
+ }
+
if (i == 0)
FDEV(0).bdev_file = sbi->sb->s_bdev_file;
else if (!RDEV(i).path[0])
@@ -4418,14 +4767,14 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi)
sbi->readdir_ra = true;
}
-static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
+static int f2fs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct f2fs_fs_context *ctx = fc->fs_private;
struct f2fs_sb_info *sbi;
struct f2fs_super_block *raw_super;
struct inode *root;
int err;
bool skip_recovery = false, need_fsck = false;
- char *options = NULL;
int recovery, i, valid_super_block;
struct curseg_info *seg_i;
int retry_cnt = 1;
@@ -4484,18 +4833,18 @@ try_onemore:
/* precompute checksum seed for metadata */
if (f2fs_sb_has_inode_chksum(sbi))
- sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid,
- sizeof(raw_super->uuid));
+ sbi->s_chksum_seed = f2fs_chksum(~0, raw_super->uuid,
+ sizeof(raw_super->uuid));
default_options(sbi, false);
- /* parse mount options */
- options = kstrdup((const char *)data, GFP_KERNEL);
- if (data && !options) {
- err = -ENOMEM;
+
+ err = f2fs_check_opt_consistency(fc, sb);
+ if (err)
goto free_sb_buf;
- }
- err = parse_options(sb, options, false);
+ f2fs_apply_options(fc, sb);
+
+ err = f2fs_sanity_check_options(sbi, false);
if (err)
goto free_options;
@@ -4533,6 +4882,14 @@ try_onemore:
sb->s_time_gran = 1;
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
+ if (test_opt(sbi, INLINECRYPT))
+ sb->s_flags |= SB_INLINECRYPT;
+
+ if (test_opt(sbi, LAZYTIME))
+ sb->s_flags |= SB_LAZYTIME;
+ else
+ sb->s_flags &= ~SB_LAZYTIME;
+
super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid));
super_set_sysfs_name_bdev(sb);
sb->s_iflags |= SB_I_CGROUPWB;
@@ -4652,7 +5009,11 @@ try_onemore:
sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
/* get segno of first zoned block device */
- sbi->first_zoned_segno = get_first_zoned_segno(sbi);
+ sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi);
+
+ sbi->reserved_pin_section = f2fs_sb_has_blkzoned(sbi) ?
+ ZONED_PIN_SEC_REQUIRED_COUNT :
+ GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi));
/* Read accumulated write IO statistics if exists */
seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
@@ -4703,6 +5064,7 @@ try_onemore:
if (err)
goto free_compress_inode;
+ sbi->umount_lock_holder = current;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount */
if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) {
@@ -4718,8 +5080,10 @@ try_onemore:
if (err)
goto free_meta;
- if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)))
+ if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) {
+ skip_recovery = true;
goto reset_checkpoint;
+ }
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
@@ -4769,10 +5133,10 @@ try_onemore:
}
}
+reset_checkpoint:
#ifdef CONFIG_QUOTA
f2fs_recover_quota_end(sbi, quota_enabled);
#endif
-reset_checkpoint:
/*
* If the f2fs is not readonly and fsync data recovery succeeds,
* write pointer consistency of cursegs and other zones are already
@@ -4811,7 +5175,6 @@ reset_checkpoint:
if (err)
goto sync_free_meta;
}
- kvfree(options);
/* recover broken superblock */
if (recovery) {
@@ -4829,6 +5192,8 @@ reset_checkpoint:
f2fs_update_time(sbi, CP_TIME);
f2fs_update_time(sbi, REQ_TIME);
clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK);
+
+ sbi->umount_lock_holder = NULL;
return 0;
sync_free_meta:
@@ -4892,7 +5257,7 @@ free_iostat:
f2fs_destroy_iostat(sbi);
free_bio_info:
for (i = 0; i < NR_PAGE_TYPE; i++)
- kvfree(sbi->write_io[i]);
+ kfree(sbi->write_io[i]);
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
@@ -4903,8 +5268,8 @@ free_options:
for (i = 0; i < MAXQUOTAS; i++)
kfree(F2FS_OPTION(sbi).s_qf_names[i]);
#endif
- fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy);
- kvfree(options);
+ /* no need to free dummy_enc_policy, we just keep it in ctx when failed */
+ swap(F2FS_CTX_INFO(ctx).dummy_enc_policy, F2FS_OPTION(sbi).dummy_enc_policy);
free_sb_buf:
kfree(raw_super);
free_sbi:
@@ -4920,17 +5285,46 @@ free_sbi:
return err;
}
-static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int f2fs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
+ return get_tree_bdev(fc, f2fs_fill_super);
}
+static int f2fs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+
+ return __f2fs_remount(fc, sb);
+}
+
+static void f2fs_fc_free(struct fs_context *fc)
+{
+ struct f2fs_fs_context *ctx = fc->fs_private;
+
+ if (!ctx)
+ return;
+
+#ifdef CONFIG_QUOTA
+ f2fs_unnote_qf_name_all(fc);
+#endif
+ fscrypt_free_dummy_policy(&F2FS_CTX_INFO(ctx).dummy_enc_policy);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations f2fs_context_ops = {
+ .parse_param = f2fs_parse_param,
+ .get_tree = f2fs_get_tree,
+ .reconfigure = f2fs_reconfigure,
+ .free = f2fs_fc_free,
+};
+
static void kill_f2fs_super(struct super_block *sb)
{
struct f2fs_sb_info *sbi = F2FS_SB(sb);
if (sb->s_root) {
+ sbi->umount_lock_holder = current;
+
set_sbi_flag(sbi, SBI_IS_CLOSE);
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
@@ -4965,10 +5359,24 @@ static void kill_f2fs_super(struct super_block *sb)
}
}
+static int f2fs_init_fs_context(struct fs_context *fc)
+{
+ struct f2fs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct f2fs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ fc->fs_private = ctx;
+ fc->ops = &f2fs_context_ops;
+
+ return 0;
+}
+
static struct file_system_type f2fs_fs_type = {
.owner = THIS_MODULE,
.name = "f2fs",
- .mount = f2fs_mount,
+ .init_fs_context = f2fs_init_fs_context,
.kill_sb = kill_f2fs_super,
.fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index d15c68b28952..f736052dea50 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -61,6 +61,12 @@ struct f2fs_attr {
int id;
};
+struct f2fs_base_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct f2fs_base_attr *a, char *buf);
+ ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len);
+};
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf);
@@ -268,6 +274,13 @@ static ssize_t encoding_show(struct f2fs_attr *a,
return sysfs_emit(buf, "(none)\n");
}
+static ssize_t encoding_flags_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sysfs_emit(buf, "%x\n",
+ le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags));
+}
+
static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -488,12 +501,12 @@ out:
return ret;
#ifdef CONFIG_F2FS_FAULT_INJECTION
if (a->struct_type == FAULT_INFO_TYPE) {
- if (f2fs_build_fault_attr(sbi, 0, t))
+ if (f2fs_build_fault_attr(sbi, 0, t, FAULT_TYPE))
return -EINVAL;
return count;
}
if (a->struct_type == FAULT_INFO_RATE) {
- if (f2fs_build_fault_attr(sbi, t, 0))
+ if (f2fs_build_fault_attr(sbi, t, 0, FAULT_RATE))
return -EINVAL;
return count;
}
@@ -615,6 +628,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "gc_no_zoned_gc_percent")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_boost_zoned_gc_percent")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_valid_thresh_ratio")) {
+ if (t > 100)
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
#ifdef CONFIG_F2FS_IOSTAT
if (!strcmp(a->attr.name, "iostat_enable")) {
sbi->iostat_enable = !!t;
@@ -811,6 +845,27 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "reserved_pin_section")) {
+ if (t > GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))
+ return -EINVAL;
+ *ui = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_boost_gc_multiple")) {
+ if (t < 1 || t > SEGS_PER_SEC(sbi))
+ return -EINVAL;
+ sbi->gc_thread->boost_gc_multiple = (unsigned int)t;
+ return count;
+ }
+
+ if (!strcmp(a->attr.name, "gc_boost_gc_greedy")) {
+ if (t > GC_GREEDY)
+ return -EINVAL;
+ sbi->gc_thread->boost_gc_greedy = (unsigned int)t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -862,6 +917,25 @@ static void f2fs_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
+static ssize_t f2fs_base_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct f2fs_base_attr *a = container_of(attr,
+ struct f2fs_base_attr, attr);
+
+ return a->show ? a->show(a, buf) : 0;
+}
+
+static ssize_t f2fs_base_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct f2fs_base_attr *a = container_of(attr,
+ struct f2fs_base_attr, attr);
+
+ return a->store ? a->store(a, buf, len) : 0;
+}
+
/*
* Note that there are three feature list entries:
* 1) /sys/fs/f2fs/features
@@ -880,18 +954,50 @@ static void f2fs_sb_release(struct kobject *kobj)
* please add new on-disk feature in this list only.
* - ref. F2FS_SB_FEATURE_RO_ATTR()
*/
-static ssize_t f2fs_feature_show(struct f2fs_attr *a,
- struct f2fs_sb_info *sbi, char *buf)
+static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf)
{
return sysfs_emit(buf, "supported\n");
}
#define F2FS_FEATURE_RO_ATTR(_name) \
-static struct f2fs_attr f2fs_attr_##_name = { \
+static struct f2fs_base_attr f2fs_base_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = 0444 }, \
.show = f2fs_feature_show, \
}
+static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf)
+{
+ unsigned int res = 0;
+
+ if (!strcmp(a->attr.name, "reclaim_caches_kb"))
+ res = f2fs_donate_files();
+
+ return sysfs_emit(buf, "%u\n", res);
+}
+
+static ssize_t f2fs_tune_store(struct f2fs_base_attr *a,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+
+ if (!strcmp(a->attr.name, "reclaim_caches_kb"))
+ f2fs_reclaim_caches(t);
+
+ return count;
+}
+
+#define F2FS_TUNE_RW_ATTR(_name) \
+static struct f2fs_base_attr f2fs_base_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = 0644 }, \
+ .show = f2fs_tune_show, \
+ .store = f2fs_tune_store, \
+}
+
static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -986,6 +1092,8 @@ GC_THREAD_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
GC_THREAD_RW_ATTR(gc_no_zoned_gc_percent, no_zoned_gc_percent);
GC_THREAD_RW_ATTR(gc_boost_zoned_gc_percent, boost_zoned_gc_percent);
GC_THREAD_RW_ATTR(gc_valid_thresh_ratio, valid_thresh_ratio);
+GC_THREAD_RW_ATTR(gc_boost_gc_multiple, boost_gc_multiple);
+GC_THREAD_RW_ATTR(gc_boost_gc_greedy, boost_gc_greedy);
/* SM_INFO ATTR */
SM_INFO_RW_ATTR(reclaim_segments, rec_prefree_segments);
@@ -1065,6 +1173,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count);
F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec);
F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy);
#endif
+F2FS_SBI_GENERAL_RW_ATTR(carve_out);
+F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section);
/* STAT_INFO ATTR */
#ifdef CONFIG_F2FS_STAT_FS
@@ -1100,6 +1210,7 @@ F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
+F2FS_GENERAL_RO_ATTR(encoding_flags);
F2FS_GENERAL_RO_ATTR(mounted_time_sec);
F2FS_GENERAL_RO_ATTR(main_blkaddr);
F2FS_GENERAL_RO_ATTR(pending_discard);
@@ -1141,6 +1252,9 @@ F2FS_FEATURE_RO_ATTR(readonly);
F2FS_FEATURE_RO_ATTR(compression);
#endif
F2FS_FEATURE_RO_ATTR(pin_file);
+#ifdef CONFIG_UNICODE
+F2FS_FEATURE_RO_ATTR(linear_lookup);
+#endif
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -1151,6 +1265,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_no_zoned_gc_percent),
ATTR_LIST(gc_boost_zoned_gc_percent),
ATTR_LIST(gc_valid_thresh_ratio),
+ ATTR_LIST(gc_boost_gc_multiple),
+ ATTR_LIST(gc_boost_gc_greedy),
ATTR_LIST(gc_idle),
ATTR_LIST(gc_urgent),
ATTR_LIST(reclaim_segments),
@@ -1212,6 +1328,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reserved_blocks),
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
+ ATTR_LIST(encoding_flags),
ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
@@ -1252,41 +1369,47 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(warm_data_age_threshold),
ATTR_LIST(last_age_weight),
ATTR_LIST(max_read_extent_count),
+ ATTR_LIST(carve_out),
+ ATTR_LIST(reserved_pin_section),
NULL,
};
ATTRIBUTE_GROUPS(f2fs);
+#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr)
static struct attribute *f2fs_feat_attrs[] = {
#ifdef CONFIG_FS_ENCRYPTION
- ATTR_LIST(encryption),
- ATTR_LIST(test_dummy_encryption_v2),
+ BASE_ATTR_LIST(encryption),
+ BASE_ATTR_LIST(test_dummy_encryption_v2),
#if IS_ENABLED(CONFIG_UNICODE)
- ATTR_LIST(encrypted_casefold),
+ BASE_ATTR_LIST(encrypted_casefold),
#endif
#endif /* CONFIG_FS_ENCRYPTION */
#ifdef CONFIG_BLK_DEV_ZONED
- ATTR_LIST(block_zoned),
+ BASE_ATTR_LIST(block_zoned),
#endif
- ATTR_LIST(atomic_write),
- ATTR_LIST(extra_attr),
- ATTR_LIST(project_quota),
- ATTR_LIST(inode_checksum),
- ATTR_LIST(flexible_inline_xattr),
- ATTR_LIST(quota_ino),
- ATTR_LIST(inode_crtime),
- ATTR_LIST(lost_found),
+ BASE_ATTR_LIST(atomic_write),
+ BASE_ATTR_LIST(extra_attr),
+ BASE_ATTR_LIST(project_quota),
+ BASE_ATTR_LIST(inode_checksum),
+ BASE_ATTR_LIST(flexible_inline_xattr),
+ BASE_ATTR_LIST(quota_ino),
+ BASE_ATTR_LIST(inode_crtime),
+ BASE_ATTR_LIST(lost_found),
#ifdef CONFIG_FS_VERITY
- ATTR_LIST(verity),
+ BASE_ATTR_LIST(verity),
#endif
- ATTR_LIST(sb_checksum),
+ BASE_ATTR_LIST(sb_checksum),
#if IS_ENABLED(CONFIG_UNICODE)
- ATTR_LIST(casefold),
+ BASE_ATTR_LIST(casefold),
#endif
- ATTR_LIST(readonly),
+ BASE_ATTR_LIST(readonly),
#ifdef CONFIG_F2FS_FS_COMPRESSION
- ATTR_LIST(compression),
+ BASE_ATTR_LIST(compression),
+#endif
+ BASE_ATTR_LIST(pin_file),
+#ifdef CONFIG_UNICODE
+ BASE_ATTR_LIST(linear_lookup),
#endif
- ATTR_LIST(pin_file),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
@@ -1343,6 +1466,14 @@ static struct attribute *f2fs_sb_feat_attrs[] = {
};
ATTRIBUTE_GROUPS(f2fs_sb_feat);
+F2FS_TUNE_RW_ATTR(reclaim_caches_kb);
+
+static struct attribute *f2fs_tune_attrs[] = {
+ BASE_ATTR_LIST(reclaim_caches_kb),
+ NULL,
+};
+ATTRIBUTE_GROUPS(f2fs_tune);
+
static const struct sysfs_ops f2fs_attr_ops = {
.show = f2fs_attr_show,
.store = f2fs_attr_store,
@@ -1362,15 +1493,34 @@ static struct kset f2fs_kset = {
.kobj = {.ktype = &f2fs_ktype},
};
+static const struct sysfs_ops f2fs_feat_attr_ops = {
+ .show = f2fs_base_attr_show,
+ .store = f2fs_base_attr_store,
+};
+
static const struct kobj_type f2fs_feat_ktype = {
.default_groups = f2fs_feat_groups,
- .sysfs_ops = &f2fs_attr_ops,
+ .sysfs_ops = &f2fs_feat_attr_ops,
};
static struct kobject f2fs_feat = {
.kset = &f2fs_kset,
};
+static const struct sysfs_ops f2fs_tune_attr_ops = {
+ .show = f2fs_base_attr_show,
+ .store = f2fs_base_attr_store,
+};
+
+static const struct kobj_type f2fs_tune_ktype = {
+ .default_groups = f2fs_tune_groups,
+ .sysfs_ops = &f2fs_tune_attr_ops,
+};
+
+static struct kobject f2fs_tune = {
+ .kset = &f2fs_kset,
+};
+
static ssize_t f2fs_stat_attr_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
@@ -1592,6 +1742,24 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
return 0;
}
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+static int __maybe_unused inject_stats_seq_show(struct seq_file *seq,
+ void *offset)
+{
+ struct super_block *sb = seq->private;
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info;
+ int i;
+
+ seq_puts(seq, "fault_type injected_count\n");
+
+ for (i = 0; i < FAULT_MAX; i++)
+ seq_printf(seq, "%-24s%-10u\n", f2fs_fault_name[i],
+ ffi->inject_count[i]);
+ return 0;
+}
+#endif
+
int __init f2fs_init_sysfs(void)
{
int ret;
@@ -1607,6 +1775,11 @@ int __init f2fs_init_sysfs(void)
if (ret)
goto put_kobject;
+ ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype,
+ NULL, "tuning");
+ if (ret)
+ goto put_kobject;
+
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
if (!f2fs_proc_root) {
ret = -ENOMEM;
@@ -1614,7 +1787,9 @@ int __init f2fs_init_sysfs(void)
}
return 0;
+
put_kobject:
+ kobject_put(&f2fs_tune);
kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
return ret;
@@ -1622,6 +1797,7 @@ put_kobject:
void f2fs_exit_sysfs(void)
{
+ kobject_put(&f2fs_tune);
kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
remove_proc_entry("fs/f2fs", NULL);
@@ -1675,6 +1851,10 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
discard_plist_seq_show, sb);
proc_create_single_data("disk_map", 0444, sbi->s_proc,
disk_map_seq_show, sb);
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ proc_create_single_data("inject_stats", 0444, sbi->s_proc,
+ inject_stats_seq_show, sb);
+#endif
return 0;
put_feature_list_kobj:
kobject_put(&sbi->s_feature_list_kobj);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 3f3874943679..58632a2b6613 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -136,7 +136,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
#ifdef CONFIG_F2FS_FS_SECURITY
static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *page)
+ void *folio)
{
const struct xattr *xattr;
int err = 0;
@@ -144,7 +144,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
xattr->name, xattr->value,
- xattr->value_len, (struct page *)page, 0);
+ xattr->value_len, folio, 0);
if (err < 0)
break;
}
@@ -152,10 +152,10 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
}
int f2fs_init_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr, struct page *ipage)
+ const struct qstr *qstr, struct folio *ifolio)
{
return security_inode_init_security(inode, dir, qstr,
- &f2fs_initxattrs, ipage);
+ f2fs_initxattrs, ifolio);
}
#endif
@@ -271,25 +271,25 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
return entry;
}
-static int read_inline_xattr(struct inode *inode, struct page *ipage,
+static int read_inline_xattr(struct inode *inode, struct folio *ifolio,
void *txattr_addr)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
unsigned int inline_size = inline_xattr_size(inode);
- struct page *page = NULL;
+ struct folio *folio = NULL;
void *inline_addr;
- if (ipage) {
- inline_addr = inline_xattr_addr(inode, ipage);
+ if (ifolio) {
+ inline_addr = inline_xattr_addr(inode, ifolio);
} else {
- page = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ folio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- inline_addr = inline_xattr_addr(inode, page);
+ inline_addr = inline_xattr_addr(inode, folio);
}
memcpy(txattr_addr, inline_addr, inline_size);
- f2fs_put_page(page, 1);
+ f2fs_folio_put(folio, true);
return 0;
}
@@ -299,22 +299,22 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
unsigned int inline_size = inline_xattr_size(inode);
- struct page *xpage;
+ struct folio *xfolio;
void *xattr_addr;
/* The inode already has an extended attribute block. */
- xpage = f2fs_get_node_page(sbi, xnid);
- if (IS_ERR(xpage))
- return PTR_ERR(xpage);
+ xfolio = f2fs_get_xnode_folio(sbi, xnid);
+ if (IS_ERR(xfolio))
+ return PTR_ERR(xfolio);
- xattr_addr = page_address(xpage);
+ xattr_addr = folio_address(xfolio);
memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE);
- f2fs_put_page(xpage, 1);
+ f2fs_folio_put(xfolio, true);
return 0;
}
-static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
+static int lookup_all_xattrs(struct inode *inode, struct folio *ifolio,
unsigned int index, unsigned int len,
const char *name, struct f2fs_xattr_entry **xe,
void **base_addr, int *base_size,
@@ -338,7 +338,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
/* read from inline xattr */
if (inline_size) {
- err = read_inline_xattr(inode, ipage, txattr_addr);
+ err = read_inline_xattr(inode, ifolio, txattr_addr);
if (err)
goto out;
@@ -385,7 +385,7 @@ out:
return err;
}
-static int read_all_xattrs(struct inode *inode, struct page *ipage,
+static int read_all_xattrs(struct inode *inode, struct folio *ifolio,
void **base_addr)
{
struct f2fs_xattr_header *header;
@@ -402,7 +402,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
/* read from inline xattr */
if (inline_size) {
- err = read_inline_xattr(inode, ipage, txattr_addr);
+ err = read_inline_xattr(inode, ifolio, txattr_addr);
if (err)
goto fail;
}
@@ -429,14 +429,14 @@ fail:
}
static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
- void *txattr_addr, struct page *ipage)
+ void *txattr_addr, struct folio *ifolio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
size_t inline_size = inline_xattr_size(inode);
- struct page *in_page = NULL;
+ struct folio *in_folio = NULL;
void *xattr_addr;
void *inline_addr = NULL;
- struct page *xpage;
+ struct folio *xfolio;
nid_t new_nid = 0;
int err = 0;
@@ -446,73 +446,73 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
/* write to inline xattr */
if (inline_size) {
- if (ipage) {
- inline_addr = inline_xattr_addr(inode, ipage);
+ if (ifolio) {
+ inline_addr = inline_xattr_addr(inode, ifolio);
} else {
- in_page = f2fs_get_node_page(sbi, inode->i_ino);
- if (IS_ERR(in_page)) {
+ in_folio = f2fs_get_inode_folio(sbi, inode->i_ino);
+ if (IS_ERR(in_folio)) {
f2fs_alloc_nid_failed(sbi, new_nid);
- return PTR_ERR(in_page);
+ return PTR_ERR(in_folio);
}
- inline_addr = inline_xattr_addr(inode, in_page);
+ inline_addr = inline_xattr_addr(inode, in_folio);
}
- f2fs_wait_on_page_writeback(ipage ? ipage : in_page,
+ f2fs_folio_wait_writeback(ifolio ? ifolio : in_folio,
NODE, true, true);
/* no need to use xattr node block */
if (hsize <= inline_size) {
err = f2fs_truncate_xattr_node(inode);
f2fs_alloc_nid_failed(sbi, new_nid);
if (err) {
- f2fs_put_page(in_page, 1);
+ f2fs_folio_put(in_folio, true);
return err;
}
memcpy(inline_addr, txattr_addr, inline_size);
- set_page_dirty(ipage ? ipage : in_page);
+ folio_mark_dirty(ifolio ? ifolio : in_folio);
goto in_page_out;
}
}
/* write to xattr node block */
if (F2FS_I(inode)->i_xattr_nid) {
- xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
- if (IS_ERR(xpage)) {
- err = PTR_ERR(xpage);
+ xfolio = f2fs_get_xnode_folio(sbi, F2FS_I(inode)->i_xattr_nid);
+ if (IS_ERR(xfolio)) {
+ err = PTR_ERR(xfolio);
f2fs_alloc_nid_failed(sbi, new_nid);
goto in_page_out;
}
f2fs_bug_on(sbi, new_nid);
- f2fs_wait_on_page_writeback(xpage, NODE, true, true);
+ f2fs_folio_wait_writeback(xfolio, NODE, true, true);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
- xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET);
- if (IS_ERR(xpage)) {
- err = PTR_ERR(xpage);
+ xfolio = f2fs_new_node_folio(&dn, XATTR_NODE_OFFSET);
+ if (IS_ERR(xfolio)) {
+ err = PTR_ERR(xfolio);
f2fs_alloc_nid_failed(sbi, new_nid);
goto in_page_out;
}
f2fs_alloc_nid_done(sbi, new_nid);
}
- xattr_addr = page_address(xpage);
+ xattr_addr = folio_address(xfolio);
if (inline_size)
memcpy(inline_addr, txattr_addr, inline_size);
memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
if (inline_size)
- set_page_dirty(ipage ? ipage : in_page);
- set_page_dirty(xpage);
+ folio_mark_dirty(ifolio ? ifolio : in_folio);
+ folio_mark_dirty(xfolio);
- f2fs_put_page(xpage, 1);
+ f2fs_folio_put(xfolio, true);
in_page_out:
- f2fs_put_page(in_page, 1);
+ f2fs_folio_put(in_folio, true);
return err;
}
int f2fs_getxattr(struct inode *inode, int index, const char *name,
- void *buffer, size_t buffer_size, struct page *ipage)
+ void *buffer, size_t buffer_size, struct folio *ifolio)
{
struct f2fs_xattr_entry *entry = NULL;
int error;
@@ -528,11 +528,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
if (len > F2FS_NAME_LEN)
return -ERANGE;
- if (!ipage)
+ if (!ifolio)
f2fs_down_read(&F2FS_I(inode)->i_xattr_sem);
- error = lookup_all_xattrs(inode, ipage, index, len, name,
+ error = lookup_all_xattrs(inode, ifolio, index, len, name,
&entry, &base_addr, &base_size, &is_inline);
- if (!ipage)
+ if (!ifolio)
f2fs_up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -627,7 +627,7 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry,
static int __f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
- struct page *ipage, int flags)
+ struct folio *ifolio, int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct f2fs_xattr_entry *here, *last;
@@ -651,7 +651,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (size > MAX_VALUE_LEN(inode))
return -E2BIG;
retry:
- error = read_all_xattrs(inode, ipage, &base_addr);
+ error = read_all_xattrs(inode, ifolio, &base_addr);
if (error)
return error;
@@ -766,7 +766,7 @@ retry:
*(u32 *)((u8 *)last + newsize) = 0;
}
- error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
+ error = write_all_xattrs(inode, new_hsize, base_addr, ifolio);
if (error)
goto exit;
@@ -800,7 +800,7 @@ exit:
int f2fs_setxattr(struct inode *inode, int index, const char *name,
const void *value, size_t size,
- struct page *ipage, int flags)
+ struct folio *ifolio, int flags)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int err;
@@ -815,14 +815,14 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
return err;
/* this case is only from f2fs_init_inode_metadata */
- if (ipage)
+ if (ifolio)
return __f2fs_setxattr(inode, index, name, value,
- size, ipage, flags);
+ size, ifolio, flags);
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
f2fs_down_write(&F2FS_I(inode)->i_xattr_sem);
- err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
+ err = __f2fs_setxattr(inode, index, name, value, size, NULL, flags);
f2fs_up_write(&F2FS_I(inode)->i_xattr_sem);
f2fs_unlock_op(sbi);
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index a005ffdcf717..4fc0b2305fbd 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -127,26 +127,26 @@ extern const struct xattr_handler f2fs_xattr_security_handler;
extern const struct xattr_handler * const f2fs_xattr_handlers[];
-extern int f2fs_setxattr(struct inode *, int, const char *,
- const void *, size_t, struct page *, int);
-extern int f2fs_getxattr(struct inode *, int, const char *, void *,
- size_t, struct page *);
-extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
-extern int f2fs_init_xattr_caches(struct f2fs_sb_info *);
-extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
+int f2fs_setxattr(struct inode *, int, const char *, const void *,
+ size_t, struct folio *, int);
+int f2fs_getxattr(struct inode *, int, const char *, void *,
+ size_t, struct folio *);
+ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
+int f2fs_init_xattr_caches(struct f2fs_sb_info *);
+void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
#else
#define f2fs_xattr_handlers NULL
#define f2fs_listxattr NULL
static inline int f2fs_setxattr(struct inode *inode, int index,
const char *name, const void *value, size_t size,
- struct page *page, int flags)
+ struct folio *folio, int flags)
{
return -EOPNOTSUPP;
}
static inline int f2fs_getxattr(struct inode *inode, int index,
const char *name, void *buffer,
- size_t buffer_size, struct page *dpage)
+ size_t buffer_size, struct folio *dfolio)
{
return -EOPNOTSUPP;
}
@@ -155,11 +155,11 @@ static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
#endif
#ifdef CONFIG_F2FS_FS_SECURITY
-extern int f2fs_init_security(struct inode *, struct inode *,
- const struct qstr *, struct page *);
+int f2fs_init_security(struct inode *, struct inode *,
+ const struct qstr *, struct folio *);
#else
static inline int f2fs_init_security(struct inode *inode, struct inode *dir,
- const struct qstr *qstr, struct page *ipage)
+ const struct qstr *qstr, struct folio *ifolio)
{
return 0;
}
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 1db348f8f887..a7061c2ad8e4 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -356,7 +356,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
if (!fat_valid_entry(sbi, entry)) {
fatent_brelse(fatent);
- fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry);
+ fat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x)", entry);
return -EIO;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e887e9ab7472..4fc49a614fb8 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -204,7 +204,7 @@ const struct file_operations fat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
.compat_ioctl = compat_ptr_ioctl,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3852bb66358c..9648ed097816 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -219,13 +219,14 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int fat_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int fat_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int err;
- err = cont_write_begin(file, mapping, pos, len,
+ err = cont_write_begin(iocb, mapping, pos, len,
foliop, fsdata, fat_get_block,
&MSDOS_I(mapping->host)->mmu_private);
if (err < 0)
@@ -233,13 +234,14 @@ static int fat_write_begin(struct file *file, struct address_space *mapping,
return err;
}
-static int fat_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int fat_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
fat_write_failed(mapping, pos + len);
if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index c7a2d27120ba..950da09f0961 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -158,9 +158,9 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
mark_inode_dirty(inode);
}
if (new_fclus != (inode->i_blocks >> (sbi->cluster_bits - 9))) {
- fat_fs_error(sb, "clusters badly computed (%d != %llu)",
- new_fclus,
- (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
+ fat_fs_error_ratelimit(
+ sb, "clusters badly computed (%d != %llu)", new_fclus,
+ (llu)(inode->i_blocks >> (sbi->cluster_bits - 9)));
fat_cache_inval_inode(inode);
}
inode->i_blocks += nr_cluster << (sbi->cluster_bits - 9);
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index f06f6ba643cc..0b920ee40a7f 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -339,8 +339,8 @@ out:
}
/***** Make a directory */
-static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
@@ -389,13 +389,13 @@ static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mutex_unlock(&MSDOS_SB(sb)->s_lock);
fat_flush_inodes(sb, dir, inode);
- return 0;
+ return NULL;
out_free:
fat_free_clusters(dir, cluster);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
/***** Unlink a file */
@@ -646,7 +646,7 @@ static const struct inode_operations msdos_dir_inode_operations = {
static void setup(struct super_block *sb)
{
MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations;
- sb->s_d_op = &msdos_dentry_operations;
+ set_default_d_op(sb, &msdos_dentry_operations);
sb->s_flags |= SB_NOATIME;
}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 926c26e90ef8..5dbc4cbb8fce 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -841,8 +841,8 @@ out:
return err;
}
-static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct inode *inode;
@@ -877,13 +877,13 @@ static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return 0;
+ return NULL;
out_free:
fat_free_clusters(dir, cluster);
out:
mutex_unlock(&MSDOS_SB(sb)->s_lock);
- return err;
+ return ERR_PTR(err);
}
static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh,
@@ -1187,9 +1187,9 @@ static void setup(struct super_block *sb)
{
MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations;
if (MSDOS_SB(sb)->options.name_check != 's')
- sb->s_d_op = &vfat_ci_dentry_ops;
+ set_default_d_op(sb, &vfat_ci_dentry_ops);
else
- sb->s_d_op = &vfat_dentry_ops;
+ set_default_d_op(sb, &vfat_dentry_ops);
}
static int vfat_fill_super(struct super_block *sb, struct fs_context *fc)
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 3e092ae6d142..68a7d2861c58 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -88,7 +88,7 @@ static long do_sys_name_to_handle(const struct path *path,
if (fh_flags & EXPORT_FH_CONNECTABLE) {
handle->handle_type |= FILEID_IS_CONNECTABLE;
if (d_is_dir(path->dentry))
- fh_flags |= FILEID_IS_DIR;
+ handle->handle_type |= FILEID_IS_DIR;
}
retval = 0;
}
@@ -168,23 +168,28 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
return err;
}
-static int get_path_from_fd(int fd, struct path *root)
+static int get_path_anchor(int fd, struct path *root)
{
- if (fd == AT_FDCWD) {
- struct fs_struct *fs = current->fs;
- spin_lock(&fs->lock);
- *root = fs->pwd;
- path_get(root);
- spin_unlock(&fs->lock);
- } else {
+ if (fd >= 0) {
CLASS(fd, f)(fd);
if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
+ return 0;
}
- return 0;
+ if (fd == AT_FDCWD) {
+ get_fs_pwd(current->fs, root);
+ return 0;
+ }
+
+ if (fd == FD_PIDFS_ROOT) {
+ pidfs_get_root(root);
+ return 0;
+ }
+
+ return -EBADF;
}
static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
@@ -323,13 +328,24 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
{
int retval = 0;
struct file_handle f_handle;
- struct file_handle *handle = NULL;
+ struct file_handle *handle __free(kfree) = NULL;
struct handle_to_path_ctx ctx = {};
const struct export_operations *eops;
- retval = get_path_from_fd(mountdirfd, &ctx.root);
+ if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+ return -EFAULT;
+
+ if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+ (f_handle.handle_bytes == 0))
+ return -EINVAL;
+
+ if (f_handle.handle_type < 0 ||
+ FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS)
+ return -EINVAL;
+
+ retval = get_path_anchor(mountdirfd, &ctx.root);
if (retval)
- goto out_err;
+ return retval;
eops = ctx.root.mnt->mnt_sb->s_export_op;
if (eops && eops->permission)
@@ -339,21 +355,6 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
if (retval)
goto out_path;
- if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
- retval = -EFAULT;
- goto out_path;
- }
- if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
- (f_handle.handle_bytes == 0)) {
- retval = -EINVAL;
- goto out_path;
- }
- if (f_handle.handle_type < 0 ||
- FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) {
- retval = -EINVAL;
- goto out_path;
- }
-
handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes),
GFP_KERNEL);
if (!handle) {
@@ -366,7 +367,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
&ufh->f_handle,
f_handle.handle_bytes)) {
retval = -EFAULT;
- goto out_handle;
+ goto out_path;
}
/*
@@ -384,11 +385,8 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
handle->handle_type &= ~FILEID_USER_FLAGS_MASK;
retval = do_handle_to_path(handle, path, &ctx);
-out_handle:
- kfree(handle);
out_path:
path_put(&ctx.root);
-out_err:
return retval;
}
@@ -404,7 +402,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
if (retval)
return retval;
- CLASS(get_unused_fd, fd)(O_CLOEXEC);
+ CLASS(get_unused_fd, fd)(open_flag);
if (fd < 0)
return fd;
diff --git a/fs/file.c b/fs/file.c
index d868cdb95d1e..6d2275c3be9c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,6 +26,28 @@
#include "internal.h"
+static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
+{
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+
/**
* __file_ref_put - Slowpath of file_ref_put()
* @ref: Pointer to the reference count
@@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
return true;
}
- /*
- * If the reference count was already in the dead zone, then this
- * put() operation is imbalanced. Warn, put the reference count back to
- * DEAD and tell the caller to not deconstruct the object.
- */
- if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
- atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
- return false;
- }
-
- /*
- * This is a put() operation on a saturated refcount. Restore the
- * mean saturation value and tell the caller to not deconstruct the
- * object.
- */
- if (cnt > FILE_REF_MAXREF)
- atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
- return false;
+ return __file_ref_put_badval(ref, cnt);
}
EXPORT_SYMBOL_GPL(__file_ref_put);
@@ -192,6 +197,21 @@ static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
return ERR_PTR(-EMFILE);
}
+ /*
+ * Check if the allocation size would exceed INT_MAX. kvmalloc_array()
+ * and kvmalloc() will warn if the allocation size is greater than
+ * INT_MAX, as filp_cachep objects are not __GFP_NOWARN.
+ *
+ * This can happen when sysctl_nr_open is set to a very high value and
+ * a process tries to use a file descriptor near that limit. For example,
+ * if sysctl_nr_open is set to 1073741816 (0x3ffffff8) - which is what
+ * systemd typically sets it to - then trying to use a file descriptor
+ * close to that value will require allocating a file descriptor table
+ * that exceeds 8GB in size.
+ */
+ if (unlikely(nr > INT_MAX / sizeof(struct file *)))
+ return ERR_PTR(-EMFILE);
+
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
goto out;
@@ -418,17 +438,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
+ /*
+ * We may be racing against fd allocation from other threads using this
+ * files_struct, despite holding ->file_lock.
+ *
+ * alloc_fd() might have already claimed a slot, while fd_install()
+ * did not populate it yet. Note the latter operates locklessly, so
+ * the file can show up as we are walking the array below.
+ *
+ * At the same time we know no files will disappear as all other
+ * operations take the lock.
+ *
+ * Instead of trying to placate userspace racing with itself, we
+ * ref the file if we see it and mark the fd slot as unused otherwise.
+ */
for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
+ struct file *f = rcu_dereference_raw(*old_fds++);
if (f) {
get_file(f);
} else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
@@ -577,6 +605,7 @@ repeat:
__set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
out:
spin_unlock(&files->file_lock);
@@ -612,22 +641,14 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
-/*
- * Install a file pointer in the fd array.
- *
- * The VFS is full of places where we drop the files lock between
- * setting the open_fds bitmap and installing the file in the file
- * array. At any such point, we are vulnerable to a dup2() race
- * installing a file in the array before us. We need to detect this and
- * fput() the struct file we are about to overwrite in this case.
- *
- * It should never happen - if we allow dup2() do it, _really_ bad things
- * will follow.
+/**
+ * fd_install - install a file pointer in the fd array
+ * @fd: file descriptor to install the file in
+ * @file: the file to install
*
* This consumes the "file" refcount, so callers should treat it
* as if they had called fput(file).
*/
-
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
@@ -642,7 +663,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- WARN_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -650,7 +671,7 @@ void fd_install(unsigned int fd, struct file *file)
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
- BUG_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
@@ -679,7 +700,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
return NULL;
fd = array_index_nospec(fd, fdt->max_fds);
- file = fdt->fd[fd];
+ file = rcu_dereference_raw(fdt->fd[fd]);
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -1178,8 +1199,27 @@ struct fd fdget_raw(unsigned int fd)
*/
static inline bool file_needs_f_pos_lock(struct file *file)
{
- return (file->f_mode & FMODE_ATOMIC_POS) &&
- (file_count(file) > 1 || file->f_op->iterate_shared);
+ if (!(file->f_mode & FMODE_ATOMIC_POS))
+ return false;
+ if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF)
+ return true;
+ if (file->f_op->iterate_shared)
+ return true;
+ return false;
+}
+
+bool file_seek_cur_needs_f_lock(struct file *file)
+{
+ if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
+ return false;
+
+ /*
+ * Note that we are not guaranteed to be called after fdget_pos() on
+ * this file obj, in which case the caller is expected to provide the
+ * appropriate locking.
+ */
+
+ return true;
}
struct fd fdget_pos(unsigned int fd)
@@ -1187,7 +1227,7 @@ struct fd fdget_pos(unsigned int fd)
struct fd f = fdget(fd);
struct file *file = fd_file(f);
- if (file && file_needs_f_pos_lock(file)) {
+ if (likely(file) && file_needs_f_pos_lock(file)) {
f.word |= FDPUT_POS_UNLOCK;
mutex_lock(&file->f_pos_lock);
}
@@ -1230,14 +1270,34 @@ __releases(&files->file_lock)
struct fdtable *fdt;
/*
- * We need to detect attempts to do dup2() over allocated but still
- * not finished descriptor.
+ * dup2() is expected to close the file installed in the target fd slot
+ * (if any). However, userspace hand-picking a fd may be racing against
+ * its own threads which happened to allocate it in open() et al but did
+ * not populate it yet.
+ *
+ * Broadly speaking we may be racing against the following:
+ * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL
+ * file = hard_work_goes_here();
+ * fd_install(fd, file); // only now ->fd[fd] == file
+ *
+ * It is an invariant that a successfully allocated fd has a NULL entry
+ * in the array until the matching fd_install().
+ *
+ * If we fit the window, we have the fd to populate, yet no target file
+ * to close. Trying to ignore it and install our new file would violate
+ * the invariant and make fd_install() overwrite our file.
+ *
+ * Things can be done(tm) to handle this. However, the issue does not
+ * concern legitimate programs and we only need to make sure the kernel
+ * does not trip over it.
+ *
+ * The simplest way out is to return an error if we find ourselves here.
*
* POSIX is silent on the issue, we return -EBUSY.
*/
fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds);
- tofree = fdt->fd[fd];
+ tofree = rcu_dereference_raw(fdt->fd[fd]);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
diff --git a/fs/file_attr.c b/fs/file_attr.c
new file mode 100644
index 000000000000..12424d4945d0
--- /dev/null
+++ b/fs/file_attr.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/fs.h>
+#include <linux/security.h>
+#include <linux/fscrypt.h>
+#include <linux/fileattr.h>
+#include <linux/export.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+
+#include "internal.h"
+
+/**
+ * fileattr_fill_xflags - initialize fileattr with xflags
+ * @fa: fileattr pointer
+ * @xflags: FS_XFLAG_* flags
+ *
+ * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
+ * other fields are zeroed.
+ */
+void fileattr_fill_xflags(struct file_kattr *fa, u32 xflags)
+{
+ memset(fa, 0, sizeof(*fa));
+ fa->fsx_valid = true;
+ fa->fsx_xflags = xflags;
+ if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
+ fa->flags |= FS_IMMUTABLE_FL;
+ if (fa->fsx_xflags & FS_XFLAG_APPEND)
+ fa->flags |= FS_APPEND_FL;
+ if (fa->fsx_xflags & FS_XFLAG_SYNC)
+ fa->flags |= FS_SYNC_FL;
+ if (fa->fsx_xflags & FS_XFLAG_NOATIME)
+ fa->flags |= FS_NOATIME_FL;
+ if (fa->fsx_xflags & FS_XFLAG_NODUMP)
+ fa->flags |= FS_NODUMP_FL;
+ if (fa->fsx_xflags & FS_XFLAG_DAX)
+ fa->flags |= FS_DAX_FL;
+ if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
+ fa->flags |= FS_PROJINHERIT_FL;
+}
+EXPORT_SYMBOL(fileattr_fill_xflags);
+
+/**
+ * fileattr_fill_flags - initialize fileattr with flags
+ * @fa: fileattr pointer
+ * @flags: FS_*_FL flags
+ *
+ * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
+ * All other fields are zeroed.
+ */
+void fileattr_fill_flags(struct file_kattr *fa, u32 flags)
+{
+ memset(fa, 0, sizeof(*fa));
+ fa->flags_valid = true;
+ fa->flags = flags;
+ if (fa->flags & FS_SYNC_FL)
+ fa->fsx_xflags |= FS_XFLAG_SYNC;
+ if (fa->flags & FS_IMMUTABLE_FL)
+ fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
+ if (fa->flags & FS_APPEND_FL)
+ fa->fsx_xflags |= FS_XFLAG_APPEND;
+ if (fa->flags & FS_NODUMP_FL)
+ fa->fsx_xflags |= FS_XFLAG_NODUMP;
+ if (fa->flags & FS_NOATIME_FL)
+ fa->fsx_xflags |= FS_XFLAG_NOATIME;
+ if (fa->flags & FS_DAX_FL)
+ fa->fsx_xflags |= FS_XFLAG_DAX;
+ if (fa->flags & FS_PROJINHERIT_FL)
+ fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
+}
+EXPORT_SYMBOL(fileattr_fill_flags);
+
+/**
+ * vfs_fileattr_get - retrieve miscellaneous file attributes
+ * @dentry: the object to retrieve from
+ * @fa: fileattr pointer
+ *
+ * Call i_op->fileattr_get() callback, if exists.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ int error;
+
+ if (!inode->i_op->fileattr_get)
+ return -EOPNOTSUPP;
+
+ error = security_inode_file_getattr(dentry, fa);
+ if (error)
+ return error;
+
+ return inode->i_op->fileattr_get(dentry, fa);
+}
+EXPORT_SYMBOL(vfs_fileattr_get);
+
+static void fileattr_to_file_attr(const struct file_kattr *fa,
+ struct file_attr *fattr)
+{
+ __u32 mask = FS_XFLAGS_MASK;
+
+ memset(fattr, 0, sizeof(struct file_attr));
+ fattr->fa_xflags = fa->fsx_xflags & mask;
+ fattr->fa_extsize = fa->fsx_extsize;
+ fattr->fa_nextents = fa->fsx_nextents;
+ fattr->fa_projid = fa->fsx_projid;
+ fattr->fa_cowextsize = fa->fsx_cowextsize;
+}
+
+/**
+ * copy_fsxattr_to_user - copy fsxattr to userspace.
+ * @fa: fileattr pointer
+ * @ufa: fsxattr user pointer
+ *
+ * Return: 0 on success, or -EFAULT on failure.
+ */
+int copy_fsxattr_to_user(const struct file_kattr *fa, struct fsxattr __user *ufa)
+{
+ struct fsxattr xfa;
+ __u32 mask = FS_XFLAGS_MASK;
+
+ memset(&xfa, 0, sizeof(xfa));
+ xfa.fsx_xflags = fa->fsx_xflags & mask;
+ xfa.fsx_extsize = fa->fsx_extsize;
+ xfa.fsx_nextents = fa->fsx_nextents;
+ xfa.fsx_projid = fa->fsx_projid;
+ xfa.fsx_cowextsize = fa->fsx_cowextsize;
+
+ if (copy_to_user(ufa, &xfa, sizeof(xfa)))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(copy_fsxattr_to_user);
+
+static int file_attr_to_fileattr(const struct file_attr *fattr,
+ struct file_kattr *fa)
+{
+ __u64 mask = FS_XFLAGS_MASK;
+
+ if (fattr->fa_xflags & ~mask)
+ return -EINVAL;
+
+ fileattr_fill_xflags(fa, fattr->fa_xflags);
+ fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK;
+ fa->fsx_extsize = fattr->fa_extsize;
+ fa->fsx_projid = fattr->fa_projid;
+ fa->fsx_cowextsize = fattr->fa_cowextsize;
+
+ return 0;
+}
+
+static int copy_fsxattr_from_user(struct file_kattr *fa,
+ struct fsxattr __user *ufa)
+{
+ struct fsxattr xfa;
+ __u32 mask = FS_XFLAGS_MASK;
+
+ if (copy_from_user(&xfa, ufa, sizeof(xfa)))
+ return -EFAULT;
+
+ if (xfa.fsx_xflags & ~mask)
+ return -EOPNOTSUPP;
+
+ fileattr_fill_xflags(fa, xfa.fsx_xflags);
+ fa->fsx_xflags &= ~FS_XFLAG_RDONLY_MASK;
+ fa->fsx_extsize = xfa.fsx_extsize;
+ fa->fsx_nextents = xfa.fsx_nextents;
+ fa->fsx_projid = xfa.fsx_projid;
+ fa->fsx_cowextsize = xfa.fsx_cowextsize;
+
+ return 0;
+}
+
+/*
+ * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
+ * any invalid configurations.
+ *
+ * Note: must be called with inode lock held.
+ */
+static int fileattr_set_prepare(struct inode *inode,
+ const struct file_kattr *old_ma,
+ struct file_kattr *fa)
+{
+ int err;
+
+ /*
+ * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ * the relevant capability.
+ */
+ if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return -EPERM;
+
+ err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
+ if (err)
+ return err;
+
+ /*
+ * Project Quota ID state is only allowed to change from within the init
+ * namespace. Enforce that restriction only if we are trying to change
+ * the quota ID state. Everything else is allowed in user namespaces.
+ */
+ if (current_user_ns() != &init_user_ns) {
+ if (old_ma->fsx_projid != fa->fsx_projid)
+ return -EINVAL;
+ if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
+ FS_XFLAG_PROJINHERIT)
+ return -EINVAL;
+ } else {
+ /*
+ * Caller is allowed to change the project ID. If it is being
+ * changed, make sure that the new value is valid.
+ */
+ if (old_ma->fsx_projid != fa->fsx_projid &&
+ !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
+ return -EINVAL;
+ }
+
+ /* Check extent size hints. */
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+ !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+ !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+ return -EINVAL;
+
+ /*
+ * It is only valid to set the DAX flag on regular files and
+ * directories on filesystems.
+ */
+ if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+ !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+ return -EINVAL;
+
+ /* Extent size hints of zero turn off the flags. */
+ if (fa->fsx_extsize == 0)
+ fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+ if (fa->fsx_cowextsize == 0)
+ fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+ return 0;
+}
+
+/**
+ * vfs_fileattr_set - change miscellaneous file attributes
+ * @idmap: idmap of the mount
+ * @dentry: the object to change
+ * @fa: fileattr pointer
+ *
+ * After verifying permissions, call i_op->fileattr_set() callback, if
+ * exists.
+ *
+ * Verifying attributes involves retrieving current attributes with
+ * i_op->fileattr_get(), this also allows initializing attributes that have
+ * not been set by the caller to current values. Inode lock is held
+ * thoughout to prevent racing with another instance.
+ *
+ * Return: 0 on success, or a negative error on failure.
+ */
+int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct file_kattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct file_kattr old_ma = {};
+ int err;
+
+ if (!inode->i_op->fileattr_set)
+ return -EOPNOTSUPP;
+
+ if (!inode_owner_or_capable(idmap, inode))
+ return -EPERM;
+
+ inode_lock(inode);
+ err = vfs_fileattr_get(dentry, &old_ma);
+ if (!err) {
+ /* initialize missing bits from old_ma */
+ if (fa->flags_valid) {
+ fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
+ fa->fsx_extsize = old_ma.fsx_extsize;
+ fa->fsx_nextents = old_ma.fsx_nextents;
+ fa->fsx_projid = old_ma.fsx_projid;
+ fa->fsx_cowextsize = old_ma.fsx_cowextsize;
+ } else {
+ fa->flags |= old_ma.flags & ~FS_COMMON_FL;
+ }
+
+ err = fileattr_set_prepare(inode, &old_ma, fa);
+ if (err)
+ goto out;
+ err = security_inode_file_setattr(dentry, fa);
+ if (err)
+ goto out;
+ err = inode->i_op->fileattr_set(idmap, dentry, fa);
+ if (err)
+ goto out;
+ }
+
+out:
+ inode_unlock(inode);
+ return err;
+}
+EXPORT_SYMBOL(vfs_fileattr_set);
+
+int ioctl_getflags(struct file *file, unsigned int __user *argp)
+{
+ struct file_kattr fa = { .flags_valid = true }; /* hint only */
+ int err;
+
+ err = vfs_fileattr_get(file->f_path.dentry, &fa);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ if (!err)
+ err = put_user(fa.flags, argp);
+ return err;
+}
+EXPORT_SYMBOL(ioctl_getflags);
+
+int ioctl_setflags(struct file *file, unsigned int __user *argp)
+{
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct file_kattr fa;
+ unsigned int flags;
+ int err;
+
+ err = get_user(flags, argp);
+ if (!err) {
+ err = mnt_want_write_file(file);
+ if (!err) {
+ fileattr_fill_flags(&fa, flags);
+ err = vfs_fileattr_set(idmap, dentry, &fa);
+ mnt_drop_write_file(file);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(ioctl_setflags);
+
+int ioctl_fsgetxattr(struct file *file, void __user *argp)
+{
+ struct file_kattr fa = { .fsx_valid = true }; /* hint only */
+ int err;
+
+ err = vfs_fileattr_get(file->f_path.dentry, &fa);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ if (!err)
+ err = copy_fsxattr_to_user(&fa, argp);
+
+ return err;
+}
+EXPORT_SYMBOL(ioctl_fsgetxattr);
+
+int ioctl_fssetxattr(struct file *file, void __user *argp)
+{
+ struct mnt_idmap *idmap = file_mnt_idmap(file);
+ struct dentry *dentry = file->f_path.dentry;
+ struct file_kattr fa;
+ int err;
+
+ err = copy_fsxattr_from_user(&fa, argp);
+ if (!err) {
+ err = mnt_want_write_file(file);
+ if (!err) {
+ err = vfs_fileattr_set(idmap, dentry, &fa);
+ mnt_drop_write_file(file);
+ if (err == -EOPNOTSUPP)
+ err = -ENOIOCTLCMD;
+ }
+ }
+ return err;
+}
+EXPORT_SYMBOL(ioctl_fssetxattr);
+
+SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename,
+ struct file_attr __user *, ufattr, size_t, usize,
+ unsigned int, at_flags)
+{
+ struct path filepath __free(path_put) = {};
+ struct filename *name __free(putname) = NULL;
+ unsigned int lookup_flags = 0;
+ struct file_attr fattr;
+ struct file_kattr fa;
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST);
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ if (usize < FILE_ATTR_SIZE_VER0)
+ return -EINVAL;
+
+ name = getname_maybe_null(filename, at_flags);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ if (!name && dfd >= 0) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ filepath = fd_file(f)->f_path;
+ path_get(&filepath);
+ } else {
+ error = filename_lookup(dfd, name, lookup_flags, &filepath,
+ NULL);
+ if (error)
+ return error;
+ }
+
+ error = vfs_fileattr_get(filepath.dentry, &fa);
+ if (error)
+ return error;
+
+ fileattr_to_file_attr(&fa, &fattr);
+ error = copy_struct_to_user(ufattr, usize, &fattr,
+ sizeof(struct file_attr), NULL);
+
+ return error;
+}
+
+SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename,
+ struct file_attr __user *, ufattr, size_t, usize,
+ unsigned int, at_flags)
+{
+ struct path filepath __free(path_put) = {};
+ struct filename *name __free(putname) = NULL;
+ unsigned int lookup_flags = 0;
+ struct file_attr fattr;
+ struct file_kattr fa;
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct file_attr) < FILE_ATTR_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct file_attr) != FILE_ATTR_SIZE_LATEST);
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags |= LOOKUP_FOLLOW;
+
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ if (usize < FILE_ATTR_SIZE_VER0)
+ return -EINVAL;
+
+ error = copy_struct_from_user(&fattr, sizeof(struct file_attr), ufattr,
+ usize);
+ if (error)
+ return error;
+
+ error = file_attr_to_fileattr(&fattr, &fa);
+ if (error)
+ return error;
+
+ name = getname_maybe_null(filename, at_flags);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ if (!name && dfd >= 0) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ filepath = fd_file(f)->f_path;
+ path_get(&filepath);
+ } else {
+ error = filename_lookup(dfd, name, lookup_flags, &filepath,
+ NULL);
+ if (error)
+ return error;
+ }
+
+ error = mnt_want_write(filepath.mnt);
+ if (!error) {
+ error = vfs_fileattr_set(mnt_idmap(filepath.mnt),
+ filepath.dentry, &fa);
+ mnt_drop_write(filepath.mnt);
+ }
+
+ return error;
+}
diff --git a/fs/file_table.c b/fs/file_table.c
index 7f7c378c6e31..81c72576e548 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -52,17 +52,20 @@ struct backing_file {
};
};
-static inline struct backing_file *backing_file(struct file *f)
-{
- return container_of(f, struct backing_file, file);
-}
+#define backing_file(f) container_of(f, struct backing_file, file)
-struct path *backing_file_user_path(struct file *f)
+struct path *backing_file_user_path(const struct file *f)
{
return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);
+void backing_file_set_user_path(struct file *f, const struct path *path)
+{
+ backing_file(f)->user_path = *path;
+}
+EXPORT_SYMBOL_GPL(backing_file_set_user_path);
+
static inline void file_free(struct file *f)
{
security_file_free(f);
@@ -102,7 +105,7 @@ EXPORT_SYMBOL_GPL(get_max_files);
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
- files_stat.nr_files = get_nr_files();
+ files_stat.nr_files = percpu_counter_sum_positive(&nr_files);
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
@@ -194,6 +197,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* refcount bumps we should reinitialize the reused file first.
*/
file_ref_init(&f->f_ref, 1);
+ /*
+ * Disable permission and pre-content events for all files by default.
+ * They may be enabled later by fsnotify_open_perm_and_set_mode().
+ */
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
return 0;
}
@@ -216,7 +224,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
/*
* Privileged users can go above max_files
*/
- if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+ if (unlikely(get_nr_files() >= files_stat.max_files) &&
+ !capable(CAP_SYS_ADMIN)) {
/*
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
@@ -351,9 +360,7 @@ static struct file *alloc_file(const struct path *path, int flags,
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
struct vfsmount *mnt, struct path *path)
{
- struct qstr this = QSTR_INIT(name, strlen(name));
-
- path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
+ path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name));
if (!path->dentry)
return -ENOMEM;
path->mnt = mntget(mnt);
@@ -377,7 +384,13 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
if (IS_ERR(file)) {
ihold(inode);
path_put(&path);
+ return file;
}
+ /*
+ * Disable all fsnotify events for pseudo files by default.
+ * They may be enabled by caller with file_set_fsnotify_mode().
+ */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY);
return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);
@@ -402,6 +415,11 @@ struct file *alloc_file_pseudo_noaccount(struct inode *inode,
return file;
}
file_init_path(file, &path, fops);
+ /*
+ * Disable all fsnotify events for pseudo files by default.
+ * They may be enabled by caller with file_set_fsnotify_mode().
+ */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY);
return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
@@ -497,31 +515,37 @@ void flush_delayed_fput(void)
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);
-void fput(struct file *file)
+static void __fput_deferred(struct file *file)
{
- if (file_ref_put(&file->f_ref)) {
- struct task_struct *task = current;
+ struct task_struct *task = current;
+
+ if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+ file_free(file);
+ return;
+ }
- if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
- file_free(file);
+ if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
+ init_task_work(&file->f_task_work, ____fput);
+ if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
return;
- }
- if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_task_work, ____fput);
- if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
- return;
- /*
- * After this task has run exit_task_work(),
- * task_work_add() will fail. Fall through to delayed
- * fput to avoid leaking *file.
- */
- }
-
- if (llist_add(&file->f_llist, &delayed_fput_list))
- schedule_delayed_work(&delayed_fput_work, 1);
+ /*
+ * After this task has run exit_task_work(),
+ * task_work_add() will fail. Fall through to delayed
+ * fput to avoid leaking *file.
+ */
}
+
+ if (llist_add(&file->f_llist, &delayed_fput_list))
+ schedule_delayed_work(&delayed_fput_work, 1);
}
+void fput(struct file *file)
+{
+ if (unlikely(file_ref_put(&file->f_ref)))
+ __fput_deferred(file);
+}
+EXPORT_SYMBOL(fput);
+
/*
* synchronous analog of fput(); for kernel threads that might be needed
* in some umount() (and thus can't use flush_delayed_fput() without
@@ -535,10 +559,32 @@ void __fput_sync(struct file *file)
if (file_ref_put(&file->f_ref))
__fput(file);
}
-
-EXPORT_SYMBOL(fput);
EXPORT_SYMBOL(__fput_sync);
+/*
+ * Equivalent to __fput_sync(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close_sync(struct file *file)
+{
+ if (likely(file_ref_put_close(&file->f_ref)))
+ __fput(file);
+}
+
+/*
+ * Equivalent to fput(), but optimized for being called with the last
+ * reference.
+ *
+ * See file_ref_put_close() for details.
+ */
+void fput_close(struct file *file)
+{
+ if (file_ref_put_close(&file->f_ref))
+ __fput_deferred(file);
+}
+
void __init files_init(void)
{
struct kmem_cache_args args = {
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 58b9067b2391..95e5256821a5 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -156,15 +156,19 @@ static int fs_index(const char __user * __name)
static int fs_name(unsigned int index, char __user * buf)
{
struct file_system_type * tmp;
- int len, res;
+ int len, res = -EINVAL;
read_lock(&file_systems_lock);
- for (tmp = file_systems; tmp; tmp = tmp->next, index--)
- if (index <= 0 && try_module_get(tmp->owner))
+ for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
+ if (index == 0) {
+ if (try_module_get(tmp->owner))
+ res = 0;
break;
+ }
+ }
read_unlock(&file_systems_lock);
- if (!tmp)
- return -EINVAL;
+ if (res)
+ return res;
/* OK, we got the reference, so we can safely block */
len = strlen(tmp->name) + 1;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3cd99e2dc6ac..a07b8cf73ae2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -65,7 +65,7 @@ struct wb_writeback_work {
* timestamps written to disk after 12 hours, but in the worst case a
* few inodes might not their timestamps updated for 24 hours.
*/
-unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+static unsigned int dirtytime_expire_interval = 12 * 60 * 60;
static inline struct inode *wb_inode(struct list_head *head)
{
@@ -2435,14 +2435,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
}
-static int __init start_dirtytime_writeback(void)
-{
- schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
- return 0;
-}
-__initcall(start_dirtytime_writeback);
-
-int dirtytime_interval_handler(const struct ctl_table *table, int write,
+static int dirtytime_interval_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -2453,6 +2446,25 @@ int dirtytime_interval_handler(const struct ctl_table *table, int write,
return ret;
}
+static const struct ctl_table vm_fs_writeback_table[] = {
+ {
+ .procname = "dirtytime_expire_seconds",
+ .data = &dirtytime_expire_interval,
+ .maxlen = sizeof(dirtytime_expire_interval),
+ .mode = 0644,
+ .proc_handler = dirtytime_interval_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+};
+
+static int __init start_dirtytime_writeback(void)
+{
+ schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+ register_sysctl_init("vm", vm_fs_writeback_table);
+ return 0;
+}
+__initcall(start_dirtytime_writeback);
+
/**
* __mark_inode_dirty - internal function to mark an inode dirty
*
@@ -2596,10 +2608,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
wakeup_bdi = inode_io_list_move_locked(inode, wb,
dirty_list);
- spin_unlock(&wb->list_lock);
- spin_unlock(&inode->i_lock);
- trace_writeback_dirty_inode_enqueue(inode);
-
/*
* If this is the first dirty inode for this bdi,
* we have to wake-up the corresponding bdi thread
@@ -2609,6 +2617,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
if (wakeup_bdi &&
(wb->bdi->capabilities & BDI_CAP_WRITEBACK))
wb_wakeup_delayed(wb);
+
+ spin_unlock(&wb->list_lock);
+ spin_unlock(&inode->i_lock);
+ trace_writeback_dirty_inode_enqueue(inode);
+
return;
}
}
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 582d33e81117..666e61753aed 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -222,7 +222,7 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
char *value = strchr(key, '=');
if (value) {
- if (value == key)
+ if (unlikely(value == key))
continue;
*value++ = 0;
v_len = strlen(value);
@@ -449,6 +449,10 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt,
printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
break;
+ case 'i':
+ printk(KERN_INFO "%s%s%pV\n", prefix ? prefix : "",
+ prefix ? ": " : "", &vaf);
+ break;
default:
printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "",
prefix ? ": " : "", &vaf);
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index e635a81e17d9..c092a9f79e32 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -380,58 +380,9 @@ EXPORT_SYMBOL(fs_param_is_path);
#ifdef CONFIG_VALIDATE_FS_PARSER
/**
- * validate_constant_table - Validate a constant table
- * @tbl: The constant table to validate.
- * @tbl_size: The size of the table.
- * @low: The lowest permissible value.
- * @high: The highest permissible value.
- * @special: One special permissible value outside of the range.
- */
-bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size,
- int low, int high, int special)
-{
- size_t i;
- bool good = true;
-
- if (tbl_size == 0) {
- pr_warn("VALIDATE C-TBL: Empty\n");
- return true;
- }
-
- for (i = 0; i < tbl_size; i++) {
- if (!tbl[i].name) {
- pr_err("VALIDATE C-TBL[%zu]: Null\n", i);
- good = false;
- } else if (i > 0 && tbl[i - 1].name) {
- int c = strcmp(tbl[i-1].name, tbl[i].name);
-
- if (c == 0) {
- pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n",
- i, tbl[i].name);
- good = false;
- }
- if (c > 0) {
- pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n",
- i, tbl[i-1].name, tbl[i].name);
- good = false;
- }
- }
-
- if (tbl[i].value != special &&
- (tbl[i].value < low || tbl[i].value > high)) {
- pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n",
- i, tbl[i].name, tbl[i].value, low, high);
- good = false;
- }
- }
-
- return good;
-}
-
-/**
- * fs_validate_description - Validate a parameter description
- * @name: The parameter name to search for.
- * @desc: The parameter description to validate.
+ * fs_validate_description - Validate a parameter specification array
+ * @name: Owner name of the parameter specification array
+ * @desc: The parameter specification array to validate.
*/
bool fs_validate_description(const char *name,
const struct fs_parameter_spec *desc)
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 64c2d0814ed6..28be762ac1c6 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -17,12 +17,10 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
struct path old_root;
path_get(path);
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
old_root = fs->root;
fs->root = *path;
- write_seqcount_end(&fs->seq);
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
if (old_root.dentry)
path_put(&old_root);
}
@@ -36,12 +34,10 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
struct path old_pwd;
path_get(path);
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
- write_seqcount_end(&fs->seq);
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
if (old_pwd.dentry)
path_put(&old_pwd);
@@ -67,16 +63,14 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
fs = p->fs;
if (fs) {
int hits = 0;
- spin_lock(&fs->lock);
- write_seqcount_begin(&fs->seq);
+ write_seqlock(&fs->seq);
hits += replace_path(&fs->root, old_root, new_root);
hits += replace_path(&fs->pwd, old_root, new_root);
- write_seqcount_end(&fs->seq);
while (hits--) {
count++;
path_get(new_root);
}
- spin_unlock(&fs->lock);
+ write_sequnlock(&fs->seq);
}
task_unlock(p);
}
@@ -99,10 +93,10 @@ void exit_fs(struct task_struct *tsk)
if (fs) {
int kill;
task_lock(tsk);
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
tsk->fs = NULL;
kill = !--fs->users;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
task_unlock(tsk);
if (kill)
free_fs_struct(fs);
@@ -116,16 +110,15 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
if (fs) {
fs->users = 1;
fs->in_exec = 0;
- spin_lock_init(&fs->lock);
- seqcount_spinlock_init(&fs->seq, &fs->lock);
+ seqlock_init(&fs->seq);
fs->umask = old->umask;
- spin_lock(&old->lock);
+ read_seqlock_excl(&old->seq);
fs->root = old->root;
path_get(&fs->root);
fs->pwd = old->pwd;
path_get(&fs->pwd);
- spin_unlock(&old->lock);
+ read_sequnlock_excl(&old->seq);
}
return fs;
}
@@ -140,10 +133,10 @@ int unshare_fs_struct(void)
return -ENOMEM;
task_lock(current);
- spin_lock(&fs->lock);
+ read_seqlock_excl(&fs->seq);
kill = !--fs->users;
current->fs = new_fs;
- spin_unlock(&fs->lock);
+ read_sequnlock_excl(&fs->seq);
task_unlock(current);
if (kill)
@@ -162,7 +155,6 @@ EXPORT_SYMBOL(current_umask);
/* to be mentioned only in INIT_TASK */
struct fs_struct init_fs = {
.users = 1,
- .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
- .seq = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
+ .seq = __SEQLOCK_UNLOCKED(init_fs.seq),
.umask = 0022,
};
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 094a7f510edf..1aaf4cb2afb2 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_FD:
param.type = fs_value_is_file;
ret = -EBADF;
- param.file = fget(aux);
+ param.file = fget_raw(aux);
if (!param.file)
goto out_key;
param.dirfd = aux;
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index ca215a3cba3e..a774166264de 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -2,6 +2,7 @@
config FUSE_FS
tristate "FUSE (Filesystem in Userspace) support"
select FS_POSIX_ACL
+ select FS_IOMAP
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 2a730d88cc3b..bb407705603c 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -11,6 +11,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs_context.h>
+#include <linux/namei.h>
#define FUSE_CTL_SUPER_MAGIC 0x65735543
@@ -212,7 +213,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
struct dentry *dentry;
struct inode *inode;
- BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
dentry = d_alloc_name(parent, name);
if (!dentry)
return NULL;
@@ -236,8 +236,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
inode->i_private = fc;
d_add(dentry, inode);
- fc->ctl_dentry[fc->ctl_ndents++] = dentry;
-
return dentry;
}
@@ -280,27 +278,29 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
return -ENOMEM;
}
+static void remove_one(struct dentry *dentry)
+{
+ d_inode(dentry)->i_private = NULL;
+}
+
/*
* Remove a connection from the control filesystem (if it exists).
* Caller must hold fuse_mutex
*/
void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
- int i;
+ struct dentry *dentry;
+ char name[32];
if (!fuse_control_sb || fc->no_control)
return;
- for (i = fc->ctl_ndents - 1; i >= 0; i--) {
- struct dentry *dentry = fc->ctl_dentry[i];
- d_inode(dentry)->i_private = NULL;
- if (!i) {
- /* Get rid of submounts: */
- d_invalidate(dentry);
- }
- dput(dentry);
+ sprintf(name, "%u", fc->dev);
+ dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root);
+ if (!IS_ERR(dentry)) {
+ simple_recursive_removal(dentry, remove_one);
+ dput(dentry); // paired with lookup_noperm_positive_unlocked()
}
- drop_nlink(d_inode(fuse_control_sb->s_root));
}
static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc)
@@ -346,12 +346,8 @@ static int fuse_ctl_init_fs_context(struct fs_context *fsc)
static void fuse_ctl_kill_sb(struct super_block *sb)
{
- struct fuse_conn *fc;
-
mutex_lock(&fuse_mutex);
fuse_control_sb = NULL;
- list_for_each_entry(fc, &fuse_conn_list, entry)
- fc->ctl_ndents = 0;
mutex_unlock(&fuse_mutex);
kill_litter_super(sb);
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 0b6ee6dd1fd6..ac6d4c1064cc 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -10,7 +10,6 @@
#include <linux/dax.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
-#include <linux/pfn_t.h>
#include <linux/iomap.h>
#include <linux/interval_tree.h>
@@ -666,36 +665,12 @@ static void fuse_wait_dax_page(struct inode *inode)
filemap_invalidate_lock(inode->i_mapping);
}
-/* Should be called with mapping->invalidate_lock held exclusively */
-static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
- loff_t start, loff_t end)
-{
- struct page *page;
-
- page = dax_layout_busy_page_range(inode->i_mapping, start, end);
- if (!page)
- return 0;
-
- *retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, fuse_wait_dax_page(inode));
-}
-
-/* dmap_end == 0 leads to unmapping of whole file */
+/* Should be called with mapping->invalidate_lock held exclusively. */
int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
u64 dmap_end)
{
- bool retry;
- int ret;
-
- do {
- retry = false;
- ret = __fuse_dax_break_layouts(inode, &retry, dmap_start,
- dmap_end);
- } while (ret == 0 && retry);
-
- return ret;
+ return dax_break_layout(inode, dmap_start, dmap_end,
+ fuse_wait_dax_page);
}
ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -781,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order,
vm_fault_t ret;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
- pfn_t pfn;
+ unsigned long pfn;
int error = 0;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_conn_dax *fcd = fc->dax;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 5b5f789b37eb..e80cd8f2c049 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -23,6 +23,7 @@
#include <linux/swap.h>
#include <linux/splice.h>
#include <linux/sched.h>
+#include <linux/seq_file.h>
#define CREATE_TRACE_POINTS
#include "fuse_trace.h"
@@ -32,6 +33,100 @@ MODULE_ALIAS("devname:fuse");
static struct kmem_cache *fuse_req_cachep;
+const unsigned long fuse_timeout_timer_freq =
+ secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ);
+
+bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list)
+{
+ struct fuse_req *req;
+
+ req = list_first_entry_or_null(list, struct fuse_req, list);
+ if (!req)
+ return false;
+ return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout);
+}
+
+static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing)
+{
+ int i;
+
+ for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+ if (fuse_request_expired(fc, &processing[i]))
+ return true;
+
+ return false;
+}
+
+/*
+ * Check if any requests aren't being completed by the time the request timeout
+ * elapses. To do so, we:
+ * - check the fiq pending list
+ * - check the bg queue
+ * - check the fpq io and processing lists
+ *
+ * To make this fast, we only check against the head request on each list since
+ * these are generally queued in order of creation time (eg newer requests get
+ * queued to the tail). We might miss a few edge cases (eg requests transitioning
+ * between lists, re-sent requests at the head of the pending list having a
+ * later creation time than other requests on that list, etc.) but that is fine
+ * since if the request never gets fulfilled, it will eventually be caught.
+ */
+void fuse_check_timeout(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct fuse_conn *fc = container_of(dwork, struct fuse_conn,
+ timeout.work);
+ struct fuse_iqueue *fiq = &fc->iq;
+ struct fuse_dev *fud;
+ struct fuse_pqueue *fpq;
+ bool expired = false;
+
+ if (!atomic_read(&fc->num_waiting))
+ goto out;
+
+ spin_lock(&fiq->lock);
+ expired = fuse_request_expired(fc, &fiq->pending);
+ spin_unlock(&fiq->lock);
+ if (expired)
+ goto abort_conn;
+
+ spin_lock(&fc->bg_lock);
+ expired = fuse_request_expired(fc, &fc->bg_queue);
+ spin_unlock(&fc->bg_lock);
+ if (expired)
+ goto abort_conn;
+
+ spin_lock(&fc->lock);
+ if (!fc->connected) {
+ spin_unlock(&fc->lock);
+ return;
+ }
+ list_for_each_entry(fud, &fc->devices, entry) {
+ fpq = &fud->pq;
+ spin_lock(&fpq->lock);
+ if (fuse_request_expired(fc, &fpq->io) ||
+ fuse_fpq_processing_expired(fc, fpq->processing)) {
+ spin_unlock(&fpq->lock);
+ spin_unlock(&fc->lock);
+ goto abort_conn;
+ }
+
+ spin_unlock(&fpq->lock);
+ }
+ spin_unlock(&fc->lock);
+
+ if (fuse_uring_request_expired(fc))
+ goto abort_conn;
+
+out:
+ queue_delayed_work(system_wq, &fc->timeout.work,
+ fuse_timeout_timer_freq);
+ return;
+
+abort_conn:
+ fuse_abort_conn(fc);
+}
+
static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
{
INIT_LIST_HEAD(&req->list);
@@ -40,6 +135,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req)
refcount_set(&req->count, 1);
__set_bit(FR_PENDING, &req->flags);
req->fm = fm;
+ req->create_time = jiffies;
}
static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags)
@@ -77,7 +173,7 @@ void fuse_set_initialized(struct fuse_conn *fc)
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
return !fc->initialized || (for_background && fc->blocked) ||
- (fc->io_uring && !fuse_uring_ready(fc));
+ (fc->io_uring && fc->connected && !fuse_uring_ready(fc));
}
static void fuse_drop_waiting(struct fuse_conn *fc)
@@ -407,6 +503,24 @@ static int queue_interrupt(struct fuse_req *req)
return 0;
}
+bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (test_bit(FR_PENDING, &req->flags)) {
+ /*
+ * FR_PENDING does not get cleared as the request will end
+ * up in destruction anyway.
+ */
+ list_del(&req->list);
+ spin_unlock(lock);
+ __fuse_put_request(req);
+ req->out.h.error = -EINTR;
+ return true;
+ }
+ spin_unlock(lock);
+ return false;
+}
+
static void request_wait_answer(struct fuse_req *req)
{
struct fuse_conn *fc = req->fm->fc;
@@ -428,22 +542,20 @@ static void request_wait_answer(struct fuse_req *req)
}
if (!test_bit(FR_FORCE, &req->flags)) {
+ bool removed;
+
/* Only fatal signals may interrupt this */
err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
if (!err)
return;
- spin_lock(&fiq->lock);
- /* Request is not yet in userspace, bail out */
- if (test_bit(FR_PENDING, &req->flags)) {
- list_del(&req->list);
- spin_unlock(&fiq->lock);
- __fuse_put_request(req);
- req->out.h.error = -EINTR;
+ if (test_bit(FR_URING, &req->flags))
+ removed = fuse_uring_remove_pending_req(req);
+ else
+ removed = fuse_remove_pending_req(req, &fiq->lock);
+ if (removed)
return;
- }
- spin_unlock(&fiq->lock);
}
/*
@@ -705,7 +817,7 @@ static int unlock_request(struct fuse_req *req)
return err;
}
-void fuse_copy_init(struct fuse_copy_state *cs, int write,
+void fuse_copy_init(struct fuse_copy_state *cs, bool write,
struct iov_iter *iter)
{
memset(cs, 0, sizeof(*cs));
@@ -838,10 +950,16 @@ static int fuse_check_folio(struct folio *folio)
return 0;
}
-static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+/*
+ * Attempt to steal a page from the splice() pipe and move it into the
+ * pagecache. If successful, the pointer in @pagep will be updated. The
+ * folio that was originally in @pagep will lose a reference and the new
+ * folio returned in @pagep will carry a reference.
+ */
+static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop)
{
int err;
- struct folio *oldfolio = page_folio(*pagep);
+ struct folio *oldfolio = *foliop;
struct folio *newfolio;
struct pipe_buffer *buf = cs->pipebufs;
@@ -862,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
cs->pipebufs++;
cs->nr_segs--;
- if (cs->len != PAGE_SIZE)
+ if (cs->len != folio_size(oldfolio))
goto out_fallback;
if (!pipe_buf_try_steal(cs->pipe, buf))
@@ -908,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
if (test_bit(FR_ABORTED, &cs->req->flags))
err = -ENOENT;
else
- *pagep = &newfolio->page;
+ *foliop = newfolio;
spin_unlock(&cs->req->waitq.lock);
if (err) {
@@ -941,8 +1059,8 @@ out_fallback:
goto out_put_old;
}
-static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
- unsigned offset, unsigned count)
+static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio,
+ unsigned offset, unsigned count)
{
struct pipe_buffer *buf;
int err;
@@ -950,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
if (cs->nr_segs >= cs->pipe->max_usage)
return -EIO;
- get_page(page);
+ folio_get(folio);
err = unlock_request(cs->req);
if (err) {
- put_page(page);
+ folio_put(folio);
return err;
}
fuse_copy_finish(cs);
buf = cs->pipebufs;
- buf->page = page;
+ buf->page = &folio->page;
buf->offset = offset;
buf->len = count;
@@ -972,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
}
/*
- * Copy a page in the request to/from the userspace buffer. Must be
+ * Copy a folio in the request to/from the userspace buffer. Must be
* done atomically
*/
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
- unsigned offset, unsigned count, int zeroing)
+static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop,
+ unsigned offset, unsigned count, int zeroing)
{
int err;
- struct page *page = *pagep;
+ struct folio *folio = *foliop;
+ size_t size;
- if (page && zeroing && count < PAGE_SIZE)
- clear_highpage(page);
+ if (folio) {
+ size = folio_size(folio);
+ if (zeroing && count < size)
+ folio_zero_range(folio, 0, size);
+ }
while (count) {
- if (cs->write && cs->pipebufs && page) {
+ if (cs->write && cs->pipebufs && folio) {
/*
* Can't control lifetime of pipe buffers, so always
* copy user pages.
@@ -995,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
if (err)
return err;
} else {
- return fuse_ref_page(cs, page, offset, count);
+ return fuse_ref_folio(cs, folio, offset, count);
}
} else if (!cs->len) {
- if (cs->move_pages && page &&
- offset == 0 && count == PAGE_SIZE) {
- err = fuse_try_move_page(cs, pagep);
+ if (cs->move_folios && folio &&
+ offset == 0 && count == size) {
+ err = fuse_try_move_folio(cs, foliop);
if (err <= 0)
return err;
} else {
@@ -1009,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
return err;
}
}
- if (page) {
- void *mapaddr = kmap_local_page(page);
- void *buf = mapaddr + offset;
- offset += fuse_copy_do(cs, &buf, &count);
+ if (folio) {
+ void *mapaddr = kmap_local_folio(folio, offset);
+ void *buf = mapaddr;
+ unsigned int copy = count;
+ unsigned int bytes_copied;
+
+ if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset))
+ copy = PAGE_SIZE - offset_in_page(offset);
+
+ bytes_copied = fuse_copy_do(cs, &buf, &copy);
kunmap_local(mapaddr);
+ offset += bytes_copied;
+ count -= bytes_copied;
} else
offset += fuse_copy_do(cs, NULL, &count);
}
- if (page && !cs->write)
- flush_dcache_page(page);
+ if (folio && !cs->write)
+ flush_dcache_folio(folio);
return 0;
}
-/* Copy pages in the request to/from userspace buffer */
-static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
- int zeroing)
+/* Copy folios in the request to/from userspace buffer */
+static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes,
+ int zeroing)
{
unsigned i;
struct fuse_req *req = cs->req;
@@ -1034,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
int err;
unsigned int offset = ap->descs[i].offset;
unsigned int count = min(nbytes, ap->descs[i].length);
- struct page *orig, *pagep;
- orig = pagep = &ap->folios[i]->page;
-
- err = fuse_copy_page(cs, &pagep, offset, count, zeroing);
+ err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing);
if (err)
return err;
nbytes -= count;
-
- /*
- * fuse_copy_page may have moved a page from a pipe instead of
- * copying into our given page, so update the folios if it was
- * replaced.
- */
- if (pagep != orig)
- ap->folios[i] = page_folio(pagep);
}
return 0;
}
@@ -1080,7 +1199,7 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
for (i = 0; !err && i < numargs; i++) {
struct fuse_arg *arg = &args[i];
if (i == numargs - 1 && argpages)
- err = fuse_copy_pages(cs, arg->size, zeroing);
+ err = fuse_copy_folios(cs, arg->size, zeroing);
else
err = fuse_copy_one(cs, arg->value, arg->size);
}
@@ -1421,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
if (!user_backed_iter(to))
return -EINVAL;
- fuse_copy_init(&cs, 1, to);
+ fuse_copy_init(&cs, true, to);
return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to));
}
@@ -1444,14 +1563,14 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
if (!bufs)
return -ENOMEM;
- fuse_copy_init(&cs, 1, NULL);
+ fuse_copy_init(&cs, true, NULL);
cs.pipebufs = bufs;
cs.pipe = pipe;
ret = fuse_dev_do_read(fud, in, &cs, len);
if (ret < 0)
goto out;
- if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) {
+ if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) {
ret = -EIO;
goto out;
}
@@ -1527,14 +1646,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_inval_entry_out outarg;
- int err = -ENOMEM;
- char *buf;
+ int err;
+ char *buf = NULL;
struct qstr name;
- buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- goto err;
-
err = -EINVAL;
if (size < sizeof(outarg))
goto err;
@@ -1544,13 +1659,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
goto err;
err = -ENAMETOOLONG;
- if (outarg.namelen > FUSE_NAME_MAX)
+ if (outarg.namelen > fc->name_max)
goto err;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
goto err;
+ err = -ENOMEM;
+ buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
name.name = buf;
name.len = outarg.namelen;
err = fuse_copy_one(cs, buf, outarg.namelen + 1);
@@ -1575,14 +1695,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
struct fuse_copy_state *cs)
{
struct fuse_notify_delete_out outarg;
- int err = -ENOMEM;
- char *buf;
+ int err;
+ char *buf = NULL;
struct qstr name;
- buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
- if (!buf)
- goto err;
-
err = -EINVAL;
if (size < sizeof(outarg))
goto err;
@@ -1592,13 +1708,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
goto err;
err = -ENAMETOOLONG;
- if (outarg.namelen > FUSE_NAME_MAX)
+ if (outarg.namelen > fc->name_max)
goto err;
err = -EINVAL;
if (size != sizeof(outarg) + outarg.namelen + 1)
goto err;
+ err = -ENOMEM;
+ buf = kzalloc(outarg.namelen + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
name.name = buf;
name.len = outarg.namelen;
err = fuse_copy_one(cs, buf, outarg.namelen + 1);
@@ -1667,20 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
num = outarg.size;
while (num) {
struct folio *folio;
- struct page *page;
- unsigned int this_num;
+ unsigned int folio_offset;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
folio = filemap_grab_folio(mapping, index);
err = PTR_ERR(folio);
if (IS_ERR(folio))
goto out_iput;
- page = &folio->page;
- this_num = min_t(unsigned, num, folio_size(folio) - offset);
- err = fuse_copy_page(cs, &page, offset, this_num, 0);
+ folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
+ nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset);
+ nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0);
if (!folio_test_uptodate(folio) && !err && offset == 0 &&
- (this_num == folio_size(folio) || file_size == end)) {
- folio_zero_segment(folio, this_num, folio_size(folio));
+ (nr_bytes == folio_size(folio) || file_size == end)) {
+ folio_zero_segment(folio, nr_bytes, folio_size(folio));
folio_mark_uptodate(folio);
}
folio_unlock(folio);
@@ -1689,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
if (err)
goto out_iput;
- num -= this_num;
+ num -= nr_bytes;
offset = 0;
- index++;
+ index += nr_pages;
}
err = 0;
@@ -1730,7 +1854,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
unsigned int num;
unsigned int offset;
size_t total_len = 0;
- unsigned int num_pages, cur_pages = 0;
+ unsigned int num_pages;
struct fuse_conn *fc = fm->fc;
struct fuse_retrieve_args *ra;
size_t args_size = sizeof(*ra);
@@ -1748,6 +1872,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
num_pages = min(num_pages, fc->max_pages);
+ num = min(num, num_pages << PAGE_SHIFT);
args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0]));
@@ -1768,25 +1893,29 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode,
index = outarg->offset >> PAGE_SHIFT;
- while (num && cur_pages < num_pages) {
+ while (num) {
struct folio *folio;
- unsigned int this_num;
+ unsigned int folio_offset;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
folio = filemap_get_folio(mapping, index);
if (IS_ERR(folio))
break;
- this_num = min_t(unsigned, num, PAGE_SIZE - offset);
+ folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
+ nr_bytes = min(folio_size(folio) - folio_offset, num);
+ nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
ap->folios[ap->num_folios] = folio;
- ap->descs[ap->num_folios].offset = offset;
- ap->descs[ap->num_folios].length = this_num;
+ ap->descs[ap->num_folios].offset = folio_offset;
+ ap->descs[ap->num_folios].length = nr_bytes;
ap->num_folios++;
- cur_pages++;
offset = 0;
- num -= this_num;
- total_len += this_num;
- index++;
+ num -= nr_bytes;
+ total_len += nr_bytes;
+ index += nr_pages;
}
ra->inarg.offset = outarg->offset;
ra->inarg.size = total_len;
@@ -1902,11 +2031,24 @@ static int fuse_notify_resend(struct fuse_conn *fc)
return 0;
}
+/*
+ * Increments the fuse connection epoch. This will result of dentries from
+ * previous epochs to be invalidated.
+ *
+ * XXX optimization: add call to shrink_dcache_sb()?
+ */
+static int fuse_notify_inc_epoch(struct fuse_conn *fc)
+{
+ atomic_inc(&fc->epoch);
+
+ return 0;
+}
+
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
- /* Don't try to move pages (yet) */
- cs->move_pages = 0;
+ /* Don't try to move folios (yet) */
+ cs->move_folios = false;
switch (code) {
case FUSE_NOTIFY_POLL:
@@ -1930,6 +2072,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
case FUSE_NOTIFY_RESEND:
return fuse_notify_resend(fc);
+ case FUSE_NOTIFY_INC_EPOCH:
+ return fuse_notify_inc_epoch(fc);
+
default:
fuse_copy_finish(cs);
return -EINVAL;
@@ -2054,7 +2199,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
spin_unlock(&fpq->lock);
cs->req = req;
if (!req->args->page_replace)
- cs->move_pages = 0;
+ cs->move_folios = false;
if (oh.error)
err = nbytes != sizeof(oh) ? -EINVAL : 0;
@@ -2092,7 +2237,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
if (!user_backed_iter(from))
return -EINVAL;
- fuse_copy_init(&cs, 0, from);
+ fuse_copy_init(&cs, false, from);
return fuse_dev_do_write(fud, &cs, iov_iter_count(from));
}
@@ -2101,7 +2246,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct file *out, loff_t *ppos,
size_t len, unsigned int flags)
{
- unsigned int head, tail, mask, count;
+ unsigned int head, tail, count;
unsigned nbuf;
unsigned idx;
struct pipe_buffer *bufs;
@@ -2118,8 +2263,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
- count = head - tail;
+ count = pipe_occupancy(head, tail);
bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL);
if (!bufs) {
@@ -2129,8 +2273,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
nbuf = 0;
rem = 0;
- for (idx = tail; idx != head && rem < len; idx++)
- rem += pipe->bufs[idx & mask].len;
+ for (idx = tail; !pipe_empty(head, idx) && rem < len; idx++)
+ rem += pipe_buf(pipe, idx)->len;
ret = -EINVAL;
if (rem < len)
@@ -2141,10 +2285,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
struct pipe_buffer *ibuf;
struct pipe_buffer *obuf;
- if (WARN_ON(nbuf >= count || tail == head))
+ if (WARN_ON(nbuf >= count || pipe_empty(head, tail)))
goto out_free;
- ibuf = &pipe->bufs[tail & mask];
+ ibuf = pipe_buf(pipe, tail);
obuf = &bufs[nbuf];
if (rem >= ibuf->len) {
@@ -2167,13 +2311,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
}
pipe_unlock(pipe);
- fuse_copy_init(&cs, 0, NULL);
+ fuse_copy_init(&cs, false, NULL);
cs.pipebufs = bufs;
cs.nr_segs = nbuf;
cs.pipe = pipe;
if (flags & SPLICE_F_MOVE)
- cs.move_pages = 1;
+ cs.move_folios = true;
ret = fuse_dev_do_write(fud, &cs, len);
@@ -2270,6 +2414,9 @@ void fuse_abort_conn(struct fuse_conn *fc)
LIST_HEAD(to_end);
unsigned int i;
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work(&fc->timeout.work);
+
/* Background queuing checks fc->connected under bg_lock */
spin_lock(&fc->bg_lock);
fc->connected = 0;
@@ -2481,6 +2628,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
}
}
+#ifdef CONFIG_PROC_FS
+static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file)
+{
+ struct fuse_dev *fud = fuse_get_dev(file);
+ if (!fud)
+ return;
+
+ seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev);
+}
+#endif
+
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
.open = fuse_dev_open,
@@ -2496,6 +2654,9 @@ const struct file_operations fuse_dev_operations = {
#ifdef CONFIG_FUSE_IO_URING
.uring_cmd = fuse_uring_cmd,
#endif
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = fuse_dev_show_fdinfo,
+#endif
};
EXPORT_SYMBOL_GPL(fuse_dev_operations);
diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c
index ebd2931b4f2a..249b210becb1 100644
--- a/fs/fuse/dev_uring.c
+++ b/fs/fuse/dev_uring.c
@@ -140,6 +140,49 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring)
}
}
+static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list)
+{
+ struct fuse_ring_ent *ent;
+ struct fuse_req *req;
+
+ ent = list_first_entry_or_null(list, struct fuse_ring_ent, list);
+ if (!ent)
+ return false;
+
+ req = ent->fuse_req;
+
+ return time_is_before_jiffies(req->create_time +
+ fc->timeout.req_timeout);
+}
+
+bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+ struct fuse_ring *ring = fc->ring;
+ struct fuse_ring_queue *queue;
+ int qid;
+
+ if (!ring)
+ return false;
+
+ for (qid = 0; qid < ring->nr_queues; qid++) {
+ queue = READ_ONCE(ring->queues[qid]);
+ if (!queue)
+ continue;
+
+ spin_lock(&queue->lock);
+ if (fuse_request_expired(fc, &queue->fuse_req_queue) ||
+ fuse_request_expired(fc, &queue->fuse_req_bg_queue) ||
+ ent_list_request_expired(fc, &queue->ent_w_req_queue) ||
+ ent_list_request_expired(fc, &queue->ent_in_userspace)) {
+ spin_unlock(&queue->lock);
+ return true;
+ }
+ spin_unlock(&queue->lock);
+ }
+
+ return false;
+}
+
void fuse_uring_destruct(struct fuse_conn *fc)
{
struct fuse_ring *ring = fc->ring;
@@ -208,11 +251,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc)
init_waitqueue_head(&ring->stop_waitq);
- fc->ring = ring;
ring->nr_queues = nr_queues;
ring->fc = fc;
ring->max_payload_sz = max_payload_size;
- atomic_set(&ring->queue_refs, 0);
+ smp_store_release(&fc->ring, ring);
spin_unlock(&fc->lock);
return ring;
@@ -468,7 +510,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd,
spin_lock(&queue->lock);
if (ent->state == FRRS_AVAILABLE) {
ent->state = FRRS_USERSPACE;
- list_move(&ent->list, &queue->ent_in_userspace);
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
need_cmd_done = true;
ent->cmd = NULL;
}
@@ -551,8 +593,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring,
if (err)
return err;
- fuse_copy_init(&cs, 0, &iter);
- cs.is_uring = 1;
+ fuse_copy_init(&cs, false, &iter);
+ cs.is_uring = true;
cs.req = req;
return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz);
@@ -581,8 +623,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req,
return err;
}
- fuse_copy_init(&cs, 1, &iter);
- cs.is_uring = 1;
+ fuse_copy_init(&cs, true, &iter);
+ cs.is_uring = true;
cs.req = req;
if (num_args > 0) {
@@ -688,7 +730,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
cmd = ent->cmd;
ent->cmd = NULL;
ent->state = FRRS_USERSPACE;
- list_move(&ent->list, &queue->ent_in_userspace);
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
spin_unlock(&queue->lock);
io_uring_cmd_done(cmd, 0, 0, issue_flags);
@@ -726,8 +768,6 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
struct fuse_req *req)
{
struct fuse_ring_queue *queue = ent->queue;
- struct fuse_conn *fc = req->fm->fc;
- struct fuse_iqueue *fiq = &fc->iq;
lockdep_assert_held(&queue->lock);
@@ -737,12 +777,10 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent,
ent->state);
}
- spin_lock(&fiq->lock);
clear_bit(FR_PENDING, &req->flags);
- spin_unlock(&fiq->lock);
ent->fuse_req = req;
ent->state = FRRS_FUSE_REQ;
- list_move(&ent->list, &queue->ent_w_req_queue);
+ list_move_tail(&ent->list, &queue->ent_w_req_queue);
fuse_uring_add_to_pq(ent, req);
}
@@ -1041,7 +1079,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd,
unsigned int issue_flags, struct fuse_conn *fc)
{
const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe);
- struct fuse_ring *ring = fc->ring;
+ struct fuse_ring *ring = smp_load_acquire(&fc->ring);
struct fuse_ring_queue *queue;
struct fuse_ring_ent *ent;
int err;
@@ -1158,7 +1196,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
spin_lock(&queue->lock);
ent->state = FRRS_USERSPACE;
- list_move(&ent->list, &queue->ent_in_userspace);
+ list_move_tail(&ent->list, &queue->ent_in_userspace);
ent->cmd = NULL;
spin_unlock(&queue->lock);
@@ -1238,6 +1276,8 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req)
if (unlikely(queue->stopped))
goto err_unlock;
+ set_bit(FR_URING, &req->flags);
+ req->ring_queue = queue;
ent = list_first_entry_or_null(&queue->ent_avail_queue,
struct fuse_ring_ent, list);
if (ent)
@@ -1276,6 +1316,8 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
return false;
}
+ set_bit(FR_URING, &req->flags);
+ req->ring_queue = queue;
list_add_tail(&req->list, &queue->fuse_req_bg_queue);
ent = list_first_entry_or_null(&queue->ent_avail_queue,
@@ -1306,6 +1348,13 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req)
return true;
}
+bool fuse_uring_remove_pending_req(struct fuse_req *req)
+{
+ struct fuse_ring_queue *queue = req->ring_queue;
+
+ return fuse_remove_pending_req(req, &queue->lock);
+}
+
static const struct fuse_iqueue_ops fuse_io_uring_ops = {
/* should be send over io-uring as enhancement */
.send_forget = fuse_dev_queue_forget,
diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h
index 2102b3d0c1ae..51a563922ce1 100644
--- a/fs/fuse/dev_uring_i.h
+++ b/fs/fuse/dev_uring_i.h
@@ -142,6 +142,8 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring);
int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req);
bool fuse_uring_queue_bq_req(struct fuse_req *req);
+bool fuse_uring_remove_pending_req(struct fuse_req *req);
+bool fuse_uring_request_expired(struct fuse_conn *fc);
static inline void fuse_uring_abort(struct fuse_conn *fc)
{
@@ -172,12 +174,6 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc)
#else /* CONFIG_FUSE_IO_URING */
-struct fuse_ring;
-
-static inline void fuse_uring_create(struct fuse_conn *fc)
-{
-}
-
static inline void fuse_uring_destruct(struct fuse_conn *fc)
{
}
@@ -200,6 +196,16 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc)
return false;
}
+static inline bool fuse_uring_remove_pending_req(struct fuse_req *req)
+{
+ return false;
+}
+
+static inline bool fuse_uring_request_expired(struct fuse_conn *fc)
+{
+ return false;
+}
+
#endif /* CONFIG_FUSE_IO_URING */
#endif /* _FS_FUSE_DEV_URING_I_H */
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 198862b086ff..2d817d7cab26 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name,
{
struct inode *inode;
struct fuse_mount *fm;
+ struct fuse_conn *fc;
struct fuse_inode *fi;
int ret;
+ fc = get_fuse_conn_super(dir->i_sb);
+ if (entry->d_time < atomic_read(&fc->epoch))
+ goto invalid;
+
inode = d_inode_rcu(entry);
if (inode && fuse_is_bad(inode))
goto invalid;
@@ -319,9 +324,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
/* Create the submount */
mnt = fc_mount(fsc);
- if (!IS_ERR(mnt))
- mntget(mnt);
-
put_fs_context(fsc);
return mnt;
}
@@ -336,13 +338,6 @@ const struct dentry_operations fuse_dentry_operations = {
.d_automount = fuse_dentry_automount,
};
-const struct dentry_operations fuse_root_dentry_operations = {
-#if BITS_PER_LONG < 64
- .d_init = fuse_dentry_init,
- .d_release = fuse_dentry_release,
-#endif
-};
-
int fuse_valid_type(int m)
{
return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
@@ -370,7 +365,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
*inode = NULL;
err = -ENAMETOOLONG;
- if (name->len > FUSE_NAME_MAX)
+ if (name->len > fm->fc->name_max)
goto out;
@@ -415,16 +410,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
unsigned int flags)
{
- int err;
struct fuse_entry_out outarg;
+ struct fuse_conn *fc;
struct inode *inode;
struct dentry *newent;
+ int err, epoch;
bool outarg_valid = true;
bool locked;
if (fuse_is_bad(dir))
return ERR_PTR(-EIO);
+ fc = get_fuse_conn_super(dir->i_sb);
+ epoch = atomic_read(&fc->epoch);
+
locked = fuse_lock_inode(dir);
err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
&outarg, &inode);
@@ -446,6 +445,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
goto out_err;
entry = newent ? newent : entry;
+ entry->d_time = epoch;
if (outarg_valid)
fuse_change_entry_timeout(entry, &outarg);
else
@@ -619,7 +619,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *entry, struct file *file,
unsigned int flags, umode_t mode, u32 opcode)
{
- int err;
struct inode *inode;
struct fuse_mount *fm = get_fuse_mount(dir);
FUSE_ARGS(args);
@@ -629,11 +628,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
struct fuse_entry_out outentry;
struct fuse_inode *fi;
struct fuse_file *ff;
+ int epoch, err;
bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
+ epoch = atomic_read(&fm->fc->epoch);
forget = fuse_alloc_forget();
err = -ENOMEM;
if (!forget)
@@ -702,6 +703,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir,
}
kfree(forget);
d_instantiate(entry, inode);
+ entry->d_time = epoch;
fuse_change_entry_timeout(entry, &outentry);
fuse_dir_changed(dir);
err = generic_file_open(inode, file);
@@ -781,22 +783,24 @@ no_open:
/*
* Code shared between mknod, mkdir, symlink and link
*/
-static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
- struct fuse_args *args, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_entry_out outarg;
struct inode *inode;
struct dentry *d;
- int err;
struct fuse_forget_link *forget;
+ int epoch, err;
if (fuse_is_bad(dir))
- return -EIO;
+ return ERR_PTR(-EIO);
+
+ epoch = atomic_read(&fm->fc->epoch);
forget = fuse_alloc_forget();
if (!forget)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
memset(&outarg, 0, sizeof(outarg));
args->nodeid = get_node_id(dir);
@@ -826,29 +830,46 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm,
&outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0);
if (!inode) {
fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
kfree(forget);
d_drop(entry);
d = d_splice_alias(inode, entry);
if (IS_ERR(d))
- return PTR_ERR(d);
+ return d;
if (d) {
+ d->d_time = epoch;
fuse_change_entry_timeout(d, &outarg);
- dput(d);
} else {
+ entry->d_time = epoch;
fuse_change_entry_timeout(entry, &outarg);
}
fuse_dir_changed(dir);
- return 0;
+ return d;
out_put_forget_req:
if (err == -EEXIST)
fuse_invalidate_entry(entry);
kfree(forget);
- return err;
+ return ERR_PTR(err);
+}
+
+static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm,
+ struct fuse_args *args, struct inode *dir,
+ struct dentry *entry, umode_t mode)
+{
+ /*
+ * Note that when creating anything other than a directory we
+ * can be sure create_new_entry() will NOT return an alternate
+ * dentry as d_splice_alias() only returns an alternate dentry
+ * for directories. So we don't need to check for that case
+ * when passing back the result.
+ */
+ WARN_ON_ONCE(S_ISDIR(mode));
+
+ return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode));
}
static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -871,7 +892,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[0].value = &inarg;
args.in_args[1].size = entry->d_name.len + 1;
args.in_args[1].value = entry->d_name.name;
- return create_new_entry(idmap, fm, &args, dir, entry, mode);
+ return create_new_nondir(idmap, fm, &args, dir, entry, mode);
}
static int fuse_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -898,8 +919,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
return err;
}
-static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *entry, umode_t mode)
+static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *entry, umode_t mode)
{
struct fuse_mkdir_in inarg;
struct fuse_mount *fm = get_fuse_mount(dir);
@@ -934,7 +955,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir,
args.in_args[1].value = entry->d_name.name;
args.in_args[2].size = len;
args.in_args[2].value = link;
- return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK);
+ return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK);
}
void fuse_flush_time_update(struct inode *inode)
@@ -1123,6 +1144,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
struct fuse_mount *fm = get_fuse_mount(inode);
FUSE_ARGS(args);
+ if (fm->fc->no_link)
+ goto out;
+
memset(&inarg, 0, sizeof(inarg));
inarg.oldnodeid = get_node_id(inode);
args.opcode = FUSE_LINK;
@@ -1131,12 +1155,18 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
args.in_args[0].value = &inarg;
args.in_args[1].size = newent->d_name.len + 1;
args.in_args[1].value = newent->d_name.name;
- err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
+ err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode);
if (!err)
fuse_update_ctime_in_cache(inode);
else if (err == -EINTR)
fuse_invalidate_attr(inode);
+ if (err == -ENOSYS)
+ fm->fc->no_link = 1;
+out:
+ if (fm->fc->no_link)
+ return -EPERM;
+
return err;
}
@@ -1589,10 +1619,10 @@ static int fuse_permission(struct mnt_idmap *idmap,
return err;
}
-static int fuse_readlink_page(struct inode *inode, struct folio *folio)
+static int fuse_readlink_folio(struct inode *inode, struct folio *folio)
{
struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 };
+ struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 };
struct fuse_args_pages ap = {
.num_folios = 1,
.folios = &folio,
@@ -1636,7 +1666,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
goto out_err;
if (fc->cache_symlinks)
- return page_get_link(dentry, inode, callback);
+ return page_get_link_raw(dentry, inode, callback);
err = -ECHILD;
if (!dentry)
@@ -1647,13 +1677,13 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
if (!folio)
goto out_err;
- err = fuse_readlink_page(inode, folio);
+ err = fuse_readlink_folio(inode, folio);
if (err) {
folio_put(folio);
goto out_err;
}
- set_delayed_call(callback, page_put_link, &folio->page);
+ set_delayed_call(callback, page_put_link, folio);
return folio_address(folio);
@@ -1923,6 +1953,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
int err;
bool trust_local_cmtime = is_wb;
bool fault_blocked = false;
+ u64 attr_version;
if (!fc->default_permissions)
attr->ia_valid |= ATTR_FORCE;
@@ -1940,7 +1971,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (FUSE_IS_DAX(inode) && is_truncate) {
filemap_invalidate_lock(mapping);
fault_blocked = true;
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err) {
filemap_invalidate_unlock(mapping);
return err;
@@ -2007,6 +2038,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (fc->handle_killpriv_v2 && !capable(CAP_FSETID))
inarg.valid |= FATTR_KILL_SUIDGID;
}
+
+ attr_version = fuse_get_attr_version(fm->fc);
fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
err = fuse_simple_request(fm, &args);
if (err) {
@@ -2032,6 +2065,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
/* FIXME: clear I_DIRTY_SYNC? */
}
+ if (fi->attr_version > attr_version) {
+ /*
+ * Apply attributes, for example for fsnotify_change(), but set
+ * attribute timeout to zero.
+ */
+ outarg.attr_valid = outarg.attr_valid_nsec = 0;
+ }
+
fuse_change_attributes_common(inode, &outarg.attr, NULL,
ATTR_TIMEOUT(&outarg),
fuse_get_cache_mask(inode), 0);
@@ -2237,7 +2278,7 @@ void fuse_init_dir(struct inode *inode)
static int fuse_symlink_read_folio(struct file *null, struct folio *folio)
{
- int err = fuse_readlink_page(folio->mapping->host, folio);
+ int err = fuse_readlink_folio(folio->mapping->host, folio);
if (!err)
folio_mark_uptodate(folio);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7d92a5479998..5525a4520b0f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -21,6 +21,7 @@
#include <linux/filelock.h>
#include <linux/splice.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/iomap.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -253,7 +254,7 @@ static int fuse_open(struct inode *inode, struct file *file)
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err)
goto out_inode_unlock;
}
@@ -415,89 +416,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
struct fuse_writepage_args {
struct fuse_io_args ia;
- struct rb_node writepages_entry;
struct list_head queue_entry;
- struct fuse_writepage_args *next;
struct inode *inode;
struct fuse_sync_bucket *bucket;
};
-static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
- pgoff_t idx_from, pgoff_t idx_to)
-{
- struct rb_node *n;
-
- n = fi->writepages.rb_node;
-
- while (n) {
- struct fuse_writepage_args *wpa;
- pgoff_t curr_index;
-
- wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
- WARN_ON(get_fuse_inode(wpa->inode) != fi);
- curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
- if (idx_from >= curr_index + wpa->ia.ap.num_folios)
- n = n->rb_right;
- else if (idx_to < curr_index)
- n = n->rb_left;
- else
- return wpa;
- }
- return NULL;
-}
-
-/*
- * Check if any page in a range is under writeback
- */
-static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
- pgoff_t idx_to)
-{
- struct fuse_inode *fi = get_fuse_inode(inode);
- bool found;
-
- if (RB_EMPTY_ROOT(&fi->writepages))
- return false;
-
- spin_lock(&fi->lock);
- found = fuse_find_writeback(fi, idx_from, idx_to);
- spin_unlock(&fi->lock);
-
- return found;
-}
-
-static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
-{
- return fuse_range_is_writeback(inode, index, index);
-}
-
-/*
- * Wait for page writeback to be completed.
- *
- * Since fuse doesn't rely on the VM writeback tracking, this has to
- * use some other means.
- */
-static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
-{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
-}
-
-static inline bool fuse_folio_is_writeback(struct inode *inode,
- struct folio *folio)
-{
- pgoff_t last = folio_next_index(folio) - 1;
- return fuse_range_is_writeback(inode, folio_index(folio), last);
-}
-
-static void fuse_wait_on_folio_writeback(struct inode *inode,
- struct folio *folio)
-{
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio));
-}
-
/*
* Wait for all pending writepages on the inode to finish.
*
@@ -532,10 +455,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (err)
return err;
- inode_lock(inode);
- fuse_sync_writes(inode);
- inode_unlock(inode);
-
err = filemap_check_errors(file->f_mapping);
if (err)
return err;
@@ -870,12 +789,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
}
}
-static int fuse_do_readfolio(struct file *file, struct folio *folio)
+static int fuse_do_readfolio(struct file *file, struct folio *folio,
+ size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
- loff_t pos = folio_pos(folio);
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ loff_t pos = folio_pos(folio) + off;
+ struct fuse_folio_desc desc = {
+ .offset = off,
+ .length = len,
+ };
struct fuse_io_args ia = {
.ap.args.page_zeroing = true,
.ap.args.out_pages = true,
@@ -886,13 +809,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
ssize_t res;
u64 attr_ver;
- /*
- * With the temporary pages that are used to complete writeback, we can
- * have writeback that extends beyond the lifetime of the folio. So
- * make sure we read a properly synced folio.
- */
- fuse_wait_on_folio_writeback(inode, folio);
-
attr_ver = fuse_get_attr_version(fm->fc);
/* Don't overflow end offset */
@@ -909,8 +825,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
- folio_mark_uptodate(folio);
-
return 0;
}
@@ -923,13 +837,26 @@ static int fuse_read_folio(struct file *file, struct folio *folio)
if (fuse_is_bad(inode))
goto out;
- err = fuse_do_readfolio(file, folio);
+ err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
+ if (!err)
+ folio_mark_uptodate(folio);
+
fuse_invalidate_atime(inode);
out:
folio_unlock(folio);
return err;
}
+static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos,
+ size_t len)
+{
+ struct file *file = iter->private;
+ size_t off = offset_in_folio(folio, pos);
+
+ return fuse_do_readfolio(file, folio, off, len);
+}
+
static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
@@ -955,22 +882,23 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
fuse_invalidate_atime(inode);
}
- for (i = 0; i < ap->num_folios; i++)
+ for (i = 0; i < ap->num_folios; i++) {
folio_end_read(ap->folios[i], !err);
+ folio_put(ap->folios[i]);
+ }
if (ia->ff)
fuse_file_put(ia->ff, false);
fuse_io_free(ia);
}
-static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
+static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
+ unsigned int count)
{
struct fuse_file *ff = file->private_data;
struct fuse_mount *fm = ff->fm;
struct fuse_args_pages *ap = &ia->ap;
loff_t pos = folio_pos(ap->folios[0]);
- /* Currently, all folios in FUSE are one page */
- size_t count = ap->num_folios << PAGE_SHIFT;
ssize_t res;
int err;
@@ -1003,17 +931,13 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
static void fuse_readahead(struct readahead_control *rac)
{
struct inode *inode = rac->mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
unsigned int max_pages, nr_pages;
- pgoff_t first = readahead_index(rac);
- pgoff_t last = first + readahead_count(rac) - 1;
+ struct folio *folio = NULL;
if (fuse_is_bad(inode))
return;
- wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last));
-
max_pages = min_t(unsigned int, fc->max_pages,
fc->max_read / PAGE_SIZE);
@@ -1031,8 +955,8 @@ static void fuse_readahead(struct readahead_control *rac)
while (nr_pages) {
struct fuse_io_args *ia;
struct fuse_args_pages *ap;
- struct folio *folio;
unsigned cur_pages = min(max_pages, nr_pages);
+ unsigned int pages = 0;
if (fc->num_background >= fc->congestion_threshold &&
rac->ra->async_size >= readahead_count(rac))
@@ -1044,17 +968,44 @@ static void fuse_readahead(struct readahead_control *rac)
ia = fuse_io_alloc(NULL, cur_pages);
if (!ia)
- return;
+ break;
ap = &ia->ap;
- while (ap->num_folios < cur_pages) {
- folio = readahead_folio(rac);
+ while (pages < cur_pages) {
+ unsigned int folio_pages;
+
+ /*
+ * This returns a folio with a ref held on it.
+ * The ref needs to be held until the request is
+ * completed, since the splice case (see
+ * fuse_try_move_page()) drops the ref after it's
+ * replaced in the page cache.
+ */
+ if (!folio)
+ folio = __readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
+ if (folio_pages > cur_pages - pages) {
+ /*
+ * Large folios belonging to fuse will never
+ * have more pages than max_pages.
+ */
+ WARN_ON(!pages);
+ break;
+ }
+
ap->folios[ap->num_folios] = folio;
ap->descs[ap->num_folios].length = folio_size(folio);
ap->num_folios++;
+ pages += folio_pages;
+ folio = NULL;
}
- fuse_send_readpages(ia, rac->file);
- nr_pages -= cur_pages;
+ fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
+ nr_pages -= pages;
+ }
+ if (folio) {
+ folio_end_read(folio, false);
+ folio_put(folio);
}
}
@@ -1172,7 +1123,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
int err;
for (i = 0; i < ap->num_folios; i++)
- fuse_wait_on_folio_writeback(inode, ap->folios[i]);
+ folio_wait_writeback(ap->folios[i]);
fuse_write_args_fill(ia, ff, pos, count);
ia->write.in.flags = fuse_write_flags(iocb);
@@ -1212,32 +1163,28 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
struct address_space *mapping,
struct iov_iter *ii, loff_t pos,
- unsigned int max_pages)
+ unsigned int max_folios)
{
struct fuse_args_pages *ap = &ia->ap;
struct fuse_conn *fc = get_fuse_conn(mapping->host);
unsigned offset = pos & (PAGE_SIZE - 1);
- unsigned int nr_pages = 0;
size_t count = 0;
- int err;
+ unsigned int num;
+ int err = 0;
+
+ num = min(iov_iter_count(ii), fc->max_write);
ap->args.in_pages = true;
ap->descs[0].offset = offset;
- do {
+ while (num && ap->num_folios < max_folios) {
size_t tmp;
struct folio *folio;
pgoff_t index = pos >> PAGE_SHIFT;
- size_t bytes = min_t(size_t, PAGE_SIZE - offset,
- iov_iter_count(ii));
-
- bytes = min_t(size_t, bytes, fc->max_write - count);
+ unsigned int bytes;
+ unsigned int folio_offset;
again:
- err = -EFAULT;
- if (fault_in_iov_iter_readable(ii, bytes))
- break;
-
folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
mapping_gfp_mask(mapping));
if (IS_ERR(folio)) {
@@ -1248,29 +1195,42 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
- tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii);
+ folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset;
+ bytes = min(folio_size(folio) - folio_offset, num);
+
+ tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii);
flush_dcache_folio(folio);
if (!tmp) {
folio_unlock(folio);
folio_put(folio);
+
+ /*
+ * Ensure forward progress by faulting in
+ * while not holding the folio lock:
+ */
+ if (fault_in_iov_iter_readable(ii, bytes)) {
+ err = -EFAULT;
+ break;
+ }
+
goto again;
}
- err = 0;
ap->folios[ap->num_folios] = folio;
+ ap->descs[ap->num_folios].offset = folio_offset;
ap->descs[ap->num_folios].length = tmp;
ap->num_folios++;
- nr_pages++;
count += tmp;
pos += tmp;
+ num -= tmp;
offset += tmp;
- if (offset == PAGE_SIZE)
+ if (offset == folio_size(folio))
offset = 0;
- /* If we copied full page, mark it uptodate */
- if (tmp == PAGE_SIZE)
+ /* If we copied full folio, mark it uptodate */
+ if (tmp == folio_size(folio))
folio_mark_uptodate(folio);
if (folio_test_uptodate(folio)) {
@@ -1279,10 +1239,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
ia->write.folio_locked = true;
break;
}
- if (!fc->big_writes)
+ if (!fc->big_writes || offset != 0)
break;
- } while (iov_iter_count(ii) && count < fc->max_write &&
- nr_pages < max_pages && offset == 0);
+ }
return count > 0 ? count : err;
}
@@ -1431,6 +1390,24 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
}
}
+static const struct iomap_write_ops fuse_iomap_write_ops = {
+ .read_folio_range = fuse_iomap_read_folio_range,
+};
+
+static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_iomap_ops = {
+ .iomap_begin = fuse_iomap_begin,
+};
+
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1440,6 +1417,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = mapping->host;
ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
+ bool writeback = false;
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1448,16 +1426,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
return err;
- if (fc->handle_killpriv_v2 &&
- setattr_should_drop_suidgid(idmap,
- file_inode(file))) {
- goto writethrough;
- }
-
- return generic_file_write_iter(iocb, from);
+ if (!fc->handle_killpriv_v2 ||
+ !setattr_should_drop_suidgid(idmap, file_inode(file)))
+ writeback = true;
}
-writethrough:
inode_lock(inode);
err = count = generic_write_checks(iocb, from);
@@ -1476,6 +1449,15 @@ writethrough:
goto out;
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
+ } else if (writeback) {
+ /*
+ * Use iomap so that we can do granular uptodate reads
+ * and granular dirty tracking for large folios.
+ */
+ written = iomap_file_buffered_write(iocb, from,
+ &fuse_iomap_ops,
+ &fuse_iomap_write_ops,
+ file);
} else {
written = fuse_perform_write(iocb, from);
}
@@ -1629,7 +1611,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
return res;
}
}
- if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
+ if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) {
if (!write)
inode_lock(inode);
fuse_sync_writes(inode);
@@ -1826,38 +1808,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
- int i;
if (wpa->bucket)
fuse_sync_bucket_dec(wpa->bucket);
- for (i = 0; i < ap->num_folios; i++)
- folio_put(ap->folios[i]);
-
fuse_file_put(wpa->ia.ff, false);
kfree(ap->folios);
kfree(wpa);
}
-static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio)
-{
- struct backing_dev_info *bdi = inode_to_bdi(inode);
-
- dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- node_stat_sub_folio(folio, NR_WRITEBACK_TEMP);
- wb_writeout_inc(&bdi->wb);
-}
-
static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
{
struct fuse_args_pages *ap = &wpa->ia.ap;
struct inode *inode = wpa->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
- for (i = 0; i < ap->num_folios; i++)
- fuse_writepage_finish_stat(inode, ap->folios[i]);
+ for (i = 0; i < ap->num_folios; i++) {
+ /*
+ * Benchmarks showed that ending writeback within the
+ * scope of the fi->lock alleviates xarray lock
+ * contention and noticeably improves performance.
+ */
+ iomap_finish_folio_write(inode, ap->folios[i], 1);
+ dec_wb_stat(&bdi->wb, WB_WRITEBACK);
+ wb_writeout_inc(&bdi->wb);
+ }
wake_up(&fi->page_waitq);
}
@@ -1868,13 +1846,15 @@ static void fuse_send_writepage(struct fuse_mount *fm,
__releases(fi->lock)
__acquires(fi->lock)
{
- struct fuse_writepage_args *aux, *next;
struct fuse_inode *fi = get_fuse_inode(wpa->inode);
+ struct fuse_args_pages *ap = &wpa->ia.ap;
struct fuse_write_in *inarg = &wpa->ia.write.in;
- struct fuse_args *args = &wpa->ia.ap.args;
- /* Currently, all folios in FUSE are one page */
- __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE;
- int err;
+ struct fuse_args *args = &ap->args;
+ __u64 data_size = 0;
+ int err, i;
+
+ for (i = 0; i < ap->num_folios; i++)
+ data_size += ap->descs[i].length;
fi->writectr++;
if (inarg->offset + data_size <= size) {
@@ -1905,19 +1885,8 @@ __acquires(fi->lock)
out_free:
fi->writectr--;
- rb_erase(&wpa->writepages_entry, &fi->writepages);
fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
-
- /* After rb_erase() aux request list is private */
- for (aux = wpa->next; aux; aux = next) {
- next = aux->next;
- aux->next = NULL;
- fuse_writepage_finish_stat(aux->inode,
- aux->ia.ap.folios[0]);
- fuse_writepage_free(aux);
- }
-
fuse_writepage_free(wpa);
spin_lock(&fi->lock);
}
@@ -1945,43 +1914,6 @@ __acquires(fi->lock)
}
}
-static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root,
- struct fuse_writepage_args *wpa)
-{
- pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
- pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1;
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
-
- WARN_ON(!wpa->ia.ap.num_folios);
- while (*p) {
- struct fuse_writepage_args *curr;
- pgoff_t curr_index;
-
- parent = *p;
- curr = rb_entry(parent, struct fuse_writepage_args,
- writepages_entry);
- WARN_ON(curr->inode != wpa->inode);
- curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
-
- if (idx_from >= curr_index + curr->ia.ap.num_folios)
- p = &(*p)->rb_right;
- else if (idx_to < curr_index)
- p = &(*p)->rb_left;
- else
- return curr;
- }
-
- rb_link_node(&wpa->writepages_entry, parent, p);
- rb_insert_color(&wpa->writepages_entry, root);
- return NULL;
-}
-
-static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
-{
- WARN_ON(fuse_insert_writeback(root, wpa));
-}
-
static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
int error)
{
@@ -2001,41 +1933,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args,
if (!fc->writeback_cache)
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY);
spin_lock(&fi->lock);
- rb_erase(&wpa->writepages_entry, &fi->writepages);
- while (wpa->next) {
- struct fuse_mount *fm = get_fuse_mount(inode);
- struct fuse_write_in *inarg = &wpa->ia.write.in;
- struct fuse_writepage_args *next = wpa->next;
-
- wpa->next = next->next;
- next->next = NULL;
- tree_insert(&fi->writepages, next);
-
- /*
- * Skip fuse_flush_writepages() to make it easy to crop requests
- * based on primary request size.
- *
- * 1st case (trivial): there are no concurrent activities using
- * fuse_set/release_nowrite. Then we're on safe side because
- * fuse_flush_writepages() would call fuse_send_writepage()
- * anyway.
- *
- * 2nd case: someone called fuse_set_nowrite and it is waiting
- * now for completion of all in-flight requests. This happens
- * rarely and no more than once per page, so this should be
- * okay.
- *
- * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
- * of fuse_set_nowrite..fuse_release_nowrite section. The fact
- * that fuse_set_nowrite returned implies that all in-flight
- * requests were completed along with all of their secondary
- * requests. Further primary requests are blocked by negative
- * writectr. Hence there cannot be any in-flight requests and
- * no invocations of fuse_writepage_end() while we're in
- * fuse_set_nowrite..fuse_release_nowrite section.
- */
- fuse_send_writepage(fm, next, inarg->offset + inarg->size);
- }
fi->writectr--;
fuse_writepage_finish(wpa);
spin_unlock(&fi->lock);
@@ -2069,17 +1966,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
struct fuse_file *ff;
int err;
- /*
- * Inode is always written before the last reference is dropped and
- * hence this should not be reached from reclaim.
- *
- * Writing back the inode from reclaim can deadlock if the request
- * processing itself needs an allocation. Allocations triggering
- * reclaim while serving a request can't be prevented, because it can
- * involve any number of unrelated userspace processes.
- */
- WARN_ON(wbc->for_reclaim);
-
ff = __fuse_write_file_get(fi);
err = fuse_flush_times(inode, ff);
if (ff)
@@ -2122,22 +2008,20 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
}
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
- struct folio *tmp_folio, uint32_t folio_index)
+ uint32_t folio_index, loff_t offset, unsigned len)
{
struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;
- folio_copy(tmp_folio, folio);
-
- ap->folios[folio_index] = tmp_folio;
- ap->descs[folio_index].offset = 0;
- ap->descs[folio_index].length = PAGE_SIZE;
+ ap->folios[folio_index] = folio;
+ ap->descs[folio_index].offset = offset;
+ ap->descs[folio_index].length = len;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
}
static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
+ size_t offset,
struct fuse_file *ff)
{
struct inode *inode = folio->mapping->host;
@@ -2150,7 +2034,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return NULL;
fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
+ fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->inode = inode;
wpa->ia.ff = ff;
@@ -2162,74 +2046,28 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return wpa;
}
-static int fuse_writepage_locked(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *inode = mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_writepage_args *wpa;
- struct fuse_args_pages *ap;
- struct folio *tmp_folio;
- struct fuse_file *ff;
- int error = -ENOMEM;
-
- tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
- if (!tmp_folio)
- goto err;
-
- error = -EIO;
- ff = fuse_write_file_get(fi);
- if (!ff)
- goto err_nofile;
-
- wpa = fuse_writepage_args_setup(folio, ff);
- error = -ENOMEM;
- if (!wpa)
- goto err_writepage_args;
-
- ap = &wpa->ia.ap;
- ap->num_folios = 1;
-
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0);
-
- spin_lock(&fi->lock);
- tree_insert(&fi->writepages, wpa);
- list_add_tail(&wpa->queue_entry, &fi->queued_writes);
- fuse_flush_writepages(inode);
- spin_unlock(&fi->lock);
-
- folio_end_writeback(folio);
-
- return 0;
-
-err_writepage_args:
- fuse_file_put(ff, false);
-err_nofile:
- folio_put(tmp_folio);
-err:
- mapping_set_error(folio->mapping, error);
- return error;
-}
-
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
- struct inode *inode;
- struct folio **orig_folios;
unsigned int max_folios;
+ /*
+ * nr_bytes won't overflow since fuse_writepage_need_send() caps
+ * wb requests to never exceed fc->max_pages (which has an upper bound
+ * of U16_MAX).
+ */
+ unsigned int nr_bytes;
};
-static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
+static bool fuse_pages_realloc(struct fuse_fill_wb_data *data,
+ unsigned int max_pages)
{
struct fuse_args_pages *ap = &data->wpa->ia.ap;
- struct fuse_conn *fc = get_fuse_conn(data->inode);
struct folio **folios;
struct fuse_folio_desc *descs;
unsigned int nfolios = min_t(unsigned int,
max_t(unsigned int, data->max_folios * 2,
FUSE_DEFAULT_MAX_PAGES_PER_REQ),
- fc->max_pages);
+ max_pages);
WARN_ON(nfolios <= data->max_folios);
folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs);
@@ -2246,319 +2084,162 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
return true;
}
-static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+static void fuse_writepages_send(struct inode *inode,
+ struct fuse_fill_wb_data *data)
{
struct fuse_writepage_args *wpa = data->wpa;
- struct inode *inode = data->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- int num_folios = wpa->ia.ap.num_folios;
- int i;
spin_lock(&fi->lock);
list_add_tail(&wpa->queue_entry, &fi->queued_writes);
fuse_flush_writepages(inode);
spin_unlock(&fi->lock);
-
- for (i = 0; i < num_folios; i++)
- folio_end_writeback(data->orig_folios[i]);
}
-/*
- * Check under fi->lock if the page is under writeback, and insert it onto the
- * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's
- * one already added for a page at this offset. If there's none, then insert
- * this new request onto the auxiliary list, otherwise reuse the existing one by
- * swapping the new temp page with the old one.
- */
-static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa,
- struct folio *folio)
-{
- struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
- struct fuse_writepage_args *tmp;
- struct fuse_writepage_args *old_wpa;
- struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
-
- WARN_ON(new_ap->num_folios != 0);
- new_ap->num_folios = 1;
-
- spin_lock(&fi->lock);
- old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa);
- if (!old_wpa) {
- spin_unlock(&fi->lock);
- return true;
- }
-
- for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
- pgoff_t curr_index;
-
- WARN_ON(tmp->inode != new_wpa->inode);
- curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
- if (curr_index == folio->index) {
- WARN_ON(tmp->ia.ap.num_folios != 1);
- swap(tmp->ia.ap.folios[0], new_ap->folios[0]);
- break;
- }
- }
-
- if (!tmp) {
- new_wpa->next = old_wpa->next;
- old_wpa->next = new_wpa;
- }
-
- spin_unlock(&fi->lock);
-
- if (tmp) {
- fuse_writepage_finish_stat(new_wpa->inode,
- folio);
- fuse_writepage_free(new_wpa);
- }
-
- return false;
-}
-
-static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
- struct fuse_args_pages *ap,
+static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
+ unsigned len, struct fuse_args_pages *ap,
struct fuse_fill_wb_data *data)
{
- WARN_ON(!ap->num_folios);
+ struct folio *prev_folio;
+ struct fuse_folio_desc prev_desc;
+ unsigned bytes = data->nr_bytes + len;
+ loff_t prev_pos;
- /*
- * Being under writeback is unlikely but possible. For example direct
- * read to an mmaped fuse file will set the page dirty twice; once when
- * the pages are faulted with get_user_pages(), and then after the read
- * completed.
- */
- if (fuse_folio_is_writeback(data->inode, folio))
- return true;
+ WARN_ON(!ap->num_folios);
/* Reached max pages */
- if (ap->num_folios == fc->max_pages)
+ if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
return true;
/* Reached max write bytes */
- if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write)
+ if (bytes > fc->max_write)
return true;
/* Discontinuity */
- if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio))
+ prev_folio = ap->folios[ap->num_folios - 1];
+ prev_desc = ap->descs[ap->num_folios - 1];
+ prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length;
+ if (prev_pos != pos)
return true;
/* Need to grow the pages array? If so, did the expansion fail? */
- if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data))
+ if (ap->num_folios == data->max_folios &&
+ !fuse_pages_realloc(data, fc->max_pages))
return true;
return false;
}
-static int fuse_writepages_fill(struct folio *folio,
- struct writeback_control *wbc, void *_data)
+static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos,
+ unsigned len, u64 end_pos)
{
- struct fuse_fill_wb_data *data = _data;
+ struct fuse_fill_wb_data *data = wpc->wb_ctx;
struct fuse_writepage_args *wpa = data->wpa;
struct fuse_args_pages *ap = &wpa->ia.ap;
- struct inode *inode = data->inode;
+ struct inode *inode = wpc->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- struct folio *tmp_folio;
- int err;
+ loff_t offset = offset_in_folio(folio, pos);
+
+ WARN_ON_ONCE(!data);
if (!data->ff) {
- err = -EIO;
data->ff = fuse_write_file_get(fi);
if (!data->ff)
- goto out_unlock;
+ return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
- fuse_writepages_send(data);
+ if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
+ fuse_writepages_send(inode, data);
data->wpa = NULL;
+ data->nr_bytes = 0;
}
- err = -ENOMEM;
- tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
- if (!tmp_folio)
- goto out_unlock;
-
- /*
- * The page must not be redirtied until the writeout is completed
- * (i.e. userspace has sent a reply to the write request). Otherwise
- * there could be more than one temporary page instance for each real
- * page.
- *
- * This is ensured by holding the page lock in page_mkwrite() while
- * checking fuse_page_is_writeback(). We already hold the page lock
- * since clear_page_dirty_for_io() and keep it held until we add the
- * request to the fi->writepages list and increment ap->num_folios.
- * After this fuse_page_is_writeback() will indicate that the page is
- * under writeback, so we can release the page lock.
- */
if (data->wpa == NULL) {
- err = -ENOMEM;
- wpa = fuse_writepage_args_setup(folio, data->ff);
- if (!wpa) {
- folio_put(tmp_folio);
- goto out_unlock;
- }
+ wpa = fuse_writepage_args_setup(folio, offset, data->ff);
+ if (!wpa)
+ return -ENOMEM;
fuse_file_get(wpa->ia.ff);
data->max_folios = 1;
ap = &wpa->ia.ap;
}
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios);
- data->orig_folios[ap->num_folios] = folio;
+ iomap_start_folio_write(inode, folio, 1);
+ fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
+ offset, len);
+ data->nr_bytes += len;
- err = 0;
- if (data->wpa) {
- /*
- * Protected by fi->lock against concurrent access by
- * fuse_page_is_writeback().
- */
- spin_lock(&fi->lock);
- ap->num_folios++;
- spin_unlock(&fi->lock);
- } else if (fuse_writepage_add(wpa, folio)) {
+ ap->num_folios++;
+ if (!data->wpa)
data->wpa = wpa;
- } else {
- folio_end_writeback(folio);
+
+ return len;
+}
+
+static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
+ int error)
+{
+ struct fuse_fill_wb_data *data = wpc->wb_ctx;
+
+ WARN_ON_ONCE(!data);
+
+ if (data->wpa) {
+ WARN_ON(!data->wpa->ia.ap.num_folios);
+ fuse_writepages_send(wpc->inode, data);
}
-out_unlock:
- folio_unlock(folio);
- return err;
+ if (data->ff)
+ fuse_file_put(data->ff, false);
+
+ return error;
}
+static const struct iomap_writeback_ops fuse_writeback_ops = {
+ .writeback_range = fuse_iomap_writeback_range,
+ .writeback_submit = fuse_iomap_writeback_submit,
+};
+
static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_fill_wb_data data;
- int err;
+ struct fuse_fill_wb_data data = {};
+ struct iomap_writepage_ctx wpc = {
+ .inode = inode,
+ .iomap.type = IOMAP_MAPPED,
+ .wbc = wbc,
+ .ops = &fuse_writeback_ops,
+ .wb_ctx = &data,
+ };
- err = -EIO;
if (fuse_is_bad(inode))
- goto out;
+ return -EIO;
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return 0;
- data.inode = inode;
- data.wpa = NULL;
- data.ff = NULL;
-
- err = -ENOMEM;
- data.orig_folios = kcalloc(fc->max_pages,
- sizeof(struct folio *),
- GFP_NOFS);
- if (!data.orig_folios)
- goto out;
-
- err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
- if (data.wpa) {
- WARN_ON(!data.wpa->ia.ap.num_folios);
- fuse_writepages_send(&data);
- }
- if (data.ff)
- fuse_file_put(data.ff, false);
-
- kfree(data.orig_folios);
-out:
- return err;
-}
-
-/*
- * It's worthy to make sure that space is reserved on disk for the write,
- * but how to implement it without killing performance need more thinking.
- */
-static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
-{
- pgoff_t index = pos >> PAGE_SHIFT;
- struct fuse_conn *fc = get_fuse_conn(file_inode(file));
- struct folio *folio;
- loff_t fsize;
- int err = -ENOMEM;
-
- WARN_ON(!fc->writeback_cache);
-
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto error;
-
- fuse_wait_on_page_writeback(mapping->host, folio->index);
-
- if (folio_test_uptodate(folio) || len >= folio_size(folio))
- goto success;
- /*
- * Check if the start of this folio comes after the end of file,
- * in which case the readpage can be optimized away.
- */
- fsize = i_size_read(mapping->host);
- if (fsize <= folio_pos(folio)) {
- size_t off = offset_in_folio(folio, pos);
- if (off)
- folio_zero_segment(folio, 0, off);
- goto success;
- }
- err = fuse_do_readfolio(file, folio);
- if (err)
- goto cleanup;
-success:
- *foliop = folio;
- return 0;
-
-cleanup:
- folio_unlock(folio);
- folio_put(folio);
-error:
- return err;
-}
-
-static int fuse_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct inode *inode = folio->mapping->host;
-
- /* Haven't copied anything? Skip zeroing, size extending, dirtying. */
- if (!copied)
- goto unlock;
-
- pos += copied;
- if (!folio_test_uptodate(folio)) {
- /* Zero any unwritten bytes at the end of the page */
- size_t endoff = pos & ~PAGE_MASK;
- if (endoff)
- folio_zero_segment(folio, endoff, PAGE_SIZE);
- folio_mark_uptodate(folio);
- }
-
- if (pos > inode->i_size)
- i_size_write(inode, pos);
-
- folio_mark_dirty(folio);
-
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
+ return iomap_writepages(&wpc);
}
static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
- if (folio_clear_dirty_for_io(folio)) {
- struct inode *inode = folio->mapping->host;
+ struct fuse_fill_wb_data data = {};
+ struct iomap_writepage_ctx wpc = {
+ .inode = folio->mapping->host,
+ .iomap.type = IOMAP_MAPPED,
+ .ops = &fuse_writeback_ops,
+ .wb_ctx = &data,
+ };
- /* Serialize with pending writeback for the same page */
- fuse_wait_on_page_writeback(inode, folio->index);
- err = fuse_writepage_locked(folio);
+ if (folio_clear_dirty_for_io(folio)) {
+ err = iomap_writeback_folio(&wpc, folio);
+ err = fuse_iomap_writeback_submit(&wpc, err);
if (!err)
- fuse_wait_on_page_writeback(inode, folio->index);
+ folio_wait_writeback(folio);
}
return err;
}
@@ -2602,7 +2283,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
return VM_FAULT_NOPAGE;
}
- fuse_wait_on_folio_writeback(inode, folio);
+ folio_wait_writeback(folio);
return VM_FAULT_LOCKED;
}
@@ -3196,7 +2877,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
inode_lock(inode);
if (block_faults) {
filemap_invalidate_lock(inode->i_mapping);
- err = fuse_dax_break_layouts(inode, 0, 0);
+ err = fuse_dax_break_layouts(inode, 0, -1);
if (err)
goto out;
}
@@ -3409,20 +3090,24 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
.migrate_folio = filemap_migrate_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
- .write_begin = fuse_write_begin,
- .write_end = fuse_write_end,
};
void fuse_init_file_inode(struct inode *inode, unsigned int flags)
{
struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = get_fuse_conn(inode);
inode->i_fop = &fuse_file_operations;
inode->i_data.a_ops = &fuse_file_aops;
+ if (fc->writeback_cache)
+ mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data);
INIT_LIST_HEAD(&fi->write_files);
INIT_LIST_HEAD(&fi->queued_writes);
@@ -3430,7 +3115,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
fi->iocachectr = 0;
init_waitqueue_head(&fi->page_waitq);
init_waitqueue_head(&fi->direct_io_waitq);
- fi->writepages = RB_ROOT;
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_inode_init(inode, flags);
diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h
index 3b2bfe1248d3..5a9bd771a319 100644
--- a/fs/fuse/fuse_dev_i.h
+++ b/fs/fuse/fuse_dev_i.h
@@ -20,7 +20,6 @@ struct fuse_iqueue;
struct fuse_forget_link;
struct fuse_copy_state {
- int write;
struct fuse_req *req;
struct iov_iter *iter;
struct pipe_buffer *pipebufs;
@@ -30,8 +29,9 @@ struct fuse_copy_state {
struct page *pg;
unsigned int len;
unsigned int offset;
- unsigned int move_pages:1;
- unsigned int is_uring:1;
+ bool write:1;
+ bool move_folios:1;
+ bool is_uring:1;
struct {
unsigned int copied_sz; /* copied size into the user buffer */
} ring;
@@ -51,7 +51,7 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique);
void fuse_dev_end_requests(struct list_head *head);
-void fuse_copy_init(struct fuse_copy_state *cs, int write,
+void fuse_copy_init(struct fuse_copy_state *cs, bool write,
struct iov_iter *iter);
int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs,
unsigned int argpages, struct fuse_arg *args,
@@ -61,6 +61,9 @@ int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
void fuse_dev_queue_forget(struct fuse_iqueue *fiq,
struct fuse_forget_link *forget);
void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req);
+bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock);
+
+bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list);
#endif
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index fee96fe7887b..ec248d13c8bf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -38,14 +38,34 @@
/** Bias for fi->writectr, meaning new writepages must not be sent */
#define FUSE_NOWRITE INT_MIN
-/** It could be as large as PATH_MAX, but would that have any uses? */
-#define FUSE_NAME_MAX 1024
+/** Maximum length of a filename, not including terminating null */
+
+/* maximum, small enough for FUSE_MIN_READ_BUFFER*/
+#define FUSE_NAME_LOW_MAX 1024
+/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */
+#define FUSE_NAME_MAX (PATH_MAX - 1)
/** Number of dentries for each connection in the control filesystem */
#define FUSE_CTL_NUM_DENTRIES 5
+/* Frequency (in seconds) of request timeout checks, if opted into */
+#define FUSE_TIMEOUT_TIMER_FREQ 15
+
+/** Frequency (in jiffies) of request timeout checks, if opted into */
+extern const unsigned long fuse_timeout_timer_freq;
+
/** Maximum of max_pages received in init_out */
extern unsigned int fuse_max_pages_limit;
+/*
+ * Default timeout (in seconds) for the server to reply to a request
+ * before the connection is aborted, if no timeout was specified on mount.
+ */
+extern unsigned int fuse_default_req_timeout;
+/*
+ * Max timeout (in seconds) for the server to reply to a request before
+ * the connection is aborted.
+ */
+extern unsigned int fuse_max_req_timeout;
/** List of active connections */
extern struct list_head fuse_conn_list;
@@ -54,8 +74,8 @@ extern struct list_head fuse_conn_list;
extern struct mutex fuse_mutex;
/** Module parameters */
-extern unsigned max_user_bgreq;
-extern unsigned max_user_congthresh;
+extern unsigned int max_user_bgreq;
+extern unsigned int max_user_congthresh;
/* One forget request */
struct fuse_forget_link {
@@ -141,9 +161,6 @@ struct fuse_inode {
/* waitq for direct-io completion */
wait_queue_head_t direct_io_waitq;
-
- /* List of writepage requestst (pending or sent) */
- struct rb_root writepages;
};
/* readdir cache (directory only) */
@@ -378,6 +395,7 @@ struct fuse_io_priv {
* FR_FINISHED: request is finished
* FR_PRIVATE: request is on private list
* FR_ASYNC: request is asynchronous
+ * FR_URING: request is handled through fuse-io-uring
*/
enum fuse_req_flag {
FR_ISREPLY,
@@ -392,6 +410,7 @@ enum fuse_req_flag {
FR_FINISHED,
FR_PRIVATE,
FR_ASYNC,
+ FR_URING,
};
/**
@@ -441,7 +460,10 @@ struct fuse_req {
#ifdef CONFIG_FUSE_IO_URING
void *ring_entry;
+ void *ring_queue;
#endif
+ /** When (in jiffies) the request was created */
+ unsigned long create_time;
};
struct fuse_iqueue;
@@ -611,6 +633,9 @@ struct fuse_conn {
/** Number of fuse_dev's */
atomic_t dev_count;
+ /** Current epoch for up-to-date dentries */
+ atomic_t epoch;
+
struct rcu_head rcu;
/** The user id for this mount */
@@ -867,6 +892,9 @@ struct fuse_conn {
/* Use pages instead of pointer for kernel I/O */
unsigned int use_pages_for_kvec_io:1;
+ /* Is link not implemented by fs? */
+ unsigned int no_link:1;
+
/* Use io_uring for communication */
unsigned int io_uring;
@@ -885,12 +913,6 @@ struct fuse_conn {
/** Device ID from the root super block */
dev_t dev;
- /** Dentries in the control filesystem */
- struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
-
- /** number of dentries used in the above array */
- int ctl_ndents;
-
/** Key for lock owner ID scrambling */
u32 scramble_key[4];
@@ -900,6 +922,9 @@ struct fuse_conn {
/** Version counter for evict inode */
atomic64_t evict_ctr;
+ /* maximum file name length */
+ u32 name_max;
+
/** Called on final put */
void (*release)(struct fuse_conn *);
@@ -935,6 +960,15 @@ struct fuse_conn {
/** uring connection information*/
struct fuse_ring *ring;
#endif
+
+ /** Only used if the connection opts into request timeouts */
+ struct {
+ /* Worker for checking if any requests have timed out */
+ struct delayed_work work;
+
+ /* Request timeout (in jiffies). 0 = no timeout */
+ unsigned int req_timeout;
+ } timeout;
};
/*
@@ -1069,7 +1103,6 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket)
extern const struct file_operations fuse_dev_operations;
extern const struct dentry_operations fuse_dentry_operations;
-extern const struct dentry_operations fuse_root_dentry_operations;
/**
* Get a filled in inode
@@ -1216,6 +1249,9 @@ void fuse_request_end(struct fuse_req *req);
void fuse_abort_conn(struct fuse_conn *fc);
void fuse_wait_aborted(struct fuse_conn *fc);
+/* Check if any requests timed out */
+void fuse_check_timeout(struct work_struct *work);
+
/**
* Invalidate inode attributes
*/
@@ -1443,9 +1479,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc);
long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int fuse_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
/* iomode.c */
int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index e9db2cb8c150..67c2318bfc42 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -9,6 +9,7 @@
#include "fuse_i.h"
#include "dev_uring_i.h"
+#include <linux/dax.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/file.h>
@@ -37,8 +38,11 @@ DEFINE_MUTEX(fuse_mutex);
static int set_global_limit(const char *val, const struct kernel_param *kp);
unsigned int fuse_max_pages_limit = 256;
+/* default is no timeout */
+unsigned int fuse_default_req_timeout;
+unsigned int fuse_max_req_timeout;
-unsigned max_user_bgreq;
+unsigned int max_user_bgreq;
module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
&max_user_bgreq, 0644);
__MODULE_PARM_TYPE(max_user_bgreq, "uint");
@@ -46,7 +50,7 @@ MODULE_PARM_DESC(max_user_bgreq,
"Global limit for the maximum number of backgrounded requests an "
"unprivileged user can set");
-unsigned max_user_congthresh;
+unsigned int max_user_congthresh;
module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
&max_user_congthresh, 0644);
__MODULE_PARM_TYPE(max_user_congthresh, "uint");
@@ -159,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode)
/* Will write inode on close/munmap and in all other dirtiers */
WARN_ON(inode->i_state & I_DIRTY_INODE);
+ if (FUSE_IS_DAX(inode))
+ dax_break_layout_final(inode);
+
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & SB_ACTIVE) {
@@ -282,11 +289,6 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
}
}
- if (attr->blksize != 0)
- inode->i_blkbits = ilog2(attr->blksize);
- else
- inode->i_blkbits = inode->i_sb->s_blocksize_bits;
-
/*
* Don't set the sticky bit in i_mode, unless we want the VFS
* to check permissions. This prevents failures due to the
@@ -959,6 +961,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
init_rwsem(&fc->killsb);
refcount_set(&fc->count, 1);
atomic_set(&fc->dev_count, 1);
+ atomic_set(&fc->epoch, 1);
init_waitqueue_head(&fc->blocked_waitq);
fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
INIT_LIST_HEAD(&fc->bg_queue);
@@ -979,6 +982,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
fc->user_ns = get_user_ns(user_ns);
fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
fc->max_pages_limit = fuse_max_pages_limit;
+ fc->name_max = FUSE_NAME_LOW_MAX;
+ fc->timeout.req_timeout = 0;
if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
fuse_backing_files_init(fc);
@@ -1007,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc)
if (IS_ENABLED(CONFIG_FUSE_DAX))
fuse_dax_conn_free(fc);
+ if (fc->timeout.req_timeout)
+ cancel_delayed_work_sync(&fc->timeout.work);
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
@@ -1029,7 +1036,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
}
EXPORT_SYMBOL_GPL(fuse_conn_get);
-static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode)
{
struct fuse_attr attr;
memset(&attr, 0, sizeof(attr));
@@ -1204,7 +1211,7 @@ static const struct super_operations fuse_super_operations = {
.show_options = fuse_show_options,
};
-static void sanitize_global_limit(unsigned *limit)
+static void sanitize_global_limit(unsigned int *limit)
{
/*
* The default maximum number of async requests is calculated to consume
@@ -1225,7 +1232,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp)
if (rv)
return rv;
- sanitize_global_limit((unsigned *)kp->arg);
+ sanitize_global_limit((unsigned int *)kp->arg);
return 0;
}
@@ -1257,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
spin_unlock(&fc->bg_lock);
}
+static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout)
+{
+ fc->timeout.req_timeout = secs_to_jiffies(timeout);
+ INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout);
+ queue_delayed_work(system_wq, &fc->timeout.work,
+ fuse_timeout_timer_freq);
+}
+
+static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout)
+{
+ if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout)
+ return;
+
+ if (!timeout)
+ timeout = fuse_default_req_timeout;
+
+ if (fuse_max_req_timeout) {
+ if (timeout)
+ timeout = min(fuse_max_req_timeout, timeout);
+ else
+ timeout = fuse_max_req_timeout;
+ }
+
+ timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout);
+
+ set_request_timeout(fc, timeout);
+}
+
struct fuse_init_args {
struct fuse_args args;
struct fuse_init_in in;
@@ -1275,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
ok = false;
else {
unsigned long ra_pages;
+ unsigned int timeout = 0;
process_init_limits(fc, arg);
@@ -1338,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
fc->max_pages =
min_t(unsigned int, fc->max_pages_limit,
max_t(unsigned int, arg->max_pages, 1));
+
+ /*
+ * PATH_MAX file names might need two pages for
+ * ops like rename
+ */
+ if (fc->max_pages > 1)
+ fc->name_max = FUSE_NAME_MAX;
}
if (IS_ENABLED(CONFIG_FUSE_DAX)) {
if (flags & FUSE_MAP_ALIGNMENT &&
@@ -1392,12 +1435,17 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
}
if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled())
fc->io_uring = 1;
+
+ if (flags & FUSE_REQUEST_TIMEOUT)
+ timeout = arg->request_timeout;
} else {
ra_pages = fc->max_read / PAGE_SIZE;
fc->no_lock = 1;
fc->no_flock = 1;
}
+ init_server_timeout(fc, timeout);
+
fm->sb->s_bdi->ra_pages =
min(fm->sb->s_bdi->ra_pages, ra_pages);
fc->minor = arg->minor;
@@ -1439,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm)
FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
- FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP;
+ FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP |
+ FUSE_REQUEST_TIMEOUT;
#ifdef CONFIG_FUSE_DAX
if (fm->fc->dax)
flags |= FUSE_MAP_ALIGNMENT;
@@ -1665,7 +1714,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
fi = get_fuse_inode(root);
fi->nlookup--;
- sb->s_d_op = &fuse_dentry_operations;
+ set_default_d_op(sb, &fuse_dentry_operations);
sb->s_root = d_make_root(root);
if (!sb->s_root)
return -ENOMEM;
@@ -1800,12 +1849,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
err = -ENOMEM;
root = fuse_get_root_inode(sb, ctx->rootmode);
- sb->s_d_op = &fuse_root_dentry_operations;
+ set_default_d_op(sb, &fuse_dentry_operations);
root_dentry = d_make_root(root);
if (!root_dentry)
goto err_dev_free;
- /* Root dentry doesn't have .d_revalidate */
- sb->s_d_op = &fuse_dentry_operations;
mutex_lock(&fuse_mutex);
err = -EINVAL;
diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c
index 2d9abf48828f..57032eadca6c 100644
--- a/fs/fuse/ioctl.c
+++ b/fs/fuse/ioctl.c
@@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff)
fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode));
}
-int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct fuse_file *ff;
@@ -536,11 +536,13 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa)
cleanup:
fuse_priv_ioctl_cleanup(inode, ff);
+ if (err == -ENOTTY)
+ err = -EOPNOTSUPP;
return err;
}
int fuse_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct fuse_file *ff;
@@ -572,5 +574,7 @@ int fuse_fileattr_set(struct mnt_idmap *idmap,
cleanup:
fuse_priv_ioctl_cleanup(inode, ff);
+ if (err == -ENOTTY)
+ err = -EOPNOTSUPP;
return err;
}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..c2aae2eef086 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx,
fuse_add_dirent_to_cache(file, dirent, ctx->pos);
return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino,
- dirent->type);
+ dirent->type | FILLDIR_FLAG_NOINTR);
}
static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
@@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file,
struct fuse_conn *fc;
struct inode *inode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ int epoch;
if (!o->nodeid) {
/*
@@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file,
return -EIO;
fc = get_fuse_conn(dir);
+ epoch = atomic_read(&fc->epoch);
name.hash = full_name_hash(parent, name.name, name.len);
dentry = d_lookup(parent, &name);
@@ -256,6 +258,7 @@ retry:
}
if (fc->readdirplus_auto)
set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+ dentry->d_time = epoch;
fuse_change_entry_timeout(dentry, o);
dput(dentry);
@@ -332,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
{
int plus;
ssize_t res;
- struct folio *folio;
struct inode *inode = file_inode(file);
struct fuse_mount *fm = get_fuse_mount(inode);
+ struct fuse_conn *fc = fm->fc;
struct fuse_io_args ia = {};
- struct fuse_args_pages *ap = &ia.ap;
- struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+ struct fuse_args *args = &ia.ap.args;
+ void *buf;
+ size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT);
u64 attr_version = 0, evict_ctr = 0;
bool locked;
- folio = folio_alloc(GFP_KERNEL, 0);
- if (!folio)
+ buf = kvmalloc(bufsize, GFP_KERNEL);
+ if (!buf)
return -ENOMEM;
+ args->out_args[0].value = buf;
+
plus = fuse_use_readdirplus(inode, ctx);
- ap->args.out_pages = true;
- ap->num_folios = 1;
- ap->folios = &folio;
- ap->descs = &desc;
if (plus) {
attr_version = fuse_get_attr_version(fm->fc);
evict_ctr = fuse_get_evict_ctr(fm->fc);
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIRPLUS);
+ fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS);
} else {
- fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
- FUSE_READDIR);
+ fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR);
}
locked = fuse_lock_inode(inode);
- res = fuse_simple_request(fm, &ap->args);
+ res = fuse_simple_request(fm, args);
fuse_unlock_inode(inode, locked);
if (res >= 0) {
if (!res) {
@@ -369,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
if (ff->open_flags & FOPEN_CACHE_DIR)
fuse_readdir_cache_end(file, ctx->pos);
} else if (plus) {
- res = parse_dirplusfile(folio_address(folio), res,
- file, ctx, attr_version,
+ res = parse_dirplusfile(buf, res, file, ctx, attr_version,
evict_ctr);
} else {
- res = parse_dirfile(folio_address(folio), res, file,
- ctx);
+ res = parse_dirfile(buf, res, file, ctx);
}
}
- folio_put(folio);
+ kvfree(buf);
fuse_invalidate_atime(inode);
return res;
}
@@ -419,7 +417,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
if (ff->readdir.pos == ctx->pos) {
res = FOUND_SOME;
if (!dir_emit(ctx, dirent->name, dirent->namelen,
- dirent->ino, dirent->type))
+ dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR))
return FOUND_ALL;
ctx->pos = dirent->off;
}
diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c
index 63fb1e5bee30..e2d921abcb88 100644
--- a/fs/fuse/sysctl.c
+++ b/fs/fuse/sysctl.c
@@ -13,6 +13,12 @@ static struct ctl_table_header *fuse_table_header;
/* Bound by fuse_init_out max_pages, which is a u16 */
static unsigned int sysctl_fuse_max_pages_limit = 65535;
+/*
+ * fuse_init_out request timeouts are u16.
+ * This goes up to ~18 hours, which is plenty for a timeout.
+ */
+static unsigned int sysctl_fuse_req_timeout_limit = 65535;
+
static const struct ctl_table fuse_sysctl_table[] = {
{
.procname = "max_pages_limit",
@@ -23,6 +29,24 @@ static const struct ctl_table fuse_sysctl_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = &sysctl_fuse_max_pages_limit,
},
+ {
+ .procname = "default_request_timeout",
+ .data = &fuse_default_req_timeout,
+ .maxlen = sizeof(fuse_default_req_timeout),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &sysctl_fuse_req_timeout_limit,
+ },
+ {
+ .procname = "max_request_timeout",
+ .data = &fuse_max_req_timeout,
+ .maxlen = sizeof(fuse_max_req_timeout),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &sysctl_fuse_req_timeout_limit,
+ },
};
int fuse_sysctl_register(void)
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 82afe78ec542..c826e7ca49f5 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -9,7 +9,6 @@
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/group_cpus.h>
-#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/module.h>
#include <linux/virtio.h>
@@ -862,7 +861,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs)
{
const struct cpumask *mask, *masks;
- unsigned int q, cpu;
+ unsigned int q, cpu, nr_masks;
/* First attempt to map using existing transport layer affinities
* e.g. PCIe MSI-X
@@ -882,7 +881,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f
return;
fallback:
/* Attempt to map evenly in groups over the CPUs */
- masks = group_cpus_evenly(fs->num_request_queues);
+ masks = group_cpus_evenly(fs->num_request_queues, &nr_masks);
/* If even this fails we default to all CPUs use first request queue */
if (!masks) {
for_each_possible_cpu(cpu)
@@ -891,7 +890,7 @@ fallback:
}
for (q = 0; q < fs->num_request_queues; q++) {
- for_each_cpu(cpu, &masks[q])
+ for_each_cpu(cpu, &masks[q % nr_masks])
fs->mq_map[cpu] = q + VQ_REQUEST;
}
kfree(masks);
@@ -1008,7 +1007,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
*/
static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
long nr_pages, enum dax_access_mode mode,
- void **kaddr, pfn_t *pfn)
+ void **kaddr, unsigned long *pfn)
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
@@ -1017,8 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
if (kaddr)
*kaddr = fs->window_kaddr + offset;
if (pfn)
- *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
- PFN_DEV | PFN_MAP);
+ *pfn = fs->window_phys_addr + offset;
return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
}
@@ -1670,6 +1668,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
unsigned int virtqueue_size;
int err = -EIO;
+ if (!fsc->source)
+ return invalf(fsc, "No source specified");
+
/* This gets a reference on virtio_fs object. This ptr gets installed
* in fc->iq->priv. Once fuse_conn is going away, it calls ->put()
* to drop the reference to this object.
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index be7f87a8e11a..7bd231d16d4a 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -4,7 +4,6 @@ config GFS2_FS
select BUFFER_HEAD
select FS_POSIX_ACL
select CRC32
- select LIBCRC32C
select QUOTACTL
select FS_IOMAP
help
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 68fc8af14700..47d74afd63ac 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -37,27 +37,6 @@
#include "aops.h"
-void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
- size_t from, size_t len)
-{
- struct buffer_head *head = folio_buffers(folio);
- unsigned int bsize = head->b_size;
- struct buffer_head *bh;
- size_t to = from + len;
- size_t start, end;
-
- for (bh = head, start = 0; bh != head || !start;
- bh = bh->b_this_page, start = end) {
- end = start + bsize;
- if (end <= from)
- continue;
- if (start >= to)
- break;
- set_buffer_uptodate(bh);
- gfs2_trans_add_data(ip->i_gl, bh);
- }
-}
-
/**
* gfs2_get_block_noalloc - Fills in a buffer head with details about a block
* @inode: The inode
@@ -133,12 +112,43 @@ static int __gfs2_jdata_write_folio(struct folio *folio,
inode->i_sb->s_blocksize,
BIT(BH_Dirty)|BIT(BH_Uptodate));
}
- gfs2_trans_add_databufs(ip, folio, 0, folio_size(folio));
+ gfs2_trans_add_databufs(ip->i_gl, folio, 0, folio_size(folio));
}
return gfs2_write_jdata_folio(folio, wbc);
}
/**
+ * gfs2_jdata_writeback - Write jdata folios to the log
+ * @mapping: The mapping to write
+ * @wbc: The writeback control
+ *
+ * Returns: errno
+ */
+int gfs2_jdata_writeback(struct address_space *mapping, struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(mapping->host);
+ struct folio *folio = NULL;
+ int error;
+
+ BUG_ON(current->journal_info);
+ if (gfs2_assert_withdraw(sdp, ip->i_gl->gl_state == LM_ST_EXCLUSIVE))
+ return 0;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ if (folio_test_checked(folio)) {
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
+ continue;
+ }
+ error = __gfs2_jdata_write_folio(folio, wbc);
+ }
+
+ return error;
+}
+
+/**
* gfs2_writepages - Write a bunch of dirty pages back to disk
* @mapping: The mapping to write
* @wbc: Write-back control
@@ -149,7 +159,11 @@ static int gfs2_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &gfs2_writeback_ops,
+ };
int ret;
/*
@@ -158,7 +172,7 @@ static int gfs2_writepages(struct address_space *mapping,
* want balance_dirty_pages() to loop indefinitely trying to write out
* pages held in the ail that it can't find.
*/
- ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops);
+ ret = iomap_writepages(&wpc);
if (ret == 0 && wbc->nr_to_write > 0)
set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags);
return ret;
@@ -228,24 +242,16 @@ continue_unlock:
ret = __gfs2_jdata_write_folio(folio, wbc);
if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
- folio_unlock(folio);
- ret = 0;
- } else {
-
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- *done_index = folio_next_index(folio);
- ret = 1;
- break;
- }
+ /*
+ * done_index is set past this page, so media errors
+ * will not choke background writeout for the entire
+ * file. This has consequences for range_cyclic
+ * semantics (ie. it may not be suitable for data
+ * integrity writeout).
+ */
+ *done_index = folio_next_index(folio);
+ ret = 1;
+ break;
}
/*
@@ -540,7 +546,7 @@ out:
gfs2_trans_end(sdp);
}
-static bool jdata_dirty_folio(struct address_space *mapping,
+static bool gfs2_jdata_dirty_folio(struct address_space *mapping,
struct folio *folio)
{
if (current->journal_info)
@@ -722,7 +728,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.writepages = gfs2_jdata_writepages,
.read_folio = gfs2_read_folio,
.readahead = gfs2_readahead,
- .dirty_folio = jdata_dirty_folio,
+ .dirty_folio = gfs2_jdata_dirty_folio,
.bmap = gfs2_bmap,
.migrate_folio = buffer_migrate_folio,
.invalidate_folio = gfs2_invalidate_folio,
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
index a10c4334d248..bf002522a782 100644
--- a/fs/gfs2/aops.h
+++ b/fs/gfs2/aops.h
@@ -9,7 +9,6 @@
#include "incore.h"
void adjust_fs_space(struct inode *inode);
-void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio,
- size_t from, size_t len);
+int gfs2_jdata_writeback(struct address_space *mapping, struct writeback_control *wbc);
#endif /* __AOPS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1795c4e8dbf6..131091520de6 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -963,12 +963,16 @@ static struct folio *
gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
{
struct inode *inode = iter->inode;
+ struct gfs2_inode *ip = GFS2_I(inode);
unsigned int blockmask = i_blocksize(inode) - 1;
struct gfs2_sbd *sdp = GFS2_SB(inode);
unsigned int blocks;
struct folio *folio;
int status;
+ if (!gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
+ return iomap_get_folio(iter, pos, len);
+
blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
if (status)
@@ -987,20 +991,22 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
- if (!gfs2_is_stuffed(ip))
- gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos),
+ if (gfs2_is_jdata(ip) && !gfs2_is_stuffed(ip))
+ gfs2_trans_add_databufs(ip->i_gl, folio,
+ offset_in_folio(folio, pos),
copied);
folio_unlock(folio);
folio_put(folio);
- if (tr->tr_num_buf_new)
- __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
-
- gfs2_trans_end(sdp);
+ if (gfs2_is_jdata(ip) || gfs2_is_stuffed(ip)) {
+ if (tr->tr_num_buf_new)
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+ gfs2_trans_end(sdp);
+ }
}
-static const struct iomap_folio_ops gfs2_iomap_folio_ops = {
+const struct iomap_write_ops gfs2_iomap_write_ops = {
.get_folio = gfs2_iomap_get_folio,
.put_folio = gfs2_iomap_put_folio,
};
@@ -1077,8 +1083,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
gfs2_trans_end(sdp);
}
- if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
- iomap->folio_ops = &gfs2_iomap_folio_ops;
return 0;
out_trans_end:
@@ -1296,11 +1300,14 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
* uses iomap write to perform its actions, which begin their own transactions
* (iomap_begin, get_folio, etc.)
*/
-static int gfs2_block_zero_range(struct inode *inode, loff_t from,
- unsigned int length)
+static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length)
{
BUG_ON(current->journal_info);
- return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
+ if (from >= inode->i_size)
+ return 0;
+ length = min(length, inode->i_size - from);
+ return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
+ &gfs2_iomap_write_ops, NULL);
}
#define GFS2_JTRUNC_REVOKES 8192
@@ -2465,23 +2472,26 @@ out:
return error;
}
-static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset, unsigned int len)
+static ssize_t gfs2_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned int len, u64 end_pos)
{
- int ret;
-
- if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
+ if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(wpc->inode))))
return -EIO;
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int ret;
- memset(&wpc->iomap, 0, sizeof(wpc->iomap));
- ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
- return ret;
+ memset(&wpc->iomap, 0, sizeof(wpc->iomap));
+ ret = gfs2_iomap_get(wpc->inode, offset, INT_MAX, &wpc->iomap);
+ if (ret)
+ return ret;
+ }
+
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
const struct iomap_writeback_ops gfs2_writeback_ops = {
- .map_blocks = gfs2_map_blocks,
+ .writeback_range = gfs2_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 4e8b1e8ebdf3..6cdc72dd55a3 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,6 +44,7 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
}
extern const struct iomap_ops gfs2_iomap_ops;
+extern const struct iomap_write_ops gfs2_iomap_write_ops;
extern const struct iomap_writeback_ops gfs2_writeback_ops;
int gfs2_unstuff_dinode(struct gfs2_inode *ip);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index dbf1aede744c..509e2f0d97e7 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -60,6 +60,7 @@
#include <linux/crc32.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
+#include <linux/log2.h>
#include "gfs2.h"
#include "incore.h"
@@ -912,7 +913,6 @@ static int dir_make_exhash(struct inode *inode)
struct qstr args;
struct buffer_head *bh, *dibh;
struct gfs2_leaf *leaf;
- int y;
u32 x;
__be64 *lp;
u64 bn;
@@ -979,9 +979,7 @@ static int dir_make_exhash(struct inode *inode)
i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
gfs2_add_inode_blocks(&dip->i_inode, 1);
dip->i_diskflags |= GFS2_DIF_EXHASH;
-
- for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
- dip->i_depth = y;
+ dip->i_depth = ilog2(sdp->sd_hash_ptrs);
gfs2_dinode_out(dip, dibh->b_data);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c9bb3be21d2b..72d95185a39f 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -155,7 +155,7 @@ static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags)
return fsflags;
}
-int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct gfs2_inode *ip = GFS2_I(inode);
@@ -276,7 +276,7 @@ out:
}
int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
u32 fsflags = fa->flags, gfsflags = 0;
@@ -820,7 +820,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*
* Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
@@ -885,7 +885,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*
* For writes, iomap_dio_rw only triggers manual page faults, so we
@@ -957,7 +957,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*/
@@ -1024,7 +1024,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
/*
* In this function, we disable page faults when we're holding the
* inode glock while doing I/O. If a page fault occurs, we indicate
- * that the inode glock may be dropped, fault in the pages manually,
+ * that the inode glock should be dropped, fault in the pages manually,
* and retry.
*/
@@ -1058,7 +1058,8 @@ retry:
}
pagefault_disable();
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL);
+ ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops,
+ &gfs2_iomap_write_ops, NULL);
pagefault_enable();
if (ret > 0)
written += ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8c4c1f871a88..b6fd1cb17de7 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -590,31 +590,32 @@ static void gfs2_demote_wake(struct gfs2_glock *gl)
static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh;
- unsigned state = ret & LM_OUT_ST_MASK;
- trace_gfs2_glock_state_change(gl, state);
- state_change(gl, state);
- gh = find_first_waiter(gl);
+ if (!(ret & ~LM_OUT_ST_MASK)) {
+ unsigned state = ret & LM_OUT_ST_MASK;
+
+ trace_gfs2_glock_state_change(gl, state);
+ state_change(gl, state);
+ }
+
/* Demote to UN request arrived during demote to SH or DF */
if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
- state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+ gl->gl_state != LM_ST_UNLOCKED &&
+ gl->gl_demote_state == LM_ST_UNLOCKED)
gl->gl_target = LM_ST_UNLOCKED;
/* Check for state != intended state */
- if (unlikely(state != gl->gl_target)) {
- if (gh && (ret & LM_OUT_CANCELED))
- gfs2_holder_wake(gh);
+ if (unlikely(gl->gl_state != gl->gl_target)) {
+ struct gfs2_holder *gh = find_first_waiter(gl);
+
if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
- /* move to back of queue and try next entry */
if (ret & LM_OUT_CANCELED) {
- list_move_tail(&gh->gh_list, &gl->gl_holders);
- gh = find_first_waiter(gl);
- gl->gl_target = gh->gh_state;
- if (do_promote(gl))
- goto out;
- goto retry;
+ list_del_init(&gh->gh_list);
+ trace_gfs2_glock_queue(gh, 0);
+ gfs2_holder_wake(gh);
+ gl->gl_target = gl->gl_state;
+ goto out;
}
/* Some error or failed "try lock" - report it */
if ((ret & LM_OUT_ERROR) ||
@@ -624,10 +625,9 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
goto out;
}
}
- switch(state) {
+ switch(gl->gl_state) {
/* Unlocked due to conversion deadlock, try again */
case LM_ST_UNLOCKED:
-retry:
do_xmote(gl, gh, gl->gl_target);
break;
/* Conversion fails, unlock and try again */
@@ -636,8 +636,10 @@ retry:
do_xmote(gl, gh, LM_ST_UNLOCKED);
break;
default: /* Everything else */
- fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n",
- gl->gl_target, state);
+ fs_err(gl->gl_name.ln_sbd,
+ "glock %u:%llu requested=%u ret=%u\n",
+ gl->gl_name.ln_type, gl->gl_name.ln_number,
+ gl->gl_req, ret);
GLOCK_BUG_ON(gl, 1);
}
return;
@@ -646,7 +648,7 @@ retry:
/* Fast path - we got what we asked for */
if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
gfs2_demote_wake(gl);
- if (state != LM_ST_UNLOCKED) {
+ if (gl->gl_state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
int rv;
@@ -661,7 +663,8 @@ retry:
do_promote(gl);
}
out:
- clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (!test_bit(GLF_CANCELING, &gl->gl_flags))
+ clear_bit(GLF_LOCK, &gl->gl_flags);
}
static bool is_system_glock(struct gfs2_glock *gl)
@@ -797,7 +800,8 @@ skip_inval:
* We skip telling dlm to do the locking, so we won't get a
* reply that would otherwise clear GLF_LOCK. So we clear it here.
*/
- clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (!test_bit(GLF_CANCELING, &gl->gl_flags))
+ clear_bit(GLF_LOCK, &gl->gl_flags);
clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD);
return;
@@ -807,6 +811,7 @@ skip_inval:
}
if (ls->ls_ops->lm_lock) {
+ set_bit(GLF_PENDING_REPLY, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
ret = ls->ls_ops->lm_lock(gl, target, lck_flags);
spin_lock(&gl->gl_lockref.lock);
@@ -825,6 +830,7 @@ skip_inval:
/* The operation will be completed asynchronously. */
return;
}
+ clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
}
/* Complete the operation now. */
@@ -843,12 +849,13 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock)
__releases(&gl->gl_lockref.lock)
__acquires(&gl->gl_lockref.lock)
{
- struct gfs2_holder *gh = NULL;
+ struct gfs2_holder *gh;
if (test_bit(GLF_LOCK, &gl->gl_flags))
return;
set_bit(GLF_LOCK, &gl->gl_flags);
+ /* While a demote is in progress, the GLF_LOCK flag must be set. */
GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
@@ -860,18 +867,22 @@ __acquires(&gl->gl_lockref.lock)
set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
gl->gl_target = gl->gl_demote_state;
+ do_xmote(gl, NULL, gl->gl_target);
+ return;
} else {
if (test_bit(GLF_DEMOTE, &gl->gl_flags))
gfs2_demote_wake(gl);
if (do_promote(gl))
goto out_unlock;
gh = find_first_waiter(gl);
+ if (!gh)
+ goto out_unlock;
gl->gl_target = gh->gh_state;
if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
do_error(gl, 0); /* Fail queued try locks */
+ do_xmote(gl, gh, gl->gl_target);
+ return;
}
- do_xmote(gl, gh, gl->gl_target);
- return;
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
@@ -898,12 +909,8 @@ void glock_set_object(struct gfs2_glock *gl, void *object)
prev_object = gl->gl_object;
gl->gl_object = object;
spin_unlock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) {
- pr_warn("glock=%u/%llx\n",
- gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL))
gfs2_dump_glock(NULL, gl, true);
- }
}
/**
@@ -919,12 +926,8 @@ void glock_clear_object(struct gfs2_glock *gl, void *object)
prev_object = gl->gl_object;
gl->gl_object = NULL;
spin_unlock(&gl->gl_lockref.lock);
- if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) {
- pr_warn("glock=%u/%llx\n",
- gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
+ if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object))
gfs2_dump_glock(NULL, gl, true);
- }
}
void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation)
@@ -959,6 +962,25 @@ static void gfs2_glock_poke(struct gfs2_glock *gl)
gfs2_holder_uninit(&gh);
}
+static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl)
+{
+ struct gfs2_inode *ip;
+
+ spin_lock(&gl->gl_lockref.lock);
+ ip = gl->gl_object;
+ if (ip && !igrab(&ip->i_inode))
+ ip = NULL;
+ spin_unlock(&gl->gl_lockref.lock);
+ if (ip) {
+ wait_on_inode(&ip->i_inode);
+ if (is_bad_inode(&ip->i_inode)) {
+ iput(&ip->i_inode);
+ ip = NULL;
+ }
+ }
+ return ip;
+}
+
static void gfs2_try_evict(struct gfs2_glock *gl)
{
struct gfs2_inode *ip;
@@ -976,32 +998,15 @@ static void gfs2_try_evict(struct gfs2_glock *gl)
* happened below. (Verification is triggered by the call to
* gfs2_queue_verify_delete() in gfs2_evict_inode().)
*/
- spin_lock(&gl->gl_lockref.lock);
- ip = gl->gl_object;
- if (ip && !igrab(&ip->i_inode))
- ip = NULL;
- spin_unlock(&gl->gl_lockref.lock);
- if (ip) {
- wait_on_inode(&ip->i_inode);
- if (is_bad_inode(&ip->i_inode)) {
- iput(&ip->i_inode);
- ip = NULL;
- }
- }
+ ip = gfs2_grab_existing_inode(gl);
if (ip) {
- set_bit(GIF_DEFER_DELETE, &ip->i_flags);
+ set_bit(GLF_DEFER_DELETE, &gl->gl_flags);
d_prune_aliases(&ip->i_inode);
iput(&ip->i_inode);
+ clear_bit(GLF_DEFER_DELETE, &gl->gl_flags);
/* If the inode was evicted, gl->gl_object will now be NULL. */
- spin_lock(&gl->gl_lockref.lock);
- ip = gl->gl_object;
- if (ip) {
- clear_bit(GIF_DEFER_DELETE, &ip->i_flags);
- if (!igrab(&ip->i_inode))
- ip = NULL;
- }
- spin_unlock(&gl->gl_lockref.lock);
+ ip = gfs2_grab_existing_inode(gl);
if (ip) {
gfs2_glock_poke(ip->i_gl);
iput(&ip->i_inode);
@@ -1160,7 +1165,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops, int create,
struct gfs2_glock **glp)
{
- struct super_block *s = sdp->sd_vfs;
struct lm_lockname name = { .ln_number = number,
.ln_type = glops->go_type,
.ln_sbd = sdp };
@@ -1201,8 +1205,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
if (glops->go_instantiate)
gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED);
gl->gl_name = name;
+ lockref_init(&gl->gl_lockref);
lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass);
- gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
@@ -1223,7 +1227,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping = gfs2_glock2aspace(gl);
if (mapping) {
mapping->a_ops = &gfs2_meta_aops;
- mapping->host = s->s_bdev->bd_mapping->host;
+ mapping->host = sdp->sd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->i_private_data = NULL;
@@ -1462,9 +1466,7 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh)
{
if (!(gh->gh_flags & GL_NOPID))
return true;
- if (gh->gh_state == LM_ST_UNLOCKED)
- return true;
- return false;
+ return !test_bit(HIF_HOLDER, &gh->gh_iflags);
}
/**
@@ -1483,7 +1485,6 @@ __acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct list_head *insert_pt = NULL;
struct gfs2_holder *gh2;
int try_futile = 0;
@@ -1519,21 +1520,11 @@ fail:
gfs2_holder_wake(gh);
return;
}
- if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
- continue;
}
trace_gfs2_glock_queue(gh, 1);
gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT);
- if (likely(insert_pt == NULL)) {
- list_add_tail(&gh->gh_list, &gl->gl_holders);
- return;
- }
- list_add_tail(&gh->gh_list, insert_pt);
- spin_unlock(&gl->gl_lockref.lock);
- if (sdp->sd_lockstruct.ls_ops->lm_cancel)
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_lockref.lock);
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
return;
trap_recursive:
@@ -1673,11 +1664,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
}
if (list_is_first(&gh->gh_list, &gl->gl_holders) &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags)) {
+ !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
+ test_bit(GLF_LOCK, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ !test_bit(GLF_CANCELING, &gl->gl_flags)) {
+ set_bit(GLF_CANCELING, &gl->gl_flags);
spin_unlock(&gl->gl_lockref.lock);
gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl);
wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
spin_lock(&gl->gl_lockref.lock);
+ clear_bit(GLF_CANCELING, &gl->gl_flags);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ if (!gfs2_holder_queued(gh))
+ goto out;
}
/*
@@ -1923,6 +1922,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
spin_lock(&gl->gl_lockref.lock);
+ clear_bit(GLF_PENDING_REPLY, &gl->gl_flags);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
@@ -2323,6 +2323,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'f';
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
*p++ = 'i';
+ if (test_bit(GLF_PENDING_REPLY, gflags))
+ *p++ = 'R';
if (test_bit(GLF_HAVE_REPLY, gflags))
*p++ = 'r';
if (test_bit(GLF_INITIAL, gflags))
@@ -2347,6 +2349,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
*p++ = 'e';
if (test_bit(GLF_VERIFY_DELETE, gflags))
*p++ = 'E';
+ if (test_bit(GLF_DEFER_DELETE, gflags))
+ *p++ = 's';
+ if (test_bit(GLF_CANCELING, gflags))
+ *p++ = 'C';
*p = 0;
return buf;
}
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index c171f745650f..9339a3bff6ee 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -92,12 +92,22 @@ enum {
* LM_OUT_ST_MASK
* Masks the lower two bits of lock state in the returned value.
*
+ * LM_OUT_TRY_AGAIN
+ * The trylock request failed.
+ *
+ * LM_OUT_DEADLOCK
+ * The lock request failed because it would deadlock.
+ *
* LM_OUT_CANCELED
* The lock request was canceled.
*
+ * LM_OUT_ERROR
+ * The lock request timed out or failed.
*/
#define LM_OUT_ST_MASK 0x00000003
+#define LM_OUT_TRY_AGAIN 0x00000020
+#define LM_OUT_DEADLOCK 0x00000010
#define LM_OUT_CANCELED 0x00000008
#define LM_OUT_ERROR 0x00000004
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index eb4714f299ef..fe0faad4892f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -11,6 +11,7 @@
#include <linux/bio.h>
#include <linux/posix_acl.h>
#include <linux/security.h>
+#include <linux/log2.h>
#include "gfs2.h"
#include "incore.h"
@@ -168,7 +169,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static int gfs2_rgrp_metasync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct address_space *metamapping = &sdp->sd_aspace;
+ struct address_space *metamapping = gfs2_aspace(sdp);
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
const unsigned bsize = sdp->sd_sb.sb_bsize;
loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK;
@@ -225,7 +226,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl)
static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- struct address_space *mapping = &sdp->sd_aspace;
+ struct address_space *mapping = gfs2_aspace(sdp);
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl);
const unsigned bsize = sdp->sd_sb.sb_bsize;
loff_t start, end;
@@ -450,6 +451,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
gfs2_consist_inode(ip);
return -EIO;
}
+ if ((ip->i_diskflags & GFS2_DIF_EXHASH) &&
+ depth < ilog2(sdp->sd_hash_ptrs)) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
ip->i_depth = (u8)depth;
ip->i_entries = be32_to_cpu(str->di_entries);
@@ -601,14 +607,13 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl)
if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
- error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
+ error = gfs2_find_jhead(sdp->sd_jdesc, &head);
if (gfs2_assert_withdraw_delayed(sdp, !error))
return error;
if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags &
GFS2_LOG_HEAD_UNMOUNT))
return -EIO;
- sdp->sd_log_sequence = head.lh_sequence + 1;
- gfs2_log_pointers_init(sdp, head.lh_blkno);
+ gfs2_log_pointers_init(sdp, &head);
}
return 0;
}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4e19cce3d906..d4ad82f47eee 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -330,6 +330,9 @@ enum {
GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */
GLF_TRY_TO_EVICT = 17, /* iopen glocks only */
GLF_VERIFY_DELETE = 18, /* iopen glocks only */
+ GLF_PENDING_REPLY = 19,
+ GLF_DEFER_DELETE = 20, /* iopen glocks only */
+ GLF_CANCELING = 21,
};
struct gfs2_glock {
@@ -372,11 +375,9 @@ struct gfs2_glock {
enum {
GIF_QD_LOCKED = 1,
- GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
GIF_FREE_VFS_INODE = 5,
GIF_GLOP_PENDING = 6,
- GIF_DEFER_DELETE = 7,
};
struct gfs2_inode {
@@ -793,7 +794,7 @@ struct gfs2_sbd {
/* Log stuff */
- struct address_space sd_aspace;
+ struct inode *sd_inode;
spinlock_t sd_log_lock;
@@ -849,6 +850,13 @@ struct gfs2_sbd {
unsigned long sd_glock_dqs_held;
};
+#define GFS2_BAD_INO 1
+
+static inline struct address_space *gfs2_aspace(struct gfs2_sbd *sdp)
+{
+ return sdp->sd_inode->i_mapping;
+}
+
static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which)
{
gl->gl_stats.stats[which]++;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 6fbbaaad1cd0..8760e7e20c9d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -439,6 +439,72 @@ out:
return error;
}
+static void gfs2_final_release_pages(struct gfs2_inode *ip)
+{
+ struct inode *inode = &ip->i_inode;
+ struct gfs2_glock *gl = ip->i_gl;
+
+ /* This can only happen during incomplete inode creation. */
+ if (unlikely(!gl))
+ return;
+
+ truncate_inode_pages(gfs2_glock2aspace(gl), 0);
+ truncate_inode_pages(&inode->i_data, 0);
+
+ if (atomic_read(&gl->gl_revokes) == 0) {
+ clear_bit(GLF_LFLUSH, &gl->gl_flags);
+ clear_bit(GLF_DIRTY, &gl->gl_flags);
+ }
+}
+
+int gfs2_dinode_dealloc(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_rgrpd *rgd;
+ struct gfs2_holder gh;
+ int error;
+
+ if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
+ gfs2_consist_inode(ip);
+ return -EIO;
+ }
+
+ gfs2_rindex_update(sdp);
+
+ error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+ if (error)
+ return error;
+
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);
+ if (!rgd) {
+ gfs2_consist_inode(ip);
+ error = -EIO;
+ goto out_qs;
+ }
+
+ error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+ LM_FLAG_NODE_SCOPE, &gh);
+ if (error)
+ goto out_qs;
+
+ error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
+ sdp->sd_jdesc->jd_blocks);
+ if (error)
+ goto out_rg_gunlock;
+
+ gfs2_free_di(rgd, ip);
+
+ gfs2_final_release_pages(ip);
+
+ gfs2_trans_end(sdp);
+
+out_rg_gunlock:
+ gfs2_glock_dq_uninit(&gh);
+out_qs:
+ gfs2_quota_unhold(ip);
+ return error;
+}
+
static void gfs2_init_dir(struct buffer_head *dibh,
const struct gfs2_inode *parent)
{
@@ -629,10 +695,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
- int error;
+ int error, dealloc_error;
u32 aflags = 0;
unsigned blocks = 1;
struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
+ bool xattr_initialized = false;
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
@@ -659,7 +726,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!IS_ERR(inode)) {
if (S_ISDIR(inode->i_mode)) {
iput(inode);
- inode = ERR_PTR(-EISDIR);
+ inode = NULL;
+ error = -EISDIR;
goto fail_gunlock;
}
d_instantiate(dentry, inode);
@@ -744,11 +812,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
if (error)
- goto fail_free_inode;
+ goto fail_dealloc_inode;
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl);
if (error)
- goto fail_free_inode;
+ goto fail_dealloc_inode;
gfs2_cancel_delete_work(io_gl);
io_gl->gl_no_formal_ino = ip->i_no_formal_ino;
@@ -767,13 +835,16 @@ retry:
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
if (error)
goto fail_gunlock3;
+ clear_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags);
error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
goto fail_gunlock3;
- if (blocks > 1)
+ if (blocks > 1) {
gfs2_init_xattr(ip);
+ xattr_initialized = true;
+ }
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
@@ -828,6 +899,17 @@ fail_gunlock3:
gfs2_glock_dq_uninit(&ip->i_iopen_gh);
fail_gunlock2:
gfs2_glock_put(io_gl);
+fail_dealloc_inode:
+ dealloc_error = 0;
+ if (ip->i_eattr)
+ dealloc_error = gfs2_ea_dealloc(ip, xattr_initialized);
+ clear_nlink(inode);
+ mark_inode_dirty(inode);
+ if (!dealloc_error)
+ dealloc_error = gfs2_dinode_dealloc(ip);
+ if (dealloc_error)
+ fs_warn(sdp, "%s: %d\n", __func__, dealloc_error);
+ ip->i_no_addr = 0;
fail_free_inode:
if (ip->i_gl) {
gfs2_glock_put(ip->i_gl);
@@ -842,10 +924,6 @@ fail_gunlock:
gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(&d_gh);
if (!IS_ERR_OR_NULL(inode)) {
- set_bit(GIF_ALLOC_FAILED, &ip->i_flags);
- clear_nlink(inode);
- if (ip->i_no_addr)
- mark_inode_dirty(inode);
if (inode->i_state & I_NEW)
iget_failed(inode);
else
@@ -1248,14 +1326,15 @@ static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir,
* @dentry: The dentry of the new directory
* @mode: The mode of the new directory
*
- * Returns: errno
+ * Returns: the dentry, or ERR_PTR(errno)
*/
-static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir));
- return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0);
+
+ return ERR_PTR(gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0));
}
/**
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 9e5e1622d50a..e43f08eb26e7 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -44,17 +44,17 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
{
- inode->i_blocks = blocks << (inode->i_blkbits - 9);
+ inode->i_blocks = blocks << (inode->i_blkbits - SECTOR_SHIFT);
}
static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
{
- return inode->i_blocks >> (inode->i_blkbits - 9);
+ return inode->i_blocks >> (inode->i_blkbits - SECTOR_SHIFT);
}
static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
{
- change <<= inode->i_blkbits - 9;
+ change <<= inode->i_blkbits - SECTOR_SHIFT;
gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change));
inode->i_blocks += change;
}
@@ -92,6 +92,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr,
u64 no_formal_ino,
unsigned int blktype);
+int gfs2_dinode_dealloc(struct gfs2_inode *ip);
struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
int is_root);
@@ -106,9 +107,9 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset);
extern const struct file_operations gfs2_file_fops_nolock;
extern const struct file_operations gfs2_dir_fops_nolock;
-int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int gfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int gfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
void gfs2_set_inode_flags(struct inode *inode);
#ifdef CONFIG_GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 58aeeae7ed8c..cee5d199d2d8 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -119,7 +119,7 @@ static inline void gfs2_update_request_times(struct gfs2_glock *gl)
static void gdlm_ast(void *arg)
{
struct gfs2_glock *gl = arg;
- unsigned ret = gl->gl_state;
+ unsigned ret;
/* If the glock is dead, we only react to a dlm_unlock() reply. */
if (__lockref_is_dead(&gl->gl_lockref) &&
@@ -139,13 +139,16 @@ static void gdlm_ast(void *arg)
gfs2_glock_free(gl);
return;
case -DLM_ECANCEL: /* Cancel while getting lock */
- ret |= LM_OUT_CANCELED;
+ ret = LM_OUT_CANCELED;
goto out;
case -EAGAIN: /* Try lock fails */
+ ret = LM_OUT_TRY_AGAIN;
+ goto out;
case -EDEADLK: /* Deadlock detected */
+ ret = LM_OUT_DEADLOCK;
goto out;
case -ETIMEDOUT: /* Canceled due to timeout */
- ret |= LM_OUT_ERROR;
+ ret = LM_OUT_ERROR;
goto out;
case 0: /* Success */
break;
@@ -328,6 +331,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ uint32_t flags = 0;
int error;
BUG_ON(!__lockref_is_dead(&gl->gl_lockref));
@@ -352,7 +356,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
* When the lockspace is released, all remaining glocks will be
* unlocked automatically. This is more efficient than unlocking them
* individually, but when the lock is held in DLM_LOCK_EX or
- * DLM_LOCK_PW mode, the lock value block (LVB) will be lost.
+ * DLM_LOCK_PW mode, the lock value block (LVB) would be lost.
*/
if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
@@ -361,8 +365,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
return;
}
+ if (gl->gl_lksb.sb_lvbptr)
+ flags |= DLM_LKF_VALBLK;
+
again:
- error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
+ error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags,
NULL, gl);
if (error == -EBUSY) {
msleep(20);
@@ -996,14 +1003,15 @@ locks_done:
if (sdp->sd_args.ar_spectator) {
fs_info(sdp, "Recovery is required. Waiting for a "
"non-spectator to mount.\n");
+ spin_unlock(&ls->ls_recover_spin);
msleep_interruptible(1000);
} else {
fs_info(sdp, "control_mount wait1 block %u start %u "
"mount %u lvb %u flags %lx\n", block_gen,
start_gen, mount_gen, lvb_gen,
ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
}
- spin_unlock(&ls->ls_recover_spin);
goto restart;
}
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index f9c5089783d2..115c4ac457e9 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -31,6 +31,7 @@
#include "dir.h"
#include "trace_gfs2.h"
#include "trans.h"
+#include "aops.h"
static void gfs2_log_shutdown(struct gfs2_sbd *sdp);
@@ -131,7 +132,11 @@ __acquires(&sdp->sd_ail_lock)
if (!mapping)
continue;
spin_unlock(&sdp->sd_ail_lock);
- ret = mapping->a_ops->writepages(mapping, wbc);
+ BUG_ON(GFS2_SB(mapping->host) != sdp);
+ if (gfs2_is_jdata(GFS2_I(mapping->host)))
+ ret = gfs2_jdata_writeback(mapping, wbc);
+ else
+ ret = mapping->a_ops->writepages(mapping, wbc);
if (need_resched()) {
blk_finish_plug(plug);
cond_resched();
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index c27b05099c1e..fc30ebdad83a 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -44,17 +44,6 @@ __releases(&sdp->sd_log_lock)
spin_unlock(&sdp->sd_log_lock);
}
-static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
- unsigned int value)
-{
- if (++value == sdp->sd_jdesc->jd_blocks) {
- value = 0;
- }
- sdp->sd_log_tail = value;
- sdp->sd_log_flush_tail = value;
- sdp->sd_log_head = value;
-}
-
static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 314ec2a70167..9c8c305a75c4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -157,7 +157,9 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock)
/**
* gfs2_end_log_write_bh - end log write of pagecache data with buffers
* @sdp: The superblock
- * @bvec: The bio_vec
+ * @folio: The folio
+ * @offset: The first byte within the folio that completed
+ * @size: The number of bytes that completed
* @error: The i/o status
*
* This finds the relevant buffers and unlocks them and sets the
@@ -166,17 +168,13 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock)
* that is pinned in the pagecache.
*/
-static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
- struct bio_vec *bvec,
- blk_status_t error)
+static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct folio *folio,
+ size_t offset, size_t size, blk_status_t error)
{
struct buffer_head *bh, *next;
- struct page *page = bvec->bv_page;
- unsigned size;
- bh = page_buffers(page);
- size = bvec->bv_len;
- while (bh_offset(bh) < bvec->bv_offset)
+ bh = folio_buffers(folio);
+ while (bh_offset(bh) < offset)
bh = bh->b_this_page;
do {
if (error)
@@ -186,7 +184,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp,
size -= bh->b_size;
brelse(bh);
bh = next;
- } while(bh && size);
+ } while (bh && size);
}
/**
@@ -203,13 +201,14 @@ static void gfs2_end_log_write(struct bio *bio)
{
struct gfs2_sbd *sdp = bio->bi_private;
struct bio_vec *bvec;
- struct page *page;
struct bvec_iter_all iter_all;
if (bio->bi_status) {
- if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status))
+ int err = blk_status_to_errno(bio->bi_status);
+
+ if (!cmpxchg(&sdp->sd_log_error, 0, err))
fs_err(sdp, "Error %d writing to journal, jid=%u\n",
- bio->bi_status, sdp->sd_jdesc->jd_jid);
+ err, sdp->sd_jdesc->jd_jid);
gfs2_withdraw_delayed(sdp);
/* prevent more writes to the journal */
clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
@@ -217,9 +216,12 @@ static void gfs2_end_log_write(struct bio *bio)
}
bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
- if (page_has_buffers(page))
- gfs2_end_log_write_bh(sdp, bvec, bio->bi_status);
+ struct page *page = bvec->bv_page;
+ struct folio *folio = page_folio(page);
+
+ if (folio && folio_buffers(folio))
+ gfs2_end_log_write_bh(sdp, folio, bvec->bv_offset,
+ bvec->bv_len, bio->bi_status);
else
mempool_free(page, gfs2_page_pool);
}
@@ -359,8 +361,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh)
dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head);
gfs2_log_incr_head(sdp);
- gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size,
- bh_offset(bh), dblock);
+ gfs2_log_write(sdp, sdp->sd_jdesc, folio_page(bh->b_folio, 0),
+ bh->b_size, bh_offset(bh), dblock);
}
/**
@@ -406,17 +408,16 @@ static void gfs2_end_log_read(struct bio *bio)
}
/**
- * gfs2_jhead_pg_srch - Look for the journal head in a given page.
+ * gfs2_jhead_folio_search - Look for the journal head in a given page.
* @jd: The journal descriptor
* @head: The journal head to start from
- * @page: The page to look in
+ * @folio: The folio to look in
*
* Returns: 1 if found, 0 otherwise.
*/
-
-static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head,
- struct page *page)
+static bool gfs2_jhead_folio_search(struct gfs2_jdesc *jd,
+ struct gfs2_log_header_host *head,
+ struct folio *folio)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_log_header_host lh;
@@ -424,7 +425,8 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
unsigned int offset;
bool ret = false;
- kaddr = kmap_local_page(page);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
+ kaddr = kmap_local_folio(folio, 0);
for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
if (lh.lh_sequence >= head->lh_sequence)
@@ -449,7 +451,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
* Find the folio with 'index' in the journal's mapping. Search the folio for
* the journal head if requested (cleanup == false). Release refs on the
* folio so the page cache can reclaim it. We grabbed a
- * reference on this folio twice, first when we did a grab_cache_page()
+ * reference on this folio twice, first when we did a filemap_grab_folio()
* to obtain the folio to add it to the bio and second when we do a
* filemap_get_folio() here to get the folio to wait on while I/O on it is being
* completed.
@@ -472,9 +474,9 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
*done = true;
if (!*done)
- *done = gfs2_jhead_pg_srch(jd, head, &folio->page);
+ *done = gfs2_jhead_folio_search(jd, head, folio);
- /* filemap_get_folio() and the earlier grab_cache_page() */
+ /* filemap_get_folio() and the earlier filemap_grab_folio() */
folio_put_refs(folio, 2);
}
@@ -494,15 +496,13 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
* gfs2_find_jhead - find the head of a log
* @jd: The journal descriptor
* @head: The log descriptor for the head of the log is returned here
- * @keep_cache: If set inode pages will not be truncated
*
* Do a search of a journal by reading it in large chunks using bios and find
* the valid log entry with the highest sequence number. (i.e. the log head)
*
* Returns: 0 on success, errno otherwise
*/
-int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
- bool keep_cache)
+int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct address_space *mapping = jd->jd_inode->i_mapping;
@@ -512,9 +512,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
unsigned int shift = PAGE_SHIFT - bsize_shift;
unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
struct gfs2_journal_extent *je;
- int sz, ret = 0;
+ int ret = 0;
struct bio *bio = NULL;
- struct page *page = NULL;
+ struct folio *folio = NULL;
bool done = false;
errseq_t since;
@@ -527,10 +527,11 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
u64 dblock = je->dblock;
for (; block < je->lblock + je->blocks; block++, dblock++) {
- if (!page) {
- page = grab_cache_page(mapping, block >> shift);
- if (!page) {
- ret = -ENOMEM;
+ if (!folio) {
+ folio = filemap_grab_folio(mapping,
+ block >> shift);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
done = true;
goto out;
}
@@ -541,8 +542,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
sector_t sector = dblock << sdp->sd_fsb2bb_shift;
if (bio_end_sector(bio) == sector) {
- sz = bio_add_page(bio, page, bsize, off);
- if (sz == bsize)
+ if (bio_add_folio(bio, folio, bsize, off))
goto block_added;
}
if (off) {
@@ -562,12 +562,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
bio->bi_opf = REQ_OP_READ;
add_block_to_new_bio:
- sz = bio_add_page(bio, page, bsize, off);
- BUG_ON(sz != bsize);
+ if (!bio_add_folio(bio, folio, bsize, off))
+ BUG();
block_added:
off += bsize;
- if (off == PAGE_SIZE)
- page = NULL;
+ if (off == folio_size(folio))
+ folio = NULL;
if (blocks_submitted <= blocks_read + max_blocks) {
/* Keep at least one bio in flight */
continue;
@@ -591,8 +591,7 @@ out:
if (!ret)
ret = filemap_check_wb_err(mapping, since);
- if (!keep_cache)
- truncate_inode_pages(mapping, 0);
+ truncate_inode_pages(mapping, 0);
return ret;
}
@@ -615,15 +614,13 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type,
static void gfs2_check_magic(struct buffer_head *bh)
{
- void *kaddr;
__be32 *ptr;
clear_buffer_escaped(bh);
- kaddr = kmap_local_page(bh->b_page);
- ptr = kaddr + bh_offset(bh);
+ ptr = kmap_local_folio(bh->b_folio, bh_offset(bh));
if (*ptr == cpu_to_be32(GFS2_MAGIC))
set_buffer_escaped(bh);
- kunmap_local(kaddr);
+ kunmap_local(ptr);
}
static int blocknr_cmp(void *priv, const struct list_head *a,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 07890c7b145d..be740bf33666 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -20,7 +20,7 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf);
void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh);
int gfs2_find_jhead(struct gfs2_jdesc *jd,
- struct gfs2_log_header_host *head, bool keep_cache);
+ struct gfs2_log_header_host *head);
void gfs2_drain_revokes(struct gfs2_sbd *sdp);
static inline unsigned int buf_limit(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 04cadc02e5a6..0727f60ad028 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -51,7 +51,6 @@ static void gfs2_init_glock_once(void *foo)
{
struct gfs2_glock *gl = foo;
- spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index fea3efcc2f93..7fb11ff71b5a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -103,6 +103,7 @@ const struct address_space_operations gfs2_meta_aops = {
.invalidate_folio = block_invalidate_folio,
.writepages = gfs2_aspace_writepages,
.release_folio = gfs2_release_folio,
+ .migrate_folio = buffer_migrate_folio_norefs,
};
const struct address_space_operations gfs2_rgrp_aops = {
@@ -110,6 +111,7 @@ const struct address_space_operations gfs2_rgrp_aops = {
.invalidate_folio = block_invalidate_folio,
.writepages = gfs2_aspace_writepages,
.release_folio = gfs2_release_folio,
+ .migrate_folio = buffer_migrate_folio_norefs,
};
/**
@@ -132,7 +134,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
unsigned int bufnum;
if (mapping == NULL)
- mapping = &sdp->sd_aspace;
+ mapping = gfs2_aspace(sdp);
shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift;
index = blkno >> shift; /* convert block to page */
@@ -198,15 +200,14 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
static void gfs2_meta_read_endio(struct bio *bio)
{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct buffer_head *bh = page_buffers(page);
- unsigned int len = bvec->bv_len;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct buffer_head *bh = folio_buffers(folio);
+ size_t len = fi.length;
- while (bh_offset(bh) < bvec->bv_offset)
+ while (bh_offset(bh) < fi.offset)
bh = bh->b_this_page;
do {
struct buffer_head *next = bh->b_this_page;
@@ -229,10 +230,10 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num)
struct bio *bio;
bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO);
- bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> SECTOR_SHIFT);
while (num > 0) {
bh = *bhs;
- if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) {
+ if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) {
BUG_ON(bio->bi_iter.bi_size == 0);
break;
}
@@ -444,11 +445,9 @@ void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
int ty;
- if (!ip->i_gl) {
- /* This can only happen during incomplete inode creation. */
- BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ /* This can only happen during incomplete inode creation. */
+ if (!ip->i_gl)
return;
- }
gfs2_ail1_wipe(sdp, bstart, blen);
while (blen) {
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 831d988c2ceb..b7c8a6684d02 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -44,9 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
struct gfs2_glock_aspace *gla =
container_of(mapping, struct gfs2_glock_aspace, mapping);
return gla->glock.gl_name.ln_sbd;
- } else if (mapping->a_ops == &gfs2_rgrp_aops)
- return container_of(mapping, struct gfs2_sbd, sd_aspace);
- else
+ } else
return inode->i_sb->s_fs_info;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index e83d293c3614..efe99b732551 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,15 +64,16 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
void free_sbd(struct gfs2_sbd *sdp)
{
- if (sdp->sd_lkstats)
- free_percpu(sdp->sd_lkstats);
+ struct super_block *sb = sdp->sd_vfs;
+
+ free_percpu(sdp->sd_lkstats);
+ sb->s_fs_info = NULL;
kfree(sdp);
}
static struct gfs2_sbd *init_sbd(struct super_block *sb)
{
struct gfs2_sbd *sdp;
- struct address_space *mapping;
sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
if (!sdp)
@@ -109,16 +110,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_sc_inodes_list);
- mapping = &sdp->sd_aspace;
-
- address_space_init_once(mapping);
- mapping->a_ops = &gfs2_rgrp_aops;
- mapping->host = sb->s_bdev->bd_mapping->host;
- mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->i_private_data = NULL;
- mapping->writeback_index = 0;
-
spin_lock_init(&sdp->sd_log_lock);
atomic_set(&sdp->sd_log_pinned, 0);
INIT_LIST_HEAD(&sdp->sd_log_revokes);
@@ -172,7 +163,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
return -EINVAL;
}
- if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE ||
+ if (sb->sb_bsize < SECTOR_SIZE || sb->sb_bsize > PAGE_SIZE ||
(sb->sb_bsize & (sb->sb_bsize - 1))) {
pr_warn("Invalid block size\n");
return -EINVAL;
@@ -226,28 +217,22 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const struct gfs2_sb *str)
static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
{
- struct super_block *sb = sdp->sd_vfs;
- struct page *page;
- struct bio_vec bvec;
- struct bio bio;
+ struct gfs2_sb *sb;
int err;
- page = alloc_page(GFP_KERNEL);
- if (unlikely(!page))
+ sb = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (unlikely(!sb))
return -ENOMEM;
-
- bio_init(&bio, sb->s_bdev, &bvec, 1, REQ_OP_READ | REQ_META);
- bio.bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
- err = submit_bio_wait(&bio);
+ err = bdev_rw_virt(sdp->sd_vfs->s_bdev,
+ sector << (sdp->sd_vfs->s_blocksize_bits - SECTOR_SHIFT),
+ sb, PAGE_SIZE, REQ_OP_READ | REQ_META);
if (err) {
pr_warn("error %d reading superblock\n", err);
- __free_page(page);
+ kfree(sb);
return err;
}
- gfs2_sb_in(sdp, page_address(page));
- __free_page(page);
+ gfs2_sb_in(sdp, sb);
+ kfree(sb);
return gfs2_check_sb(sdp, silent);
}
@@ -272,7 +257,7 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
return error;
}
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - SECTOR_SHIFT;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_diptrs = (sdp->sd_sb.sb_bsize -
sizeof(struct gfs2_dinode)) / sizeof(u64);
@@ -500,7 +485,9 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE);
goto out;
}
- sb_set_blocksize(sb, sdp->sd_sb.sb_bsize);
+ ret = -EINVAL;
+ if (!sb_set_blocksize(sb, sdp->sd_sb.sb_bsize))
+ goto out;
/* Get the root inode */
no_addr = sdp->sd_sb.sb_root_dir.no_addr;
@@ -1135,6 +1122,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
int silent = fc->sb_flags & SB_SILENT;
struct gfs2_sbd *sdp;
struct gfs2_holder mount_gh;
+ struct address_space *mapping;
int error;
sdp = init_sbd(sb);
@@ -1156,7 +1144,8 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_flags |= SB_NOSEC;
sb->s_magic = GFS2_MAGIC;
sb->s_op = &gfs2_super_ops;
- sb->s_d_op = &gfs2_dops;
+
+ set_default_d_op(sb, &gfs2_dops);
sb->s_export_op = &gfs2_export_ops;
sb->s_qcop = &gfs2_quotactl_ops;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -1166,9 +1155,12 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
/* Set up the buffer cache and fill in some fake block size values
to allow us to read-in the on-disk superblock. */
- sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512);
+ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, SECTOR_SIZE);
+ error = -EINVAL;
+ if (!sdp->sd_sb.sb_bsize)
+ goto fail_free;
sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits;
- sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9;
+ sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - SECTOR_SHIFT;
sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift);
sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit;
@@ -1181,9 +1173,21 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
sdp->sd_tune.gt_statfs_quantum = 30;
}
+ /* Set up an address space for metadata writes */
+ sdp->sd_inode = new_inode(sb);
+ error = -ENOMEM;
+ if (!sdp->sd_inode)
+ goto fail_free;
+ sdp->sd_inode->i_ino = GFS2_BAD_INO;
+ sdp->sd_inode->i_size = OFFSET_MAX;
+
+ mapping = gfs2_aspace(sdp);
+ mapping->a_ops = &gfs2_rgrp_aops;
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
+
error = init_names(sdp, silent);
if (error)
- goto fail_free;
+ goto fail_iput;
snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name);
@@ -1192,7 +1196,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc)
WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0,
sdp->sd_fsname);
if (!sdp->sd_glock_wq)
- goto fail_free;
+ goto fail_iput;
sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s",
WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname);
@@ -1309,9 +1313,10 @@ fail_delete_wq:
fail_glock_wq:
if (sdp->sd_glock_wq)
destroy_workqueue(sdp->sd_glock_wq);
+fail_iput:
+ iput(sdp->sd_inode);
fail_free:
free_sbd(sdp);
- sb->s_fs_info = NULL;
return error;
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 58bc5013ca49..2298e06797ac 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -236,7 +236,7 @@ static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, str
return NULL;
qd->qd_sbd = sdp;
- lockref_init(&qd->qd_lockref, 0);
+ lockref_init(&qd->qd_lockref);
qd->qd_id = qid;
qd->qd_slot = -1;
INIT_LIST_HEAD(&qd->qd_lru);
@@ -297,7 +297,6 @@ static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
spin_lock_bucket(hash);
*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
if (qd == NULL) {
- new_qd->qd_lockref.count++;
*qdp = new_qd;
list_add(&new_qd->qd_list, &sdp->sd_quota_list);
hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
@@ -1450,6 +1449,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
if (qd == NULL)
goto fail_brelse;
+ qd->qd_lockref.count = 0;
set_bit(QDF_CHANGE, &qd->qd_flags);
qd->qd_change = qc_change;
qd->qd_slot = slot;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f4fe7039f725..24250478b085 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -118,6 +118,7 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd)
int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
unsigned int blkno, struct gfs2_log_header_host *head)
{
+ const u32 zero = 0;
u32 hash, crc;
if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
@@ -126,7 +127,7 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
return 1;
hash = crc32(~0, lh, LH_V1_SIZE - 4);
- hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */
+ hash = ~crc32(hash, &zero, 4); /* assume lh_hash is zero */
if (be32_to_cpu(lh->lh_hash) != hash)
return 1;
@@ -263,16 +264,12 @@ static void clean_journal(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- u32 lblock = head->lh_blkno;
- gfs2_replay_incr_blk(jd, &lblock);
- gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock,
+ gfs2_replay_incr_blk(jd, &head->lh_blkno);
+ head->lh_sequence++;
+ gfs2_write_log_header(sdp, jd, head->lh_sequence, 0, head->lh_blkno,
GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY,
REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC);
- if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) {
- sdp->sd_log_flush_head = lblock;
- gfs2_log_incr_head(sdp);
- }
}
@@ -457,7 +454,7 @@ void gfs2_recover_func(struct work_struct *work)
if (error)
goto fail_gunlock_ji;
- error = gfs2_find_jhead(jd, &head, true);
+ error = gfs2_find_jhead(jd, &head);
if (error)
goto fail_gunlock_ji;
t_jhd = ktime_get();
@@ -533,6 +530,9 @@ void gfs2_recover_func(struct work_struct *work)
ktime_ms_delta(t_rep, t_tlck));
}
+ if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
+ gfs2_log_pointers_init(sdp, &head);
+
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
if (jlocked) {
@@ -580,3 +580,13 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
return wait ? jd->jd_recover_error : 0;
}
+void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
+ struct gfs2_log_header_host *head)
+{
+ sdp->sd_log_sequence = head->lh_sequence + 1;
+ gfs2_replay_incr_blk(sdp->sd_jdesc, &head->lh_blkno);
+ sdp->sd_log_tail = head->lh_blkno;
+ sdp->sd_log_flush_head = head->lh_blkno;
+ sdp->sd_log_flush_tail = head->lh_blkno;
+ sdp->sd_log_head = head->lh_blkno;
+}
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 6a0fd42e1120..5a5ba72ecd75 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -29,6 +29,8 @@ void gfs2_recover_func(struct work_struct *work);
int __get_log_header(struct gfs2_sbd *sdp,
const struct gfs2_log_header *lh, unsigned int blkno,
struct gfs2_log_header_host *head);
+void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
+ struct gfs2_log_header_host *head);
#endif /* __RECOVERY_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 92a3b6ddafdc..b42e2110084b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -134,28 +134,18 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
struct gfs2_glock *j_gl = ip->i_gl;
- struct gfs2_log_header_host head;
int error;
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
if (gfs2_withdrawing_or_withdrawn(sdp))
return -EIO;
- error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
- if (error) {
- gfs2_consist(sdp);
- return error;
- }
-
- if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
- gfs2_consist(sdp);
+ if (sdp->sd_log_sequence == 0) {
+ fs_err(sdp, "unknown status of our own journal jid %d",
+ sdp->sd_lockstruct.ls_jid);
return -EIO;
}
- /* Initialize some head of the log stuff */
- sdp->sd_log_sequence = head.lh_sequence + 1;
- gfs2_log_pointers_init(sdp, head.lh_blkno);
-
error = gfs2_quota_init(sdp);
if (!error && gfs2_withdrawing_or_withdrawn(sdp))
error = -EIO;
@@ -370,7 +360,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp)
error = gfs2_jdesc_check(jd);
if (error)
break;
- error = gfs2_find_jhead(jd, &lh, false);
+ error = gfs2_find_jhead(jd, &lh);
if (error)
break;
if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) {
@@ -497,11 +487,9 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
int need_endtrans = 0;
int ret;
- if (unlikely(!ip->i_gl)) {
- /* This can only happen during incomplete inode creation. */
- BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
+ /* This can only happen during incomplete inode creation. */
+ if (unlikely(!ip->i_gl))
return;
- }
if (gfs2_withdrawing_or_withdrawn(sdp))
return;
@@ -648,7 +636,7 @@ restart:
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
gfs2_gl_hash_clear(sdp);
- truncate_inode_pages_final(&sdp->sd_aspace);
+ iput(sdp->sd_inode);
gfs2_delete_debugfs_file(sdp);
gfs2_sys_fs_del(sdp);
@@ -674,7 +662,7 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
return sdp->sd_log_error;
}
-static int gfs2_do_thaw(struct gfs2_sbd *sdp)
+static int gfs2_do_thaw(struct gfs2_sbd *sdp, enum freeze_holder who, const void *freeze_owner)
{
struct super_block *sb = sdp->sd_vfs;
int error;
@@ -682,7 +670,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp)
error = gfs2_freeze_lock_shared(sdp);
if (error)
goto fail;
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ error = thaw_super(sb, who, freeze_owner);
if (!error)
return 0;
@@ -703,14 +691,14 @@ void gfs2_freeze_func(struct work_struct *work)
if (test_bit(SDF_FROZEN, &sdp->sd_flags))
goto freeze_failed;
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
if (error)
goto freeze_failed;
gfs2_freeze_unlock(sdp);
set_bit(SDF_FROZEN, &sdp->sd_flags);
- error = gfs2_do_thaw(sdp);
+ error = gfs2_do_thaw(sdp, FREEZE_HOLDER_USERSPACE, NULL);
if (error)
goto out;
@@ -728,10 +716,13 @@ out:
/**
* gfs2_freeze_super - prevent further writes to the filesystem
* @sb: the VFS structure for the filesystem
+ * @who: freeze flags
+ * @freeze_owner: owner of the freeze
*
*/
-static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
+static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -744,7 +735,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
}
for (;;) {
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ error = freeze_super(sb, who, freeze_owner);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
@@ -758,7 +749,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
break;
}
- error = gfs2_do_thaw(sdp);
+ error = gfs2_do_thaw(sdp, who, freeze_owner);
if (error)
goto out;
@@ -796,10 +787,13 @@ static int gfs2_freeze_fs(struct super_block *sb)
/**
* gfs2_thaw_super - reallow writes to the filesystem
* @sb: the VFS structure for the filesystem
+ * @who: freeze flags
+ * @freeze_owner: owner of the freeze
*
*/
-static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
+static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
int error;
@@ -814,7 +808,7 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
atomic_inc(&sb->s_active);
gfs2_freeze_unlock(sdp);
- error = gfs2_do_thaw(sdp);
+ error = gfs2_do_thaw(sdp, who, freeze_owner);
if (!error) {
clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
@@ -1173,74 +1167,6 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
return 0;
}
-static void gfs2_final_release_pages(struct gfs2_inode *ip)
-{
- struct inode *inode = &ip->i_inode;
- struct gfs2_glock *gl = ip->i_gl;
-
- if (unlikely(!gl)) {
- /* This can only happen during incomplete inode creation. */
- BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags));
- return;
- }
-
- truncate_inode_pages(gfs2_glock2aspace(gl), 0);
- truncate_inode_pages(&inode->i_data, 0);
-
- if (atomic_read(&gl->gl_revokes) == 0) {
- clear_bit(GLF_LFLUSH, &gl->gl_flags);
- clear_bit(GLF_DIRTY, &gl->gl_flags);
- }
-}
-
-static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_rgrpd *rgd;
- struct gfs2_holder gh;
- int error;
-
- if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
- gfs2_consist_inode(ip);
- return -EIO;
- }
-
- gfs2_rindex_update(sdp);
-
- error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
- if (error)
- return error;
-
- rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1);
- if (!rgd) {
- gfs2_consist_inode(ip);
- error = -EIO;
- goto out_qs;
- }
-
- error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
- LM_FLAG_NODE_SCOPE, &gh);
- if (error)
- goto out_qs;
-
- error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA,
- sdp->sd_jdesc->jd_blocks);
- if (error)
- goto out_rg_gunlock;
-
- gfs2_free_di(rgd, ip);
-
- gfs2_final_release_pages(ip);
-
- gfs2_trans_end(sdp);
-
-out_rg_gunlock:
- gfs2_glock_dq_uninit(&gh);
-out_qs:
- gfs2_quota_unhold(ip);
- return error;
-}
-
/**
* gfs2_glock_put_eventually
* @gl: The glock to put
@@ -1326,10 +1252,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode,
struct gfs2_sbd *sdp = sb->s_fs_info;
int ret;
- if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags)))
- goto should_delete;
-
- if (test_bit(GIF_DEFER_DELETE, &ip->i_flags))
+ if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
+ test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags))
return EVICT_SHOULD_DEFER_DELETE;
/* Deletes should never happen under memory pressure anymore. */
@@ -1338,12 +1262,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode,
/* Must not read inode block until block type has been verified */
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh);
- if (unlikely(ret)) {
- glock_clear_object(ip->i_iopen_gh.gh_gl, ip);
- ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&ip->i_iopen_gh);
- return EVICT_SHOULD_DEFER_DELETE;
- }
+ if (unlikely(ret))
+ return EVICT_SHOULD_SKIP_DELETE;
if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino))
return EVICT_SHOULD_SKIP_DELETE;
@@ -1361,17 +1281,9 @@ static enum evict_behavior evict_should_delete(struct inode *inode,
if (inode->i_nlink)
return EVICT_SHOULD_SKIP_DELETE;
-should_delete:
if (gfs2_holder_initialized(&ip->i_iopen_gh) &&
- test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
- enum evict_behavior behavior =
- gfs2_upgrade_iopen_glock(inode);
-
- if (behavior != EVICT_SHOULD_DELETE) {
- gfs2_holder_uninit(&ip->i_iopen_gh);
- return behavior;
- }
- }
+ test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+ return gfs2_upgrade_iopen_glock(inode);
return EVICT_SHOULD_DELETE;
}
@@ -1392,7 +1304,7 @@ static int evict_unlinked_inode(struct inode *inode)
}
if (ip->i_eattr) {
- ret = gfs2_ea_dealloc(ip);
+ ret = gfs2_ea_dealloc(ip, true);
if (ret)
goto out;
}
@@ -1509,7 +1421,7 @@ static void gfs2_evict_inode(struct inode *inode)
gfs2_glock_put(io_gl);
goto out;
}
- behavior = EVICT_SHOULD_DELETE;
+ behavior = EVICT_SHOULD_SKIP_DELETE;
}
if (behavior == EVICT_SHOULD_DELETE)
ret = evict_unlinked_inode(inode);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ecc699f8d9fc..c3c8842920d2 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -174,10 +174,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
switch (n) {
case 0:
- error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
+ error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE, NULL);
break;
case 1:
- error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE);
+ error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE, NULL);
break;
default:
return -EINVAL;
@@ -764,7 +764,6 @@ fail_reg:
fs_err(sdp, "error %d adding sysfs files\n", error);
kobject_put(&sdp->sd_kobj);
wait_for_completion(&sdp->sd_kobj_unregister);
- sb->s_fs_info = NULL;
return error;
}
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 8eae8d62a413..26036ffc3f33 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -53,12 +53,20 @@
{(1UL << GLF_DIRTY), "y" }, \
{(1UL << GLF_LFLUSH), "f" }, \
{(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
+ {(1UL << GLF_PENDING_REPLY), "R" }, \
{(1UL << GLF_HAVE_REPLY), "r" }, \
{(1UL << GLF_INITIAL), "a" }, \
{(1UL << GLF_HAVE_FROZEN_REPLY), "F" }, \
{(1UL << GLF_LRU), "L" }, \
{(1UL << GLF_OBJECT), "o" }, \
- {(1UL << GLF_BLOCKING), "b" })
+ {(1UL << GLF_BLOCKING), "b" }, \
+ {(1UL << GLF_UNLOCKED), "x" }, \
+ {(1UL << GLF_INSTANTIATE_NEEDED), "n" }, \
+ {(1UL << GLF_INSTANTIATE_IN_PROG), "N" }, \
+ {(1UL << GLF_TRY_TO_EVICT), "e" }, \
+ {(1UL << GLF_VERIFY_DELETE), "E" }, \
+ {(1UL << GLF_DEFER_DELETE), "s" }, \
+ {(1UL << GLF_CANCELING), "C" })
#ifndef NUMPTY
#define NUMPTY
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 192213c7359a..075f7e9abe47 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -226,6 +226,27 @@ out:
unlock_buffer(bh);
}
+void gfs2_trans_add_databufs(struct gfs2_glock *gl, struct folio *folio,
+ size_t from, size_t len)
+{
+ struct buffer_head *head = folio_buffers(folio);
+ unsigned int bsize = head->b_size;
+ struct buffer_head *bh;
+ size_t to = from + len;
+ size_t start, end;
+
+ for (bh = head, start = 0; bh != head || !start;
+ bh = bh->b_this_page, start = end) {
+ end = start + bsize;
+ if (end <= from)
+ continue;
+ if (start >= to)
+ break;
+ set_buffer_uptodate(bh);
+ gfs2_trans_add_data(gl, bh);
+ }
+}
+
void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
{
@@ -246,12 +267,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
if (bd == NULL) {
gfs2_log_unlock(sdp);
unlock_buffer(bh);
- lock_page(bh->b_page);
+ folio_lock(bh->b_folio);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh);
else
bd = bh->b_private;
- unlock_page(bh->b_page);
+ folio_unlock(bh->b_folio);
lock_buffer(bh);
gfs2_log_lock(sdp);
}
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index f8ce5302280d..790c55f59e61 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -42,6 +42,8 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
void gfs2_trans_end(struct gfs2_sbd *sdp);
void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh);
+void gfs2_trans_add_databufs(struct gfs2_glock *gl, struct folio *folio,
+ size_t from, size_t len);
void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh);
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 13be8d1d228b..24864a66074b 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -73,7 +73,7 @@ int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
"mount.\n");
goto out_unlock;
}
- error = gfs2_find_jhead(jd, &head, false);
+ error = gfs2_find_jhead(jd, &head);
if (error) {
if (verbose)
fs_err(sdp, "Error parsing journal for spectator "
@@ -232,32 +232,23 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp)
*/
ret = gfs2_glock_nq(&sdp->sd_live_gh);
+ gfs2_glock_put(live_gl); /* drop extra reference we acquired */
+ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+
/*
* If we actually got the "live" lock in EX mode, there are no other
- * nodes available to replay our journal. So we try to replay it
- * ourselves. We hold the "live" glock to prevent other mounters
- * during recovery, then just dequeue it and reacquire it in our
- * normal SH mode. Just in case the problem that caused us to
- * withdraw prevents us from recovering our journal (e.g. io errors
- * and such) we still check if the journal is clean before proceeding
- * but we may wait forever until another mounter does the recovery.
+ * nodes available to replay our journal.
*/
if (ret == 0) {
- fs_warn(sdp, "No other mounters found. Trying to recover our "
- "own journal jid %d.\n", sdp->sd_lockstruct.ls_jid);
- if (gfs2_recover_journal(sdp->sd_jdesc, 1))
- fs_warn(sdp, "Unable to recover our journal jid %d.\n",
- sdp->sd_lockstruct.ls_jid);
- gfs2_glock_dq_wait(&sdp->sd_live_gh);
- gfs2_holder_reinit(LM_ST_SHARED,
- LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
- &sdp->sd_live_gh);
- gfs2_glock_nq(&sdp->sd_live_gh);
+ fs_warn(sdp, "No other mounters found.\n");
+ /*
+ * We are about to release the lockspace. By keeping live_gl
+ * locked here, we ensure that the next mounter coming along
+ * will be a "first" mounter which will perform recovery.
+ */
+ goto skip_recovery;
}
- gfs2_glock_put(live_gl); /* drop extra reference we acquired */
- clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
-
/*
* At this point our journal is evicted, so we need to get a new inode
* for it. Once done, we need to call gfs2_find_jhead which
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 17ae5070a90e..df9c93de94c7 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1383,7 +1383,7 @@ out:
return error;
}
-static int ea_dealloc_block(struct gfs2_inode *ip)
+static int ea_dealloc_block(struct gfs2_inode *ip, bool initialized)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *rgd;
@@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
ip->i_eattr = 0;
gfs2_add_inode_blocks(&ip->i_inode, -1);
- if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ if (initialized) {
error = gfs2_meta_inode_buffer(ip, &dibh);
if (!error) {
gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -1435,11 +1435,12 @@ out_gunlock:
/**
* gfs2_ea_dealloc - deallocate the extended attribute fork
* @ip: the inode
+ * @initialized: xattrs have been initialized
*
* Returns: errno
*/
-int gfs2_ea_dealloc(struct gfs2_inode *ip)
+int gfs2_ea_dealloc(struct gfs2_inode *ip, bool initialized)
{
int error;
@@ -1451,7 +1452,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
if (error)
return error;
- if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+ if (initialized) {
error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
if (error)
goto out_quota;
@@ -1463,7 +1464,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
}
}
- error = ea_dealloc_block(ip);
+ error = ea_dealloc_block(ip, initialized);
out_quota:
gfs2_quota_unhold(ip);
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index eb12eb7e37c1..3c9788e0e137 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -54,7 +54,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
const void *value, size_t size,
int flags, int type);
ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int gfs2_ea_dealloc(struct gfs2_inode *ip);
+int gfs2_ea_dealloc(struct gfs2_inode *ip, bool initialized);
/* Exported to acl.c */
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index ef9498a6e88a..34e9804e0f36 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -16,6 +16,9 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
{
void *ptr;
+ if (!tree || !fd)
+ return -EINVAL;
+
fd->tree = tree;
fd->bnode = NULL;
ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index 6add6ebfef89..e8cd1a31f247 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -15,6 +15,48 @@
#include "btree.h"
+static inline
+bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+{
+ bool is_valid = off < node->tree->node_size;
+
+ if (!is_valid) {
+ pr_err("requested invalid offset: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off);
+ }
+
+ return is_valid;
+}
+
+static inline
+int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+{
+ unsigned int node_size;
+
+ if (!is_bnode_offset_valid(node, off))
+ return 0;
+
+ node_size = node->tree->node_size;
+
+ if ((off + len) > node_size) {
+ int new_len = (int)node_size - off;
+
+ pr_err("requested length has been corrected: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, "
+ "requested_len %d, corrected_len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len, new_len);
+
+ return new_len;
+ }
+
+ return len;
+}
+
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page *page;
@@ -22,6 +64,20 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
int bytes_read;
int bytes_to_read;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagenum = off >> PAGE_SHIFT;
off &= ~PAGE_MASK; /* compute page offset for the first page */
@@ -67,6 +123,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 1;
+ if (key_len > sizeof(hfs_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfs_btree_key));
+ pr_err("hfs: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
@@ -74,6 +136,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page *page;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
page = node->page[0];
@@ -98,6 +174,20 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
{
struct page *page;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
page = node->page[0];
@@ -113,6 +203,10 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(src_node, src, len);
+ len = check_and_correct_requested_length(dst_node, dst, len);
+
src += src_node->page_offset;
dst += dst_node->page_offset;
src_page = src_node->page[0];
@@ -130,6 +224,10 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(node, src, len);
+ len = check_and_correct_requested_length(node, dst, len);
+
src += node->page_offset;
dst += node->page_offset;
page = node->page[0];
@@ -476,6 +574,7 @@ void hfs_bnode_put(struct hfs_bnode *node)
if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
hfs_bnode_unhash(node);
spin_unlock(&tree->hash_lock);
+ hfs_bnode_clear(node, 0, tree->node_size);
hfs_bmap_free(node);
hfs_bnode_free(node);
return;
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 2fa4b1f8cc7f..e86e1e235658 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -21,8 +21,12 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
struct hfs_btree *tree;
struct hfs_btree_header_rec *head;
struct address_space *mapping;
- struct page *page;
+ struct folio *folio;
+ struct buffer_head *bh;
unsigned int size;
+ u16 dblock;
+ sector_t start_block;
+ loff_t offset;
tree = kzalloc(sizeof(*tree), GFP_KERNEL);
if (!tree)
@@ -75,12 +79,40 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
unlock_new_inode(tree->inode);
mapping = tree->inode->i_mapping;
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
+ folio = filemap_grab_folio(mapping, 0);
+ if (IS_ERR(folio))
goto free_inode;
+ folio_zero_range(folio, 0, folio_size(folio));
+
+ dblock = hfs_ext_find_block(HFS_I(tree->inode)->first_extents, 0);
+ start_block = HFS_SB(sb)->fs_start + (dblock * HFS_SB(sb)->fs_div);
+
+ size = folio_size(folio);
+ offset = 0;
+ while (size > 0) {
+ size_t len;
+
+ bh = sb_bread(sb, start_block);
+ if (!bh) {
+ pr_err("unable to read tree header\n");
+ goto put_folio;
+ }
+
+ len = min_t(size_t, folio_size(folio), sb->s_blocksize);
+ memcpy_to_folio(folio, offset, bh->b_data, sb->s_blocksize);
+
+ brelse(bh);
+
+ start_block++;
+ offset += len;
+ size -= len;
+ }
+
+ folio_mark_uptodate(folio);
+
/* Load the header */
- head = (struct hfs_btree_header_rec *)(kmap_local_page(page) +
+ head = (struct hfs_btree_header_rec *)(kmap_local_folio(folio, 0) +
sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
@@ -95,22 +127,22 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
size = tree->node_size;
if (!is_power_of_2(size))
- goto fail_page;
+ goto fail_folio;
if (!tree->node_count)
- goto fail_page;
+ goto fail_folio;
switch (id) {
case HFS_EXT_CNID:
if (tree->max_key_len != HFS_MAX_EXT_KEYLEN) {
pr_err("invalid extent max_key_len %d\n",
tree->max_key_len);
- goto fail_page;
+ goto fail_folio;
}
break;
case HFS_CAT_CNID:
if (tree->max_key_len != HFS_MAX_CAT_KEYLEN) {
pr_err("invalid catalog max_key_len %d\n",
tree->max_key_len);
- goto fail_page;
+ goto fail_folio;
}
break;
default:
@@ -121,12 +153,15 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
kunmap_local(head);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return tree;
-fail_page:
+fail_folio:
kunmap_local(head);
- put_page(page);
+put_folio:
+ folio_unlock(folio);
+ folio_put(folio);
free_inode:
tree->inode->i_mapping->a_ops = &hfs_aops;
iput(tree->inode);
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index b75c26045df4..86a6b317b474 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -219,26 +219,26 @@ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir,
* in a directory, given the inode for the parent directory and the
* name (and its length) of the new directory.
*/
-static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int res;
inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode);
if (res) {
clear_nlink(inode);
hfs_delete_inode(inode);
iput(inode);
- return res;
+ return ERR_PTR(res);
}
d_instantiate(dentry, inode);
mark_inode_dirty(inode);
- return 0;
+ return NULL;
}
/*
diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c
index 4a0ce131e233..580c62981dbd 100644
--- a/fs/hfs/extent.c
+++ b/fs/hfs/extent.c
@@ -71,7 +71,7 @@ int hfs_ext_keycmp(const btree_key *key1, const btree_key *key2)
*
* Find a block within an extent record
*/
-static u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off)
+u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off)
{
int i;
u16 count;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a0c7cb0f79fc..7c5a7ecfa246 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -190,6 +190,7 @@ extern const struct inode_operations hfs_dir_inode_operations;
/* extent.c */
extern int hfs_ext_keycmp(const btree_key *, const btree_key *);
+extern u16 hfs_ext_find_block(struct hfs_extent *ext, u16 off);
extern int hfs_free_fork(struct super_block *, struct hfs_cat_file *, int);
extern int hfs_ext_write_extent(struct inode *);
extern int hfs_extend_file(struct inode *);
@@ -201,7 +202,7 @@ extern int hfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
extern const struct address_space_operations hfs_aops;
extern const struct address_space_operations hfs_btree_aops;
-int hfs_write_begin(struct file *file, struct address_space *mapping,
+int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t);
extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a81ce7a740b9..bf4cb7e78396 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -44,12 +44,12 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-int hfs_write_begin(struct file *file, struct address_space *mapping,
+int hfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
hfs_get_block,
&HFS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -690,8 +690,9 @@ static const struct file_operations hfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
.fsync = hfs_file_fsync,
.open = hfs_file_open,
.release = hfs_file_release,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index fe09c2093a93..388a318297ec 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -365,7 +365,7 @@ static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (!root_inode)
goto bail_no_root;
- sb->s_d_op = &hfs_dentry_operations;
+ set_default_d_op(sb, &hfs_dentry_operations);
res = -ENOMEM;
sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 87974d5e6791..14f4995588ff 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -18,12 +18,68 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
+static inline
+bool is_bnode_offset_valid(struct hfs_bnode *node, int off)
+{
+ bool is_valid = off < node->tree->node_size;
+
+ if (!is_valid) {
+ pr_err("requested invalid offset: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off);
+ }
+
+ return is_valid;
+}
+
+static inline
+int check_and_correct_requested_length(struct hfs_bnode *node, int off, int len)
+{
+ unsigned int node_size;
+
+ if (!is_bnode_offset_valid(node, off))
+ return 0;
+
+ node_size = node->tree->node_size;
+
+ if ((off + len) > node_size) {
+ int new_len = (int)node_size - off;
+
+ pr_err("requested length has been corrected: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, "
+ "requested_len %d, corrected_len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len, new_len);
+
+ return new_len;
+ }
+
+ return len;
+}
+
/* Copy a specified range of bytes from the raw data of a node */
void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
{
struct page **pagep;
int l;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
off &= ~PAGE_MASK;
@@ -67,6 +123,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off)
else
key_len = tree->max_key_len + 2;
+ if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) {
+ memset(key, 0, sizeof(hfsplus_btree_key));
+ pr_err("hfsplus: Invalid key length: %d\n", key_len);
+ return;
+ }
+
hfs_bnode_read(node, key, off, key_len);
}
@@ -75,6 +137,20 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
struct page **pagep;
int l;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
off &= ~PAGE_MASK;
@@ -103,6 +179,20 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
struct page **pagep;
int l;
+ if (!is_bnode_offset_valid(node, off))
+ return;
+
+ if (len == 0) {
+ pr_err("requested zero length: "
+ "NODE: id %u, type %#x, height %u, "
+ "node_size %u, offset %d, len %d\n",
+ node->this, node->type, node->height,
+ node->tree->node_size, off, len);
+ return;
+ }
+
+ len = check_and_correct_requested_length(node, off, len);
+
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
off &= ~PAGE_MASK;
@@ -127,6 +217,10 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(src_node, src, len);
+ len = check_and_correct_requested_length(dst_node, dst, len);
+
src += src_node->page_offset;
dst += dst_node->page_offset;
src_page = src_node->page + (src >> PAGE_SHIFT);
@@ -181,6 +275,10 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
hfs_dbg(BNODE_MOD, "movebytes: %u,%u,%u\n", dst, src, len);
if (!len)
return;
+
+ len = check_and_correct_requested_length(node, src, len);
+ len = check_and_correct_requested_length(node, dst, len);
+
src += node->page_offset;
dst += node->page_offset;
if (dst > src) {
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index f5c4b3e31a1c..876bbb80fb4d 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -523,10 +523,10 @@ static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir,
return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
}
-static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
+ return ERR_PTR(hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0));
}
static int hfsplus_rename(struct mnt_idmap *idmap,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index a6d61685ae79..b1699b3c246a 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -342,9 +342,6 @@ static int hfsplus_free_extents(struct super_block *sb,
int i;
int err = 0;
- /* Mapping the allocation file may lock the extent tree */
- WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock));
-
hfsplus_dump_extent(extent);
for (i = 0; i < 8; extent++, i++) {
count = be32_to_cpu(extent->block_count);
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2f089bff0095..96a5c24813dd 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -473,8 +473,10 @@ extern const struct address_space_operations hfsplus_aops;
extern const struct address_space_operations hfsplus_btree_aops;
extern const struct dentry_operations hfsplus_dentry_operations;
-int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata);
+int hfsplus_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, struct folio **foliop,
+ void **fsdata);
struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode);
void hfsplus_delete_inode(struct inode *inode);
@@ -489,9 +491,9 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path,
unsigned int query_flags);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
-int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int hfsplus_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
/* ioctl.c */
long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f331e9574217..b51a411ecd23 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -38,12 +38,14 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
}
}
-int hfsplus_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
+int hfsplus_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
hfsplus_get_block,
&HFSPLUS_I(mapping->host)->phys_size);
if (unlikely(ret))
@@ -366,8 +368,9 @@ static const struct file_operations hfsplus_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
.fsync = hfsplus_file_fsync,
.open = hfsplus_file_open,
.release = hfsplus_file_release,
@@ -654,7 +657,7 @@ out:
return res;
}
-int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int hfsplus_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
@@ -673,7 +676,7 @@ int hfsplus_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int hfsplus_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 948b8aaee33e..86351bdc8985 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -222,8 +222,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + HFSPLUS_VOLHEAD_SECTOR,
- sbi->s_vhdr_buf, NULL, REQ_OP_WRITE |
- REQ_SYNC);
+ sbi->s_vhdr_buf, NULL, REQ_OP_WRITE);
if (!error)
error = error2;
if (!write_backup)
@@ -231,8 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait)
error2 = hfsplus_submit_bio(sb,
sbi->part_start + sbi->sect_count - 2,
- sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE |
- REQ_SYNC);
+ sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE);
if (!error)
error2 = error;
out:
@@ -508,7 +506,7 @@ static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
goto out_put_alloc_file;
}
- sb->s_d_op = &hfsplus_dentry_operations;
+ set_default_d_op(sb, &hfsplus_dentry_operations);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
err = -ENOMEM;
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 73342c925a4b..36b6cf2a3abb 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -132,7 +132,14 @@ int hfsplus_uni2asc(struct super_block *sb,
op = astr;
ip = ustr->unicode;
+
ustrlen = be16_to_cpu(ustr->length);
+ if (ustrlen > HFSPLUS_MAX_STRLEN) {
+ ustrlen = HFSPLUS_MAX_STRLEN;
+ pr_err("invalid length %u has been corrected to %d\n",
+ be16_to_cpu(ustr->length), ustrlen);
+ }
+
len = *len_p;
ce1 = NULL;
compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 74801911bc1c..30cf4fe78b3d 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -48,47 +48,19 @@ struct hfsplus_wd {
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
void *buf, void **data, blk_opf_t opf)
{
- const enum req_op op = opf & REQ_OP_MASK;
- struct bio *bio;
- int ret = 0;
- u64 io_size;
- loff_t start;
- int offset;
+ u64 io_size = hfsplus_min_io_size(sb);
+ loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
+ int offset = start & (io_size - 1);
+
+ if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data)
+ *data = (u8 *)buf + offset;
/*
- * Align sector to hardware sector size and find offset. We
- * assume that io_size is a power of two, which _should_
- * be true.
+ * Align sector to hardware sector size and find offset. We assume that
+ * io_size is a power of two, which _should_ be true.
*/
- io_size = hfsplus_min_io_size(sb);
- start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT;
- offset = start & (io_size - 1);
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
-
- bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO);
- bio->bi_iter.bi_sector = sector;
-
- if (op != REQ_OP_WRITE && data)
- *data = (u8 *)buf + offset;
-
- while (io_size > 0) {
- unsigned int page_offset = offset_in_page(buf);
- unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset,
- io_size);
-
- ret = bio_add_page(bio, virt_to_page(buf), len, page_offset);
- if (ret != len) {
- ret = -EIO;
- goto out;
- }
- io_size -= len;
- buf = (u8 *)buf + len;
- }
-
- ret = submit_bio_wait(bio);
-out:
- bio_put(bio);
- return ret < 0 ? ret : 0;
+ return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf);
}
static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd)
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 9a1a93e3888b..18dc3d254d21 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -172,7 +172,11 @@ check_attr_tree_state_again:
return PTR_ERR(attr_file);
}
- BUG_ON(i_size_read(attr_file) != 0);
+ if (i_size_read(attr_file) != 0) {
+ err = -EIO;
+ pr_err("detected inconsistent attributes file, running fsck.hfsplus is recommended.\n");
+ goto end_attr_file_creation;
+ }
hip = HFSPLUS_I(attr_file);
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 8b39c15c408c..15b2f094d36e 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -60,7 +60,7 @@ struct hostfs_stat {
unsigned int uid;
unsigned int gid;
unsigned long long size;
- struct hostfs_timespec atime, mtime, ctime;
+ struct hostfs_timespec atime, mtime, ctime, btime;
unsigned int blksize;
unsigned long long blocks;
struct {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 7e51d2cec64b..01e516175bcd 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -33,6 +33,7 @@ struct hostfs_inode_info {
struct inode vfs_inode;
struct mutex open_mutex;
dev_t dev;
+ struct hostfs_timespec btime;
};
static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
@@ -95,32 +96,17 @@ __uml_setup("hostfs=", hostfs_args,
static char *__dentry_name(struct dentry *dentry, char *name)
{
char *p = dentry_path_raw(dentry, name, PATH_MAX);
- char *root;
- size_t len;
- struct hostfs_fs_info *fsi;
+ struct hostfs_fs_info *fsi = dentry->d_sb->s_fs_info;
+ char *root = fsi->host_root_path;
+ size_t len = strlen(root);
- fsi = dentry->d_sb->s_fs_info;
- root = fsi->host_root_path;
- len = strlen(root);
- if (IS_ERR(p)) {
+ if (IS_ERR(p) || len > p - name) {
__putname(name);
return NULL;
}
- /*
- * This function relies on the fact that dentry_path_raw() will place
- * the path name at the end of the provided buffer.
- */
- BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);
-
- strscpy(name, root, PATH_MAX);
- if (len > p - name) {
- __putname(name);
- return NULL;
- }
-
- if (p > name + len)
- strcpy(name + len, p);
+ memcpy(name, root, len);
+ memmove(name + len, p, name + PATH_MAX - p);
return name;
}
@@ -396,7 +382,7 @@ static const struct file_operations hostfs_file_fops = {
.splice_write = iter_file_splice_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = hostfs_open,
.release = hostfs_file_release,
.fsync = hostfs_fsync,
@@ -410,38 +396,33 @@ static const struct file_operations hostfs_dir_fops = {
.fsync = hostfs_fsync,
};
-static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
+static int hostfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- char *buffer;
- loff_t base = page_offset(page);
- int count = PAGE_SIZE;
- int end_index = inode->i_size >> PAGE_SHIFT;
- int err;
-
- if (page->index >= end_index)
- count = inode->i_size & (PAGE_SIZE-1);
-
- buffer = kmap_local_page(page);
-
- err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
- if (err != count) {
- if (err >= 0)
- err = -EIO;
- mapping_set_error(mapping, err);
- goto out;
+ struct folio *folio = NULL;
+ loff_t i_size = i_size_read(inode);
+ int err = 0;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
+ loff_t pos = folio_pos(folio);
+ size_t count = folio_size(folio);
+ char *buffer;
+ int ret;
+
+ if (count > i_size - pos)
+ count = i_size - pos;
+
+ buffer = kmap_local_folio(folio, 0);
+ ret = write_file(HOSTFS_I(inode)->fd, &pos, buffer, count);
+ kunmap_local(buffer);
+ folio_unlock(folio);
+ if (ret != count) {
+ err = ret < 0 ? ret : -EIO;
+ mapping_set_error(mapping, err);
+ }
}
- if (base > inode->i_size)
- inode->i_size = base;
-
- err = 0;
-
- out:
- kunmap_local(buffer);
- unlock_page(page);
-
return err;
}
@@ -464,7 +445,8 @@ static int hostfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static int hostfs_write_begin(struct file *file, struct address_space *mapping,
+static int hostfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -477,7 +459,8 @@ static int hostfs_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
-static int hostfs_write_end(struct file *file, struct address_space *mapping,
+static int hostfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
@@ -487,7 +470,7 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
int err;
buffer = kmap_local_folio(folio, from);
- err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer, copied);
+ err = write_file(FILE_HOSTFS_I(iocb->ki_filp)->fd, &pos, buffer, copied);
kunmap_local(buffer);
if (!folio_test_uptodate(folio) && err == folio_size(folio))
@@ -506,11 +489,12 @@ static int hostfs_write_end(struct file *file, struct address_space *mapping,
}
static const struct address_space_operations hostfs_aops = {
- .writepage = hostfs_writepage,
+ .writepages = hostfs_writepages,
.read_folio = hostfs_read_folio,
.dirty_folio = filemap_dirty_folio,
.write_begin = hostfs_write_begin,
.write_end = hostfs_write_end,
+ .migrate_folio = filemap_migrate_folio,
};
static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st)
@@ -566,6 +550,7 @@ static int hostfs_inode_set(struct inode *ino, void *data)
}
HOSTFS_I(ino)->dev = dev;
+ HOSTFS_I(ino)->btime = st->btime;
ino->i_ino = st->ino;
ino->i_mode = st->mode;
return hostfs_inode_update(ino, st);
@@ -576,7 +561,10 @@ static int hostfs_inode_test(struct inode *inode, void *data)
const struct hostfs_stat *st = data;
dev_t dev = MKDEV(st->dev.maj, st->dev.min);
- return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev;
+ return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev &&
+ (inode->i_mode & S_IFMT) == (st->mode & S_IFMT) &&
+ HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec &&
+ HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec;
}
static struct inode *hostfs_iget(struct super_block *sb, char *name)
@@ -698,17 +686,25 @@ static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino,
return err;
}
-static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino,
+ struct dentry *dentry, umode_t mode)
{
+ struct inode *inode;
char *file;
int err;
if ((file = dentry_name(dentry)) == NULL)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
err = do_mkdir(file, mode);
+ if (err) {
+ dentry = ERR_PTR(err);
+ } else {
+ inode = hostfs_iget(dentry->d_sb, file);
+ d_drop(dentry);
+ dentry = d_splice_alias(inode, dentry);
+ }
__putname(file);
- return err;
+ return dentry;
}
static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
@@ -939,7 +935,7 @@ static int hostfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = 10;
sb->s_magic = HOSTFS_SUPER_MAGIC;
sb->s_op = &hostfs_sbops;
- sb->s_d_op = &simple_dentry_operations;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_maxbytes = MAX_LFS_FILESIZE;
err = super_setup_bdi(sb);
if (err)
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 97e9c40a9448..3bcd9f35e70b 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -18,39 +18,48 @@
#include "hostfs.h"
#include <utime.h>
-static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p)
+static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p)
{
- p->ino = buf->st_ino;
- p->mode = buf->st_mode;
- p->nlink = buf->st_nlink;
- p->uid = buf->st_uid;
- p->gid = buf->st_gid;
- p->size = buf->st_size;
- p->atime.tv_sec = buf->st_atime;
- p->atime.tv_nsec = 0;
- p->ctime.tv_sec = buf->st_ctime;
- p->ctime.tv_nsec = 0;
- p->mtime.tv_sec = buf->st_mtime;
- p->mtime.tv_nsec = 0;
- p->blksize = buf->st_blksize;
- p->blocks = buf->st_blocks;
- p->rdev.maj = os_major(buf->st_rdev);
- p->rdev.min = os_minor(buf->st_rdev);
- p->dev.maj = os_major(buf->st_dev);
- p->dev.min = os_minor(buf->st_dev);
+ p->ino = buf->stx_ino;
+ p->mode = buf->stx_mode;
+ p->nlink = buf->stx_nlink;
+ p->uid = buf->stx_uid;
+ p->gid = buf->stx_gid;
+ p->size = buf->stx_size;
+ p->atime.tv_sec = buf->stx_atime.tv_sec;
+ p->atime.tv_nsec = buf->stx_atime.tv_nsec;
+ p->ctime.tv_sec = buf->stx_ctime.tv_sec;
+ p->ctime.tv_nsec = buf->stx_ctime.tv_nsec;
+ p->mtime.tv_sec = buf->stx_mtime.tv_sec;
+ p->mtime.tv_nsec = buf->stx_mtime.tv_nsec;
+ if (buf->stx_mask & STATX_BTIME) {
+ p->btime.tv_sec = buf->stx_btime.tv_sec;
+ p->btime.tv_nsec = buf->stx_btime.tv_nsec;
+ } else {
+ memset(&p->btime, 0, sizeof(p->btime));
+ }
+ p->blksize = buf->stx_blksize;
+ p->blocks = buf->stx_blocks;
+ p->rdev.maj = buf->stx_rdev_major;
+ p->rdev.min = buf->stx_rdev_minor;
+ p->dev.maj = buf->stx_dev_major;
+ p->dev.min = buf->stx_dev_minor;
}
int stat_file(const char *path, struct hostfs_stat *p, int fd)
{
- struct stat64 buf;
+ struct statx buf;
+ int flags = AT_SYMLINK_NOFOLLOW;
if (fd >= 0) {
- if (fstat64(fd, &buf) < 0)
- return -errno;
- } else if (lstat64(path, &buf) < 0) {
- return -errno;
+ flags |= AT_EMPTY_PATH;
+ path = "";
}
- stat64_to_hostfs(&buf, p);
+
+ if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0)
+ return -errno;
+
+ statx_to_hostfs(&buf, p);
return 0;
}
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 449a3fc1b8d9..263b5bbe1849 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -188,13 +188,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
hpfs_unlock(inode->i_sb);
}
-static int hpfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int hpfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
- ret = cont_write_begin(file, mapping, pos, len, foliop, fsdata,
+ ret = cont_write_begin(iocb, mapping, pos, len, foliop, fsdata,
hpfs_get_block,
&hpfs_i(mapping->host)->mmu_private);
if (unlikely(ret))
@@ -203,13 +204,14 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int hpfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int hpfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
int err;
- err = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (err < len)
hpfs_write_failed(mapping, pos + len);
if (!(err < 0)) {
@@ -255,7 +257,7 @@ const struct file_operations hpfs_file_ops =
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.release = hpfs_file_release,
.fsync = hpfs_file_fsync,
.splice_read = filemap_splice_read,
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index d0edf9ed33b6..e3cdc421dfba 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -19,8 +19,8 @@ static void hpfs_update_directory_times(struct inode *dir)
hpfs_write_inode_nolock(dir);
}
-static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
const unsigned char *name = dentry->d_name.name;
unsigned len = dentry->d_name.len;
@@ -35,7 +35,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
int r;
struct hpfs_dirent dee;
int err;
- if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
+ if ((err = hpfs_chk_name(name, &len))) return ERR_PTR(err==-ENOENT ? -EINVAL : err);
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
@@ -112,7 +112,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
hpfs_update_directory_times(dir);
d_instantiate(dentry, result);
hpfs_unlock(dir->i_sb);
- return 0;
+ return NULL;
bail3:
iput(result);
bail2:
@@ -123,7 +123,7 @@ bail1:
hpfs_free_sectors(dir->i_sb, fno, 1);
bail:
hpfs_unlock(dir->i_sb);
- return err;
+ return ERR_PTR(err);
}
static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 27567920abe4..42b779b4d87f 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -554,7 +554,7 @@ static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
/* Fill superblock stuff */
s->s_magic = HPFS_SUPER_MAGIC;
s->s_op = &hpfs_sops;
- s->s_d_op = &hpfs_dentry_operations;
+ set_default_d_op(s, &hpfs_dentry_operations);
s->s_time_min = local_to_gmt(s, 0);
s->s_time_max = local_to_gmt(s, U32_MAX);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 0fc179a59830..09d4baef29cf 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -150,10 +150,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (inode->i_flags & S_PRIVATE)
vm_flags |= VM_NORESERVE;
- if (!hugetlb_reserve_pages(inode,
+ if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
- vm_flags))
+ vm_flags) < 0)
goto out;
ret = 0;
@@ -179,12 +179,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (len & ~huge_page_mask(h))
return -EINVAL;
- if (flags & MAP_FIXED) {
- if (addr & ~huge_page_mask(h))
- return -EINVAL;
- if (prepare_hugepage_range(file, addr, len))
- return -EINVAL;
- }
+ if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h)))
+ return -EINVAL;
if (addr)
addr0 = ALIGN(addr, huge_page_size(h));
@@ -193,19 +189,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
/*
- * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset.
+ * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset.
* Returns the maximum number of bytes one can read without touching the 1st raw
- * HWPOISON subpage.
+ * HWPOISON page.
*
* The implementation borrows the iteration logic from copy_page_to_iter*.
*/
-static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes)
+static size_t adjust_range_hwpoison(struct folio *folio, size_t offset,
+ size_t bytes)
{
+ struct page *page;
size_t n = 0;
size_t res = 0;
- /* First subpage to start the loop. */
- page = nth_page(page, offset / PAGE_SIZE);
+ /* First page to start the loop. */
+ page = folio_page(folio, offset / PAGE_SIZE);
offset %= PAGE_SIZE;
while (1) {
if (is_raw_hwpoison_page_in_hugepage(page))
@@ -278,10 +276,10 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
else {
/*
* Adjust how many bytes safe to read without
- * touching the 1st raw HWPOISON subpage after
+ * touching the 1st raw HWPOISON page after
* offset.
*/
- want = adjust_range_hwpoison(&folio->page, offset, nr);
+ want = adjust_range_hwpoison(folio, offset, nr);
if (want == 0) {
folio_put(folio);
retval = -EIO;
@@ -309,7 +307,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval;
}
-static int hugetlbfs_write_begin(struct file *file,
+static int hugetlbfs_write_begin(const struct kiocb *iocb,
struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -317,9 +315,10 @@ static int hugetlbfs_write_begin(struct file *file,
return -EINVAL;
}
-static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int hugetlbfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
BUG();
return -EINVAL;
@@ -338,8 +337,8 @@ static void hugetlb_delete_from_page_cache(struct folio *folio)
* mutex for the page in the mapping. So, we can not race with page being
* faulted into the vma.
*/
-static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
- unsigned long addr, struct page *page)
+static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long pfn)
{
pte_t *ptep, pte;
@@ -351,7 +350,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
if (huge_pte_none(pte) || !pte_present(pte))
return false;
- if (pte_page(pte) == page)
+ if (pte_pfn(pte) == pfn)
return true;
return false;
@@ -396,7 +395,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h,
{
struct rb_root_cached *root = &mapping->i_mmap;
struct hugetlb_vma_lock *vma_lock;
- struct page *page = &folio->page;
+ unsigned long pfn = folio_pfn(folio);
struct vm_area_struct *vma;
unsigned long v_start;
unsigned long v_end;
@@ -412,7 +411,7 @@ retry:
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- if (!hugetlb_vma_maps_page(vma, v_start, page))
+ if (!hugetlb_vma_maps_pfn(vma, v_start, pfn))
continue;
if (!hugetlb_vma_trylock_write(vma)) {
@@ -462,7 +461,7 @@ retry:
*/
v_start = vma_offset_start(vma, start);
v_end = vma_offset_end(vma, end);
- if (hugetlb_vma_maps_page(vma, v_start, page))
+ if (hugetlb_vma_maps_pfn(vma, v_start, pfn))
unmap_hugepage_range(vma, v_start, v_end, NULL,
ZAP_FLAG_DROP_MARKER);
@@ -991,14 +990,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
return 0;
}
-static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int retval = hugetlbfs_mknod(idmap, dir, dentry,
mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
static int hugetlbfs_create(struct mnt_idmap *idmap,
@@ -1431,6 +1430,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
/*
@@ -1559,9 +1559,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode->i_size = size;
clear_nlink(inode);
- if (!hugetlb_reserve_pages(inode, 0,
+ if (hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
- acctflag))
+ acctflag) < 0)
file = ERR_PTR(-ENOMEM);
else
file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
@@ -1585,7 +1585,7 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
} else {
struct hugetlbfs_fs_context *ctx = fc->fs_private;
ctx->hstate = h;
- mnt = fc_mount(fc);
+ mnt = fc_mount_longterm(fc);
put_fs_context(fc);
}
if (IS_ERR(mnt))
diff --git a/fs/init.c b/fs/init.c
index e9387b6c4f30..eef5124885e3 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode)
return PTR_ERR(dentry);
mode = mode_strip_umask(d_inode(path.dentry), mode);
error = security_path_mkdir(&path, dentry, mode);
- if (!error)
- error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ if (!error) {
+ dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
+ }
done_path_create(&path, dentry);
return error;
}
diff --git a/fs/inode.c b/fs/inode.c
index 5587aabdaa5e..01ebdc40021e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head)
free_inode_nonrcu(inode);
}
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * alloc_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ * Inode wont be chained in superblock s_inodes list
+ * This means :
+ * - fs can't be unmount
+ * - quotas, fsnotify, writeback can't work
+ */
+struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
@@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_list_lock);
- list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ struct super_block *sb = inode->i_sb;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_add(&inode->i_sb_list, &sb->s_inodes);
+ spin_unlock(&sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
+ struct super_block *sb = inode->i_sb;
+
if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode->i_sb->s_inode_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
}
}
@@ -806,23 +820,16 @@ static void evict(struct inode *inode)
/*
* Wake up waiters in __wait_on_freeing_inode().
*
- * Lockless hash lookup may end up finding the inode before we removed
- * it above, but only lock it *after* we are done with the wakeup below.
- * In this case the potential waiter cannot safely block.
+ * It is an invariant that any thread we need to wake up is already
+ * accounted for before remove_inode_hash() acquires ->i_lock -- both
+ * sides take the lock and sleep is aborted if the inode is found
+ * unhashed. Thus either the sleeper wins and goes off CPU, or removal
+ * wins and the sleeper aborts after testing with the lock.
*
- * The inode being unhashed after the call to remove_inode_hash() is
- * used as an indicator whether blocking on it is safe.
+ * This also means we don't need any fences for the call below.
*/
- spin_lock(&inode->i_lock);
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
- spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
@@ -858,12 +865,12 @@ static void dispose_list(struct list_head *head)
*/
void evict_inodes(struct super_block *sb)
{
- struct inode *inode, *next;
+ struct inode *inode;
LIST_HEAD(dispose);
again:
spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
if (atomic_read(&inode->i_count))
continue;
@@ -900,46 +907,6 @@ again:
}
EXPORT_SYMBOL_GPL(evict_inodes);
-/**
- * invalidate_inodes - attempt to free all inodes on a superblock
- * @sb: superblock to operate on
- *
- * Attempts to free all inodes (including dirty inodes) for a given superblock.
- */
-void invalidate_inodes(struct super_block *sb)
-{
- struct inode *inode, *next;
- LIST_HEAD(dispose);
-
-again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- inode->i_state |= I_FREEING;
- inode_lru_list_del(inode);
- spin_unlock(&inode->i_lock);
- list_add(&inode->i_lru, &dispose);
- if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
- dispose_list(&dispose);
- goto again;
- }
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- dispose_list(&dispose);
-}
-
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
@@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void)
EXPORT_SYMBOL(get_next_ino);
/**
- * new_inode_pseudo - obtain an inode
- * @sb: superblock
- *
- * Allocates a new inode for given superblock.
- * Inode wont be chained in superblock s_inodes list
- * This means :
- * - fs can't be unmount
- * - quotas, fsnotify, writeback can't work
- */
-struct inode *new_inode_pseudo(struct super_block *sb)
-{
- return alloc_inode(sb);
-}
-
-/**
* new_inode - obtain an inode
* @sb: superblock
*
@@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- inode = new_inode_pseudo(sb);
+ inode = alloc_inode(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
@@ -1206,9 +1158,8 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
/* Set new key only if filesystem hasn't already changed it */
if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
/*
- * ensure nobody is actually holding i_mutex
+ * ensure nobody is actually holding i_rwsem
*/
- // mutex_destroy(&inode->i_mutex);
init_rwsem(&inode->i_rwsem);
lockdep_set_class(&inode->i_rwsem,
&type->i_mutex_dir_key);
@@ -1348,8 +1299,8 @@ again:
}
if (set && unlikely(set(inode, data))) {
- inode = NULL;
- goto unlock;
+ spin_unlock(&inode_hash_lock);
+ return NULL;
}
/*
@@ -1361,14 +1312,14 @@ again:
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_hash_lock);
+
/*
* Add inode to the sb list if it's not already. It has I_NEW at this
* point, so it should be safe to test i_sb_list locklessly.
*/
if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
-unlock:
- spin_unlock(&inode_hash_lock);
return inode;
}
@@ -1497,8 +1448,8 @@ again:
inode->i_state = I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- inode_sb_list_add(inode);
spin_unlock(&inode_hash_lock);
+ inode_sb_list_add(inode);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -2663,7 +2614,7 @@ EXPORT_SYMBOL(inode_dio_finished);
* proceed with a truncate or equivalent operation.
*
* Must be called under a lock that serializes taking new references
- * to i_dio_count, usually by inode->i_mutex.
+ * to i_dio_count, usually by inode->i_rwsem.
*/
void inode_dio_wait(struct inode *inode)
{
@@ -2681,7 +2632,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible);
/*
* inode_set_flags - atomically set some inode flags
*
- * Note: the caller should be holding i_mutex, or else be sure that
+ * Note: the caller should be holding i_rwsem exclusively, or else be sure that
* they have exclusive access to the inode structure (i.e., while the
* inode is being instantiated). The reason for the cmpxchg() loop
* --- which wouldn't be necessary if all code paths which modify
@@ -2689,7 +2640,7 @@ EXPORT_SYMBOL(inode_dio_wait_interruptible);
* code path which doesn't today so we use cmpxchg() out of an abundance
* of caution.
*
- * In the long run, i_mutex is overkill, and we should probably look
+ * In the long run, i_rwsem is overkill, and we should probably look
* at using the i_lock spinlock to protect i_flags, and then make sure
* it is so documented in include/linux/fs.h and that all code follows
* the locking convention!!
@@ -2953,3 +2904,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap,
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
+
+#ifdef CONFIG_DEBUG_VFS
+/*
+ * Dump an inode.
+ *
+ * TODO: add a proper inode dumping routine, this is a stub to get debug off the
+ * ground.
+ */
+void dump_inode(struct inode *inode, const char *reason)
+{
+ pr_warn("%s encountered for inode %px", reason, inode);
+}
+
+EXPORT_SYMBOL(dump_inode);
+#endif
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..38e8aab27bbd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -66,6 +66,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd,
int vfs_tmpfile(struct mnt_idmap *idmap,
const struct path *parentpath,
struct file *file, umode_t mode);
+struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
/*
* namespace.c
@@ -100,6 +101,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void backing_file_set_user_path(struct file *f, const struct path *path);
static inline void file_put_write_access(struct file *file)
{
@@ -118,6 +120,9 @@ static inline void put_file_access(struct file *file)
}
}
+void fput_close_sync(struct file *);
+void fput_close(struct file *);
+
/*
* super.c
*/
@@ -187,8 +192,8 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
-long do_ftruncate(struct file *file, loff_t length, int small);
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_ftruncate(struct file *file, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int flag);
@@ -207,7 +212,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
@@ -319,12 +323,16 @@ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
struct stashed_operations {
+ struct dentry *(*stash_dentry)(struct dentry **stashed,
+ struct dentry *dentry);
void (*put_data)(void *data);
int (*init_inode)(struct inode *inode, void *data);
};
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry);
+struct dentry *stashed_dentry_get(struct dentry **stashed);
/**
* path_mounted - check whether path is mounted
* @path: path to check
@@ -338,3 +346,12 @@ static inline bool path_mounted(const struct path *path)
return path->mnt->mnt_root == path->dentry;
}
void file_f_owner_release(struct file *file);
+bool file_seek_cur_needs_f_lock(struct file *file);
+int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);
+struct dentry *find_next_child(struct dentry *parent, struct dentry *prev);
+int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
+int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr);
+void pidfs_get_root(struct path *path);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 638a36be31c1..0248cb8db2d3 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
return error;
}
-static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
CLASS(fd, src_file)(srcfd);
loff_t cloned;
@@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
return ret;
}
-static long ioctl_file_clone_range(struct file *file,
- struct file_clone_range __user *argp)
+static int ioctl_file_clone_range(struct file *file,
+ struct file_clone_range __user *argp)
{
struct file_clone_range args;
@@ -396,8 +396,8 @@ static int ioctl_fsfreeze(struct file *filp)
/* Freeze */
if (sb->s_op->freeze_super)
- return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
+ return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
+ return freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int ioctl_fsthaw(struct file *filp)
@@ -409,8 +409,8 @@ static int ioctl_fsthaw(struct file *filp)
/* Thaw */
if (sb->s_op->thaw_super)
- return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- return thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
+ return thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int ioctl_file_dedupe_range(struct file *file,
@@ -453,315 +453,6 @@ out:
return ret;
}
-/**
- * fileattr_fill_xflags - initialize fileattr with xflags
- * @fa: fileattr pointer
- * @xflags: FS_XFLAG_* flags
- *
- * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags). All
- * other fields are zeroed.
- */
-void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
-{
- memset(fa, 0, sizeof(*fa));
- fa->fsx_valid = true;
- fa->fsx_xflags = xflags;
- if (fa->fsx_xflags & FS_XFLAG_IMMUTABLE)
- fa->flags |= FS_IMMUTABLE_FL;
- if (fa->fsx_xflags & FS_XFLAG_APPEND)
- fa->flags |= FS_APPEND_FL;
- if (fa->fsx_xflags & FS_XFLAG_SYNC)
- fa->flags |= FS_SYNC_FL;
- if (fa->fsx_xflags & FS_XFLAG_NOATIME)
- fa->flags |= FS_NOATIME_FL;
- if (fa->fsx_xflags & FS_XFLAG_NODUMP)
- fa->flags |= FS_NODUMP_FL;
- if (fa->fsx_xflags & FS_XFLAG_DAX)
- fa->flags |= FS_DAX_FL;
- if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
- fa->flags |= FS_PROJINHERIT_FL;
-}
-EXPORT_SYMBOL(fileattr_fill_xflags);
-
-/**
- * fileattr_fill_flags - initialize fileattr with flags
- * @fa: fileattr pointer
- * @flags: FS_*_FL flags
- *
- * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
- * All other fields are zeroed.
- */
-void fileattr_fill_flags(struct fileattr *fa, u32 flags)
-{
- memset(fa, 0, sizeof(*fa));
- fa->flags_valid = true;
- fa->flags = flags;
- if (fa->flags & FS_SYNC_FL)
- fa->fsx_xflags |= FS_XFLAG_SYNC;
- if (fa->flags & FS_IMMUTABLE_FL)
- fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
- if (fa->flags & FS_APPEND_FL)
- fa->fsx_xflags |= FS_XFLAG_APPEND;
- if (fa->flags & FS_NODUMP_FL)
- fa->fsx_xflags |= FS_XFLAG_NODUMP;
- if (fa->flags & FS_NOATIME_FL)
- fa->fsx_xflags |= FS_XFLAG_NOATIME;
- if (fa->flags & FS_DAX_FL)
- fa->fsx_xflags |= FS_XFLAG_DAX;
- if (fa->flags & FS_PROJINHERIT_FL)
- fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
-}
-EXPORT_SYMBOL(fileattr_fill_flags);
-
-/**
- * vfs_fileattr_get - retrieve miscellaneous file attributes
- * @dentry: the object to retrieve from
- * @fa: fileattr pointer
- *
- * Call i_op->fileattr_get() callback, if exists.
- *
- * Return: 0 on success, or a negative error on failure.
- */
-int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
-
- if (!inode->i_op->fileattr_get)
- return -ENOIOCTLCMD;
-
- return inode->i_op->fileattr_get(dentry, fa);
-}
-EXPORT_SYMBOL(vfs_fileattr_get);
-
-/**
- * copy_fsxattr_to_user - copy fsxattr to userspace.
- * @fa: fileattr pointer
- * @ufa: fsxattr user pointer
- *
- * Return: 0 on success, or -EFAULT on failure.
- */
-int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
-{
- struct fsxattr xfa;
-
- memset(&xfa, 0, sizeof(xfa));
- xfa.fsx_xflags = fa->fsx_xflags;
- xfa.fsx_extsize = fa->fsx_extsize;
- xfa.fsx_nextents = fa->fsx_nextents;
- xfa.fsx_projid = fa->fsx_projid;
- xfa.fsx_cowextsize = fa->fsx_cowextsize;
-
- if (copy_to_user(ufa, &xfa, sizeof(xfa)))
- return -EFAULT;
-
- return 0;
-}
-EXPORT_SYMBOL(copy_fsxattr_to_user);
-
-static int copy_fsxattr_from_user(struct fileattr *fa,
- struct fsxattr __user *ufa)
-{
- struct fsxattr xfa;
-
- if (copy_from_user(&xfa, ufa, sizeof(xfa)))
- return -EFAULT;
-
- fileattr_fill_xflags(fa, xfa.fsx_xflags);
- fa->fsx_extsize = xfa.fsx_extsize;
- fa->fsx_nextents = xfa.fsx_nextents;
- fa->fsx_projid = xfa.fsx_projid;
- fa->fsx_cowextsize = xfa.fsx_cowextsize;
-
- return 0;
-}
-
-/*
- * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
- * any invalid configurations.
- *
- * Note: must be called with inode lock held.
- */
-static int fileattr_set_prepare(struct inode *inode,
- const struct fileattr *old_ma,
- struct fileattr *fa)
-{
- int err;
-
- /*
- * The IMMUTABLE and APPEND_ONLY flags can only be changed by
- * the relevant capability.
- */
- if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
- !capable(CAP_LINUX_IMMUTABLE))
- return -EPERM;
-
- err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
- if (err)
- return err;
-
- /*
- * Project Quota ID state is only allowed to change from within the init
- * namespace. Enforce that restriction only if we are trying to change
- * the quota ID state. Everything else is allowed in user namespaces.
- */
- if (current_user_ns() != &init_user_ns) {
- if (old_ma->fsx_projid != fa->fsx_projid)
- return -EINVAL;
- if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
- FS_XFLAG_PROJINHERIT)
- return -EINVAL;
- } else {
- /*
- * Caller is allowed to change the project ID. If it is being
- * changed, make sure that the new value is valid.
- */
- if (old_ma->fsx_projid != fa->fsx_projid &&
- !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
- return -EINVAL;
- }
-
- /* Check extent size hints. */
- if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
- !S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
- !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
- return -EINVAL;
-
- /*
- * It is only valid to set the DAX flag on regular files and
- * directories on filesystems.
- */
- if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
- !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
- return -EINVAL;
-
- /* Extent size hints of zero turn off the flags. */
- if (fa->fsx_extsize == 0)
- fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
- if (fa->fsx_cowextsize == 0)
- fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
-
- return 0;
-}
-
-/**
- * vfs_fileattr_set - change miscellaneous file attributes
- * @idmap: idmap of the mount
- * @dentry: the object to change
- * @fa: fileattr pointer
- *
- * After verifying permissions, call i_op->fileattr_set() callback, if
- * exists.
- *
- * Verifying attributes involves retrieving current attributes with
- * i_op->fileattr_get(), this also allows initializing attributes that have
- * not been set by the caller to current values. Inode lock is held
- * thoughout to prevent racing with another instance.
- *
- * Return: 0 on success, or a negative error on failure.
- */
-int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
- struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- struct fileattr old_ma = {};
- int err;
-
- if (!inode->i_op->fileattr_set)
- return -ENOIOCTLCMD;
-
- if (!inode_owner_or_capable(idmap, inode))
- return -EPERM;
-
- inode_lock(inode);
- err = vfs_fileattr_get(dentry, &old_ma);
- if (!err) {
- /* initialize missing bits from old_ma */
- if (fa->flags_valid) {
- fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
- fa->fsx_extsize = old_ma.fsx_extsize;
- fa->fsx_nextents = old_ma.fsx_nextents;
- fa->fsx_projid = old_ma.fsx_projid;
- fa->fsx_cowextsize = old_ma.fsx_cowextsize;
- } else {
- fa->flags |= old_ma.flags & ~FS_COMMON_FL;
- }
- err = fileattr_set_prepare(inode, &old_ma, fa);
- if (!err)
- err = inode->i_op->fileattr_set(idmap, dentry, fa);
- }
- inode_unlock(inode);
-
- return err;
-}
-EXPORT_SYMBOL(vfs_fileattr_set);
-
-static int ioctl_getflags(struct file *file, unsigned int __user *argp)
-{
- struct fileattr fa = { .flags_valid = true }; /* hint only */
- int err;
-
- err = vfs_fileattr_get(file->f_path.dentry, &fa);
- if (!err)
- err = put_user(fa.flags, argp);
- return err;
-}
-
-static int ioctl_setflags(struct file *file, unsigned int __user *argp)
-{
- struct mnt_idmap *idmap = file_mnt_idmap(file);
- struct dentry *dentry = file->f_path.dentry;
- struct fileattr fa;
- unsigned int flags;
- int err;
-
- err = get_user(flags, argp);
- if (!err) {
- err = mnt_want_write_file(file);
- if (!err) {
- fileattr_fill_flags(&fa, flags);
- err = vfs_fileattr_set(idmap, dentry, &fa);
- mnt_drop_write_file(file);
- }
- }
- return err;
-}
-
-static int ioctl_fsgetxattr(struct file *file, void __user *argp)
-{
- struct fileattr fa = { .fsx_valid = true }; /* hint only */
- int err;
-
- err = vfs_fileattr_get(file->f_path.dentry, &fa);
- if (!err)
- err = copy_fsxattr_to_user(&fa, argp);
-
- return err;
-}
-
-static int ioctl_fssetxattr(struct file *file, void __user *argp)
-{
- struct mnt_idmap *idmap = file_mnt_idmap(file);
- struct dentry *dentry = file->f_path.dentry;
- struct fileattr fa;
- int err;
-
- err = copy_fsxattr_from_user(&fa, argp);
- if (!err) {
- err = mnt_want_write_file(file);
- if (!err) {
- err = vfs_fileattr_set(idmap, dentry, &fa);
- mnt_drop_write_file(file);
- }
- }
- return err;
-}
-
static int ioctl_getfsuuid(struct file *file, void __user *argp)
{
struct super_block *sb = file_inode(file)->i_sb;
@@ -821,7 +512,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_fioasync(fd, filp, argp);
case FIOQSIZE:
- if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
+ if (S_ISDIR(inode->i_mode) ||
+ (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) ||
S_ISLNK(inode->i_mode)) {
loff_t res = inode_get_bytes(inode);
return copy_to_user(argp, &res, sizeof(res)) ?
@@ -856,7 +548,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_file_dedupe_range(filp, argp);
case FIONREAD:
- if (!S_ISREG(inode->i_mode))
+ if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode))
return vfs_ioctl(filp, cmd, arg);
return put_user(i_size_read(inode) - filp->f_pos,
@@ -881,7 +573,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
return ioctl_get_fs_sysfs_path(filp, argp);
default:
- if (S_ISREG(inode->i_mode))
+ if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode))
return file_ioctl(filp, cmd, argp);
break;
}
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 381d76c5c232..f7e1c8534c46 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,9 +9,10 @@ ccflags-y += -I $(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
- iter.o
-iomap-$(CONFIG_BLOCK) += buffered-io.o \
- direct-io.o \
+ iter.o \
+ buffered-io.o
+iomap-$(CONFIG_BLOCK) += direct-io.o \
+ ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d303e6c8900c..fd827398afd2 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -3,26 +3,15 @@
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (C) 2016-2023 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
-#include <linux/pagemap.h>
-#include <linux/uio.h>
#include <linux/buffer_head.h>
-#include <linux/dax.h>
#include <linux/writeback.h>
-#include <linux/list_sort.h>
#include <linux/swap.h>
-#include <linux/bio.h>
-#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"
#include "../internal.h"
-#define IOEND_BATCH_SIZE 4096
-
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -40,8 +29,6 @@ struct iomap_folio_state {
unsigned long state[];
};
-static struct bio_set iomap_ioend_bioset;
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -75,6 +62,9 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
unsigned long flags;
bool uptodate = true;
+ if (folio_test_uptodate(folio))
+ return;
+
if (ifs) {
spin_lock_irqsave(&ifs->state_lock, flags);
uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
@@ -263,7 +253,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
}
/* truncate len if we find any trailing uptodate block(s) */
- for ( ; i <= last; i++) {
+ while (++i <= last) {
if (ifs_block_is_uptodate(ifs, i)) {
plen -= (last - i + 1) * block_size;
last = i - 1;
@@ -288,6 +278,46 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
+static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
+ loff_t pos)
+{
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+
+ return srcmap->type != IOMAP_MAPPED ||
+ (srcmap->flags & IOMAP_F_NEW) ||
+ pos >= i_size_read(iter->inode);
+}
+
+/**
+ * iomap_read_inline_data - copy inline data into the page cache
+ * @iter: iteration structure
+ * @folio: folio to copy to
+ *
+ * Copy the inline data in @iter into @folio and zero out the rest of the folio.
+ * Only a single IOMAP_INLINE extent is allowed at the end of each file.
+ * Returns zero for success to complete the read, or the usual negative errno.
+ */
+static int iomap_read_inline_data(const struct iomap_iter *iter,
+ struct folio *folio)
+{
+ const struct iomap *iomap = iomap_iter_srcmap(iter);
+ size_t size = i_size_read(iter->inode) - iomap->offset;
+ size_t offset = offset_in_folio(folio, iomap->offset);
+
+ if (folio_test_uptodate(folio))
+ return 0;
+
+ if (WARN_ON_ONCE(size > iomap->length))
+ return -EIO;
+ if (offset > 0)
+ ifs_alloc(iter->inode, folio, iter->flags);
+
+ folio_fill_tail(folio, offset, iomap->inline_data, size);
+ iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
+ return 0;
+}
+
+#ifdef CONFIG_BLOCK
static void iomap_finish_folio_read(struct folio *folio, size_t off,
size_t len, int error)
{
@@ -327,59 +357,24 @@ struct iomap_readpage_ctx {
struct readahead_control *rac;
};
-/**
- * iomap_read_inline_data - copy inline data into the page cache
- * @iter: iteration structure
- * @folio: folio to copy to
- *
- * Copy the inline data in @iter into @folio and zero out the rest of the folio.
- * Only a single IOMAP_INLINE extent is allowed at the end of each file.
- * Returns zero for success to complete the read, or the usual negative errno.
- */
-static int iomap_read_inline_data(const struct iomap_iter *iter,
- struct folio *folio)
-{
- const struct iomap *iomap = iomap_iter_srcmap(iter);
- size_t size = i_size_read(iter->inode) - iomap->offset;
- size_t offset = offset_in_folio(folio, iomap->offset);
-
- if (folio_test_uptodate(folio))
- return 0;
-
- if (WARN_ON_ONCE(size > iomap->length))
- return -EIO;
- if (offset > 0)
- ifs_alloc(iter->inode, folio, iter->flags);
-
- folio_fill_tail(folio, offset, iomap->inline_data, size);
- iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
- return 0;
-}
-
-static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
- loff_t pos)
-{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
-
- return srcmap->type != IOMAP_MAPPED ||
- (srcmap->flags & IOMAP_F_NEW) ||
- pos >= i_size_read(iter->inode);
-}
-
-static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
- struct iomap_readpage_ctx *ctx, loff_t offset)
+static int iomap_readpage_iter(struct iomap_iter *iter,
+ struct iomap_readpage_ctx *ctx)
{
const struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos + offset;
- loff_t length = iomap_length(iter) - offset;
+ loff_t pos = iter->pos;
+ loff_t length = iomap_length(iter);
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
- loff_t orig_pos = pos;
size_t poff, plen;
sector_t sector;
+ int ret;
- if (iomap->type == IOMAP_INLINE)
- return iomap_read_inline_data(iter, folio);
+ if (iomap->type == IOMAP_INLINE) {
+ ret = iomap_read_inline_data(iter, folio);
+ if (ret)
+ return ret;
+ return iomap_iter_advance(iter, &length);
+ }
/* zero post-eof blocks as the page may be mapped */
ifs = ifs_alloc(iter->inode, folio, iter->flags);
@@ -438,25 +433,22 @@ done:
* we can skip trailing ones as they will be handled in the next
* iteration.
*/
- return pos - orig_pos + plen;
+ length = pos - iter->pos + plen;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_read_folio_iter(const struct iomap_iter *iter,
+static int iomap_read_folio_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- struct folio *folio = ctx->cur_folio;
- size_t offset = offset_in_folio(folio, iter->pos);
- loff_t length = min_t(loff_t, folio_size(folio) - offset,
- iomap_length(iter));
- loff_t done, ret;
-
- for (done = 0; done < length; done += ret) {
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ int ret;
+
+ while (iomap_length(iter)) {
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
@@ -474,7 +466,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
trace_iomap_readpage(iter.inode, 1);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_read_folio_iter(&iter, &ctx);
+ iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
submit_bio(ctx.bio);
@@ -493,15 +485,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_read_folio);
-static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
+static int iomap_readahead_iter(struct iomap_iter *iter,
struct iomap_readpage_ctx *ctx)
{
- loff_t length = iomap_length(iter);
- loff_t done, ret;
+ int ret;
- for (done = 0; done < length; done += ret) {
+ while (iomap_length(iter)) {
if (ctx->cur_folio &&
- offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) {
+ offset_in_folio(ctx->cur_folio, iter->pos) == 0) {
if (!ctx->cur_folio_in_bio)
folio_unlock(ctx->cur_folio);
ctx->cur_folio = NULL;
@@ -510,12 +501,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter,
ctx->cur_folio = readahead_folio(ctx->rac);
ctx->cur_folio_in_bio = false;
}
- ret = iomap_readpage_iter(iter, ctx, done);
- if (ret <= 0)
+ ret = iomap_readpage_iter(iter, ctx);
+ if (ret)
return ret;
}
- return done;
+ return 0;
}
/**
@@ -547,7 +538,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
while (iomap_iter(&iter, ops) > 0)
- iter.processed = iomap_readahead_iter(&iter, &ctx);
+ iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
submit_bio(ctx.bio);
@@ -558,6 +549,27 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_readahead);
+static int iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ struct bio_vec bvec;
+ struct bio bio;
+
+ bio_init(&bio, srcmap->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = iomap_sector(srcmap, pos);
+ bio_add_folio_nofail(&bio, folio, len, offset_in_folio(folio, pos));
+ return submit_bio_wait(&bio);
+}
+#else
+static int iomap_read_folio_range(const struct iomap_iter *iter,
+ struct folio *folio, loff_t pos, size_t len)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+#endif /* CONFIG_BLOCK */
+
/*
* iomap_is_partially_uptodate checks whether blocks within a folio are
* uptodate or not.
@@ -603,6 +615,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ if (iter->flags & IOMAP_DONTCACHE)
+ fgp |= FGP_DONTCACHE;
fgp |= fgf_set_order(len);
return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
@@ -669,23 +683,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
pos + len - 1);
}
-static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
- size_t poff, size_t plen, const struct iomap *iomap)
-{
- struct bio_vec bvec;
- struct bio bio;
-
- bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_add_folio_nofail(&bio, folio, plen, poff);
- return submit_bio_wait(&bio);
-}
-
-static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
- size_t len, struct folio *folio)
+static int __iomap_write_begin(const struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, size_t len,
+ struct folio *folio)
{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_folio_state *ifs;
+ loff_t pos = iter->pos;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
@@ -732,8 +735,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (iter->flags & IOMAP_NOWAIT)
return -EAGAIN;
- status = iomap_read_folio_sync(block_start, folio,
- poff, plen, srcmap);
+ if (write_ops && write_ops->read_folio_range)
+ status = write_ops->read_folio_range(iter,
+ folio, block_start, plen);
+ else
+ status = iomap_read_folio_range(iter,
+ folio, block_start, plen);
if (status)
return status;
}
@@ -743,30 +750,49 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0;
}
-static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
- size_t len)
+static struct folio *__iomap_get_folio(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, size_t len)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+ loff_t pos = iter->pos;
- if (folio_ops && folio_ops->get_folio)
- return folio_ops->get_folio(iter, pos, len);
- else
- return iomap_get_folio(iter, pos, len);
+ if (!mapping_large_folio_support(iter->inode->i_mapping))
+ len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
+
+ if (write_ops && write_ops->get_folio)
+ return write_ops->get_folio(iter, pos, len);
+ return iomap_get_folio(iter, pos, len);
}
-static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
+static void __iomap_put_folio(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, size_t ret,
struct folio *folio)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+ loff_t pos = iter->pos;
- if (folio_ops && folio_ops->put_folio) {
- folio_ops->put_folio(iter->inode, pos, ret, folio);
+ if (write_ops && write_ops->put_folio) {
+ write_ops->put_folio(iter->inode, pos, ret, folio);
} else {
folio_unlock(folio);
folio_put(folio);
}
}
+/* trim pos and bytes to within a given folio */
+static loff_t iomap_trim_folio_range(struct iomap_iter *iter,
+ struct folio *folio, size_t *offset, u64 *bytes)
+{
+ loff_t pos = iter->pos;
+ size_t fsize = folio_size(folio);
+
+ WARN_ON_ONCE(pos < folio_pos(folio));
+ WARN_ON_ONCE(pos >= folio_pos(folio) + fsize);
+
+ *offset = offset_in_folio(folio, pos);
+ *bytes = min(*bytes, fsize - *offset);
+
+ return pos;
+}
+
static int iomap_write_begin_inline(const struct iomap_iter *iter,
struct folio *folio)
{
@@ -776,14 +802,22 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio);
}
-static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
- size_t len, struct folio **foliop)
+/*
+ * Grab and prepare a folio for write based on iter state. Returns the folio,
+ * offset, and length. Callers can optionally pass a max length *plen,
+ * otherwise init to zero.
+ */
+static int iomap_write_begin(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops, struct folio **foliop,
+ size_t *poffset, u64 *plen)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
+ u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
struct folio *folio;
int status = 0;
+ len = min_not_zero(len, *plen);
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
@@ -791,10 +825,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
if (fatal_signal_pending(current))
return -EINTR;
- if (!mapping_large_folio_support(iter->inode->i_mapping))
- len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos));
-
- folio = __iomap_get_folio(iter, pos, len);
+ folio = __iomap_get_folio(iter, write_ops, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -808,8 +839,8 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
* could do the wrong thing here (zero a page range incorrectly or fail
* to zero) and corrupt data.
*/
- if (folio_ops && folio_ops->iomap_valid) {
- bool iomap_valid = folio_ops->iomap_valid(iter->inode,
+ if (write_ops && write_ops->iomap_valid) {
+ bool iomap_valid = write_ops->iomap_valid(iter->inode,
&iter->iomap);
if (!iomap_valid) {
iter->iomap.flags |= IOMAP_F_STALE;
@@ -818,25 +849,24 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
}
}
- if (pos + len > folio_pos(folio) + folio_size(folio))
- len = folio_pos(folio) + folio_size(folio) - pos;
+ pos = iomap_trim_folio_range(iter, folio, poffset, &len);
if (srcmap->type == IOMAP_INLINE)
status = iomap_write_begin_inline(iter, folio);
else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(folio, pos, len, NULL, srcmap);
else
- status = __iomap_write_begin(iter, pos, len, folio);
+ status = __iomap_write_begin(iter, write_ops, len, folio);
if (unlikely(status))
goto out_unlock;
*foliop = folio;
+ *plen = len;
return 0;
out_unlock:
- __iomap_put_folio(iter, pos, 0, folio);
-
+ __iomap_put_folio(iter, write_ops, 0, folio);
return status;
}
@@ -885,10 +915,11 @@ static void iomap_write_end_inline(const struct iomap_iter *iter,
* Returns true if all copied bytes have been written to the pagecache,
* otherwise return false.
*/
-static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
- size_t copied, struct folio *folio)
+static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied,
+ struct folio *folio)
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ loff_t pos = iter->pos;
if (srcmap->type == IOMAP_INLINE) {
iomap_write_end_inline(iter, folio, pos, copied);
@@ -898,8 +929,7 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
size_t bh_written;
- bh_written = block_write_end(NULL, iter->inode->i_mapping, pos,
- len, copied, folio, NULL);
+ bh_written = block_write_end(pos, len, copied, folio);
WARN_ON_ONCE(bh_written != copied && bh_written != 0);
return bh_written == copied;
}
@@ -907,12 +937,11 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len,
return __iomap_write_end(iter->inode, pos, len, copied, folio);
}
-static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
+static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
+ const struct iomap_write_ops *write_ops)
{
- loff_t length = iomap_length(iter);
- loff_t pos = iter->pos;
ssize_t total_written = 0;
- long status = 0;
+ int status = 0;
struct address_space *mapping = iter->inode->i_mapping;
size_t chunk = mapping_max_folio_size(mapping);
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
@@ -921,21 +950,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
struct folio *folio;
loff_t old_size;
size_t offset; /* Offset into folio */
- size_t bytes; /* Bytes to write to folio */
+ u64 bytes; /* Bytes to write to folio */
size_t copied; /* Bytes copied from user */
- size_t written; /* Bytes have been written */
+ u64 written; /* Bytes have been written */
+ loff_t pos;
bytes = iov_iter_count(i);
retry:
- offset = pos & (chunk - 1);
+ offset = iter->pos & (chunk - 1);
bytes = min(chunk - offset, bytes);
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
if (unlikely(status))
break;
- if (bytes > length)
- bytes = length;
+ if (bytes > iomap_length(iter))
+ bytes = iomap_length(iter);
/*
* Bring in the user page that we'll copy from _first_.
@@ -952,23 +982,22 @@ retry:
break;
}
- status = iomap_write_begin(iter, pos, bytes, &folio);
+ status = iomap_write_begin(iter, write_ops, &folio, &offset,
+ &bytes);
if (unlikely(status)) {
- iomap_write_failed(iter->inode, pos, bytes);
+ iomap_write_failed(iter->inode, iter->pos, bytes);
break;
}
if (iter->iomap.flags & IOMAP_F_STALE)
break;
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
+ pos = iter->pos;
if (mapping_writably_mapped(mapping))
flush_dcache_folio(folio);
copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
- written = iomap_write_end(iter, pos, bytes, copied, folio) ?
+ written = iomap_write_end(iter, bytes, copied, folio) ?
copied : 0;
/*
@@ -983,7 +1012,7 @@ retry:
i_size_write(iter->inode, pos + written);
iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
}
- __iomap_put_folio(iter, pos, written, folio);
+ __iomap_put_folio(iter, write_ops, written, folio);
if (old_size < pos)
pagecache_isize_extended(iter->inode, old_size, pos);
@@ -1006,22 +1035,18 @@ retry:
goto retry;
}
} else {
- pos += written;
total_written += written;
- length -= written;
+ iomap_iter_advance(iter, &written);
}
- } while (iov_iter_count(i) && length);
+ } while (iov_iter_count(i) && iomap_length(iter));
- if (status == -EAGAIN) {
- iov_iter_revert(i, total_written);
- return -EAGAIN;
- }
- return total_written ? total_written : status;
+ return total_written ? 0 : status;
}
ssize_t
iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
- const struct iomap_ops *ops, void *private)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops, void *private)
{
struct iomap_iter iter = {
.inode = iocb->ki_filp->f_mapping->host,
@@ -1034,9 +1059,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_DONTCACHE)
+ iter.flags |= IOMAP_DONTCACHE;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_write_iter(&iter, i);
+ iter.status = iomap_write_iter(&iter, i, write_ops);
if (unlikely(iter.pos == iocb->ki_pos))
return ret;
@@ -1270,53 +1297,50 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
}
EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
-static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+static int iomap_unshare_iter(struct iomap_iter *iter,
+ const struct iomap_write_ops *write_ops)
{
struct iomap *iomap = &iter->iomap;
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
if (!iomap_want_unshare_iter(iter))
- return length;
+ return iomap_iter_advance(iter, &bytes);
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
bool ret;
- status = iomap_write_begin(iter, pos, bytes, &folio);
+ bytes = min_t(u64, SIZE_MAX, bytes);
+ status = iomap_write_begin(iter, write_ops, &folio, &offset,
+ &bytes);
if (unlikely(status))
return status;
if (iomap->flags & IOMAP_F_STALE)
break;
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
-
- ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
+ ret = iomap_write_end(iter, bytes, bytes, folio);
+ __iomap_put_folio(iter, write_ops, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
cond_resched();
- pos += bytes;
- written += bytes;
- length -= bytes;
-
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
- } while (length > 0);
- return written;
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
+
+ return status;
}
int
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops)
{
struct iomap_iter iter = {
.inode = inode,
@@ -1331,7 +1355,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
iter.len = min(len, size - pos);
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_unshare_iter(&iter);
+ iter.status = iomap_unshare_iter(&iter, write_ops);
return ret;
}
EXPORT_SYMBOL_GPL(iomap_file_unshare);
@@ -1350,20 +1374,20 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
return filemap_write_and_wait_range(mapping, i->pos, end);
}
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+ const struct iomap_write_ops *write_ops)
{
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
- loff_t written = 0;
+ u64 bytes = iomap_length(iter);
+ int status;
do {
struct folio *folio;
- int status;
size_t offset;
- size_t bytes = min_t(u64, SIZE_MAX, length);
bool ret;
- status = iomap_write_begin(iter, pos, bytes, &folio);
+ bytes = min_t(u64, SIZE_MAX, bytes);
+ status = iomap_write_begin(iter, write_ops, &folio, &offset,
+ &bytes);
if (status)
return status;
if (iter->iomap.flags & IOMAP_F_STALE)
@@ -1371,37 +1395,36 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
/* warn about zeroing folios beyond eof that won't write back */
WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size);
- offset = offset_in_folio(folio, pos);
- if (bytes > folio_size(folio) - offset)
- bytes = folio_size(folio) - offset;
folio_zero_range(folio, offset, bytes);
folio_mark_accessed(folio);
- ret = iomap_write_end(iter, pos, bytes, bytes, folio);
- __iomap_put_folio(iter, pos, bytes, folio);
+ ret = iomap_write_end(iter, bytes, bytes, folio);
+ __iomap_put_folio(iter, write_ops, bytes, folio);
if (WARN_ON_ONCE(!ret))
return -EIO;
- pos += bytes;
- length -= bytes;
- written += bytes;
- } while (length > 0);
+ status = iomap_iter_advance(iter, &bytes);
+ if (status)
+ break;
+ } while (bytes > 0);
if (did_zero)
*did_zero = true;
- return written;
+ return status;
}
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
+ .private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@@ -1424,7 +1447,8 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) {
iter.len = plen;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero,
+ write_ops);
iter.len = len - (iter.pos - pos);
if (ret || !iter.len)
@@ -1443,17 +1467,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
if (srcmap->type == IOMAP_HOLE ||
srcmap->type == IOMAP_UNWRITTEN) {
- loff_t proc = iomap_length(&iter);
+ s64 status;
if (range_dirty) {
range_dirty = false;
- proc = iomap_zero_iter_flush_and_stale(&iter);
+ status = iomap_zero_iter_flush_and_stale(&iter);
+ } else {
+ status = iomap_iter_advance_full(&iter);
}
- iter.processed = proc;
+ iter.status = status;
continue;
}
- iter.processed = iomap_zero_iter(&iter, did_zero);
+ iter.status = iomap_zero_iter(&iter, did_zero, write_ops);
}
return ret;
}
@@ -1461,7 +1487,8 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops,
+ const struct iomap_write_ops *write_ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1469,11 +1496,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
- return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
+ write_ops, private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
-static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
+static int iomap_folio_mkwrite_iter(struct iomap_iter *iter,
struct folio *folio)
{
loff_t length = iomap_length(iter);
@@ -1484,20 +1512,22 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
&iter->iomap);
if (ret)
return ret;
- block_commit_write(&folio->page, 0, length);
+ block_commit_write(folio, 0, length);
} else {
WARN_ON_ONCE(!folio_test_uptodate(folio));
folio_mark_dirty(folio);
}
- return length;
+ return iomap_iter_advance(iter, &length);
}
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
+ .private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
@@ -1509,7 +1539,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
iter.pos = folio_pos(folio);
iter.len = ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_folio_mkwrite_iter(&iter, folio);
+ iter.status = iomap_folio_mkwrite_iter(&iter, folio);
if (ret < 0)
goto out_unlock;
@@ -1521,378 +1551,54 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+void iomap_start_folio_write(struct inode *inode, struct folio *folio,
size_t len)
{
struct iomap_folio_state *ifs = folio->private;
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
-
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
- folio_end_writeback(folio);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_bio;
- struct folio_iter fi;
- u32 folio_count = 0;
-
- if (error) {
- mapping_set_error(inode->i_mapping, error);
- if (!bio_flagged(bio, BIO_QUIET)) {
- pr_err_ratelimited(
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino,
- ioend->io_offset, ioend->io_sector);
- }
- }
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length);
- folio_count++;
- }
-
- bio_put(bio); /* frees the ioend */
- return folio_count;
-}
-
-/*
- * Ioend completion routine for merged bios. This can only be called from task
- * contexts as merged ioends can be of unbound length. Hence we have to break up
- * the writeback completions into manageable chunks to avoid long scheduler
- * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
- * good batch processing throughput without creating adverse scheduler latency
- * conditions.
- */
-void
-iomap_finish_ioends(struct iomap_ioend *ioend, int error)
-{
- struct list_head tmp;
- u32 completions;
-
- might_sleep();
-
- list_replace_init(&ioend->io_list, &tmp);
- completions = iomap_finish_ioend(ioend, error);
-
- while (!list_empty(&tmp)) {
- if (completions > IOEND_BATCH_SIZE * 8) {
- cond_resched();
- completions = 0;
- }
- ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
- list_del_init(&ioend->io_list);
- completions += iomap_finish_ioend(ioend, error);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_finish_ioends);
-
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
-{
- if (ioend->io_bio.bi_status != next->io_bio.bi_status)
- return false;
- if (next->io_flags & IOMAP_F_BOUNDARY)
- return false;
- if ((ioend->io_flags & IOMAP_F_SHARED) ^
- (next->io_flags & IOMAP_F_SHARED))
- return false;
- if ((ioend->io_type == IOMAP_UNWRITTEN) ^
- (next->io_type == IOMAP_UNWRITTEN))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- /*
- * Do not merge physically discontiguous ioends. The filesystem
- * completion functions will have to iterate the physical
- * discontiguities even if we merge the ioends at a logical level, so
- * we don't gain anything by merging physical discontiguities here.
- *
- * We cannot use bio->bi_iter.bi_sector here as it is modified during
- * submission so does not point to the start sector of the bio at
- * completion.
- */
- if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
- return false;
- return true;
-}
-
-void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
-{
- struct iomap_ioend *next;
-
- INIT_LIST_HEAD(&ioend->io_list);
-
- while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
- io_list))) {
- if (!iomap_ioend_can_merge(ioend, next))
- break;
- list_move_tail(&next->io_list, &ioend->io_list);
- ioend->io_size += next->io_size;
- }
-}
-EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
-
-static int
-iomap_ioend_compare(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
- struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
-
- if (ia->io_offset < ib->io_offset)
- return -1;
- if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
-void
-iomap_sort_ioends(struct list_head *ioend_list)
-{
- list_sort(NULL, ioend_list, iomap_ioend_compare);
-}
-EXPORT_SYMBOL_GPL(iomap_sort_ioends);
-
-static void iomap_writepage_end_bio(struct bio *bio)
-{
- iomap_finish_ioend(iomap_ioend_from_bio(bio),
- blk_status_to_errno(bio->bi_status));
-}
-
-/*
- * Submit the final bio for an ioend.
- *
- * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback.
- * We cannot cancel ioend directly in that case, so call the bio end I/O handler
- * with the error status here to run the normal I/O completion handler to clear
- * the writeback bit and let the file system proess the errors.
- */
-static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
-{
- if (!wpc->ioend)
- return error;
-
- /*
- * Let the file systems prepare the I/O submission and hook in an I/O
- * comletion handler. This also needs to happen in case after a
- * failure happened so that the file system end I/O handler gets called
- * to clean up.
- */
- if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(wpc->ioend, error);
-
- if (error) {
- wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
- bio_endio(&wpc->ioend->io_bio);
- } else {
- submit_bio(&wpc->ioend->io_bio);
- }
-
- wpc->ioend = NULL;
- return error;
-}
-
-static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos)
-{
- struct iomap_ioend *ioend;
- struct bio *bio;
-
- bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wbc),
- GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
- bio->bi_end_io = iomap_writepage_end_bio;
- wbc_init_bio(wbc, bio);
- bio->bi_write_hint = inode->i_write_hint;
-
- ioend = iomap_ioend_from_bio(bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = wpc->iomap.type;
- ioend->io_flags = wpc->iomap.flags;
- if (pos > wpc->iomap.offset)
- wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = pos;
- ioend->io_sector = bio->bi_iter.bi_sector;
-
- wpc->nr_folios = 0;
- return ioend;
-}
-
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
-{
- if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- return false;
- if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
- (wpc->ioend->io_flags & IOMAP_F_SHARED))
- return false;
- if (wpc->iomap.type != wpc->ioend->io_type)
- return false;
- if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
- return false;
- if (iomap_sector(&wpc->iomap, pos) !=
- bio_end_sector(&wpc->ioend->io_bio))
- return false;
- /*
- * Limit ioend bio chain lengths to minimise IO completion latency. This
- * also prevents long tight loops ending page writeback on all the
- * folios in the ioend.
- */
- if (wpc->nr_folios >= IOEND_BATCH_SIZE)
- return false;
- return true;
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
}
+EXPORT_SYMBOL_GPL(iomap_start_folio_write);
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first; otherwise finish off the current ioend and start another.
- *
- * If a new ioend is created and cached, the old ioend is submitted to the block
- * layer instantly. Batching optimisations are provided by higher level block
- * plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, loff_t end_pos,
- unsigned len)
+void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+ size_t len)
{
struct iomap_folio_state *ifs = folio->private;
- size_t poff = offset_in_folio(folio, pos);
- int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
-new_ioend:
- error = iomap_submit_ioend(wpc, 0);
- if (error)
- return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
- }
-
- if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
- goto new_ioend;
-
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
-
- /*
- * Clamp io_offset and io_size to the incore EOF so that ondisk
- * file size updates in the ioend completion are byte-accurate.
- * This avoids recovering files with zeroed tail regions when
- * writeback races with appending writes:
- *
- * Thread 1: Thread 2:
- * ------------ -----------
- * write [A, A+B]
- * update inode size to A+B
- * submit I/O [A, A+BS]
- * write [A+B, A+B+C]
- * update inode size to A+B+C
- * <I/O completes, updates disk size to min(A+B+C, A+BS)>
- * <power failure>
- *
- * After reboot:
- * 1) with A+B+C < A+BS, the file has zero padding in range
- * [A+B, A+B+C]
- *
- * |< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|
- * ^ ^ ^
- * A A+B A+B+C
- * (EOF)
- *
- * 2) with A+B+C > A+BS, the file has zero padding in range
- * [A+B, A+BS]
- *
- * |< Block Size (BS) >|< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
- * ^ ^ ^ ^
- * A A+B A+BS A+B+C
- * (EOF)
- *
- * D = Valid Data
- * 0 = Zero Padding
- *
- * Note that this defeats the ability to chain the ioends of
- * appending writes.
- */
- wpc->ioend->io_size += len;
- if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
- wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
- wbc_account_cgroup_owner(wbc, folio, len);
- return 0;
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ folio_end_writeback(folio);
}
+EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
-static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, u64 pos, u64 end_pos,
- unsigned dirty_len, unsigned *count)
+static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
+ bool *wb_pending)
{
- int error;
-
do {
- unsigned map_len;
-
- error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
- if (error)
- break;
- trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
+ ssize_t ret;
- map_len = min_t(u64, dirty_len,
- wpc->iomap.offset + wpc->iomap.length - pos);
- WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+ ret = wpc->ops->writeback_range(wpc, folio, pos, rlen, end_pos);
+ if (WARN_ON_ONCE(ret == 0 || ret > rlen))
+ return -EIO;
+ if (ret < 0)
+ return ret;
+ rlen -= ret;
+ pos += ret;
- switch (wpc->iomap.type) {
- case IOMAP_INLINE:
- WARN_ON_ONCE(1);
- error = -EIO;
- break;
- case IOMAP_HOLE:
- break;
- default:
- error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
- end_pos, map_len);
- if (!error)
- (*count)++;
- break;
- }
- dirty_len -= map_len;
- pos += map_len;
- } while (dirty_len && !error);
+ /*
+ * Holes are not be written back by ->writeback_range, so track
+ * if we did handle anything that is not a hole here.
+ */
+ if (wpc->iomap.type != IOMAP_HOLE)
+ *wb_pending = true;
+ } while (rlen);
- /*
- * We cannot cancel the ioend directly here on error. We may have
- * already set other pages under writeback and hence we have to run I/O
- * completion to mark the error state of the pages under writeback
- * appropriately.
- *
- * Just let the file system know what portion of the folio failed to
- * map.
- */
- if (error && wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- return error;
+ return 0;
}
/*
@@ -1901,7 +1607,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
* If the folio is entirely beyond i_size, return false. If it straddles
* i_size, adjust end_pos and zero all data beyond i_size.
*/
-static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode,
u64 *end_pos)
{
u64 isize = i_size_read(inode);
@@ -1953,15 +1659,14 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
return true;
}
-static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio)
+int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
- struct inode *inode = folio->mapping->host;
+ struct inode *inode = wpc->inode;
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
u64 end_aligned = 0;
- unsigned count = 0;
+ bool wb_pending = false;
int error = 0;
u32 rlen;
@@ -1969,12 +1674,10 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
WARN_ON_ONCE(folio_test_dirty(folio));
WARN_ON_ONCE(folio_test_writeback(folio));
- trace_iomap_writepage(inode, pos, folio_size(folio));
+ trace_iomap_writeback_folio(inode, pos, folio_size(folio));
- if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
- folio_unlock(folio);
+ if (!iomap_writeback_handle_eof(folio, inode, &end_pos))
return 0;
- }
WARN_ON_ONCE(end_pos <= pos);
if (i_blocks_per_folio(inode, folio) > 1) {
@@ -1990,7 +1693,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* all blocks.
*/
WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
- atomic_inc(&ifs->write_bytes_pending);
+ iomap_start_folio_write(inode, folio, 1);
}
/*
@@ -2004,14 +1707,14 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
*/
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
- error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
- pos, end_pos, rlen, &count);
+ error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
+ &wb_pending);
if (error)
break;
pos += rlen;
}
- if (count)
+ if (wb_pending)
wpc->nr_folios++;
/*
@@ -2028,23 +1731,22 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* already at this point. In that case we need to clear the writeback
* bit ourselves right after unlocking the page.
*/
- folio_unlock(folio);
if (ifs) {
if (atomic_dec_and_test(&ifs->write_bytes_pending))
folio_end_writeback(folio);
} else {
- if (!count)
+ if (!wb_pending)
folio_end_writeback(folio);
}
mapping_set_error(inode->i_mapping, error);
return error;
}
+EXPORT_SYMBOL_GPL(iomap_writeback_folio);
int
-iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
- struct iomap_writepage_ctx *wpc,
- const struct iomap_writeback_ops *ops)
+iomap_writepages(struct iomap_writepage_ctx *wpc)
{
+ struct address_space *mapping = wpc->inode->i_mapping;
struct folio *folio = NULL;
int error;
@@ -2056,17 +1758,22 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
PF_MEMALLOC))
return -EIO;
- wpc->ops = ops;
- while ((folio = writeback_iter(mapping, wbc, folio, &error)))
- error = iomap_writepage_map(wpc, wbc, folio);
- return iomap_submit_ioend(wpc, error);
-}
-EXPORT_SYMBOL_GPL(iomap_writepages);
+ while ((folio = writeback_iter(mapping, wpc->wbc, folio, &error))) {
+ error = iomap_writeback_folio(wpc, folio);
+ folio_unlock(folio);
+ }
-static int __init iomap_buffered_init(void)
-{
- return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_bio),
- BIOSET_NEED_BVECS);
+ /*
+ * If @error is non-zero, it means that we have a situation where some
+ * part of the submission process has failed after we've marked pages
+ * for writeback.
+ *
+ * We cannot cancel the writeback directly in that case, so always call
+ * ->writeback_submit to run the I/O completion handler to clear the
+ * writeback bit and let the file system proess the errors.
+ */
+ if (wpc->wb_ctx)
+ return wpc->ops->writeback_submit(wpc, error);
+ return error;
}
-fs_initcall(iomap_buffered_init);
+EXPORT_SYMBOL_GPL(iomap_writepages);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b521eb15759e..b84f6af2eb4c 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -1,17 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2021 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
-#include <linux/backing-dev.h>
-#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -20,6 +16,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@@ -81,10 +78,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
- if (dio->dops && dio->dops->submit_io)
+ if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
- else
+ } else {
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
+ }
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
@@ -117,7 +116,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+ if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@@ -163,43 +163,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-void iomap_dio_bio_end_io(struct bio *bio)
+/*
+ * Called when dio->ref reaches zero from an I/O completion.
+ */
+static void iomap_dio_done(struct iomap_dio *dio)
{
- struct iomap_dio *dio = bio->bi_private;
- bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
- if (bio->bi_status)
- iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
- if (!atomic_dec_and_test(&dio->ref))
- goto release_bio;
-
- /*
- * Synchronous dio, task itself will handle any completion work
- * that needs after IO. All we need to do is wake the task.
- */
if (dio->wait_for_completion) {
+ /*
+ * Synchronous I/O, task itself will handle any completion work
+ * that needs after IO. All we need to do is wake the task.
+ */
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- goto release_bio;
- }
-
- /*
- * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
- */
- if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
- goto release_bio;
- }
-
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
- * our completion that way to avoid an async punt to a workqueue.
- */
- if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
+ * schedule our completion that way to avoid an async punt to a
+ * workqueue.
+ */
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@@ -217,19 +205,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
- goto release_bio;
+ } else {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /*
+ * Async DIO completion that requires filesystem level
+ * completion work gets punted to a work queue to complete as
+ * the operation may require more IO to be issued to finalise
+ * filesystem metadata changes or guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_status)
+ iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+ if (atomic_dec_and_test(&dio->ref))
+ iomap_dio_done(dio);
- /*
- * Async DIO completion that requires filesystem level completion work
- * gets punted to a work queue to complete as the operation may require
- * more IO to be issued to finalise filesystem metadata changes or
- * guarantee data integrity.
- */
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
- &dio->aio.work);
-release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -239,6 +239,47 @@ release_bio:
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+ struct iomap_dio *dio = ioend->io_bio.bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ u32 vec_count = ioend->io_bio.bi_vcnt;
+
+ if (ioend->io_error)
+ iomap_dio_set_error(dio, ioend->io_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ /*
+ * Try to avoid another context switch for the completion given
+ * that we are already called from the ioend completion
+ * workqueue, but never invalidate pages from this thread to
+ * avoid deadlocks with buffered I/O completions. Tough luck if
+ * you hit the tiny race with someone dirtying the range now
+ * between this check and the actual completion.
+ */
+ if (!dio->iocb->ki_filp->f_mapping->nrpages) {
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ iomap_dio_done(dio);
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(&ioend->io_bio);
+ } else {
+ bio_release_pages(&ioend->io_bio, false);
+ bio_put(&ioend->io_bio);
+ }
+
+ /*
+ * Return the number of bvecs completed as even direct I/O completions
+ * do significant per-folio work and we'll still want to give up the
+ * CPU after a lot of completions.
+ */
+ return vec_count;
+}
+
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
@@ -266,81 +307,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
}
/*
- * Figure out the bio's operation flags from the dio request, the
- * mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_THROUGH flag in the dio request.
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O
+ * that doesn't require any metadata updates (including after I/O completion
+ * such as unwritten extent conversion) and the underlying device either
+ * doesn't have a volatile write cache or supports FUA.
+ * This allows us to avoid cache flushes on I/O completion.
*/
-static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua, bool atomic)
+static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
+ struct iomap_dio *dio)
{
- blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
-
- if (!(dio->flags & IOMAP_DIO_WRITE))
- return REQ_OP_READ;
-
- opflags |= REQ_OP_WRITE;
- if (use_fua)
- opflags |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- if (atomic)
- opflags |= REQ_ATOMIC;
-
- return opflags;
+ if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
+ return false;
+ if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
+ return false;
+ return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
}
-static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
const loff_t length = iomap_length(iter);
- bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
- blk_opf_t bio_opf;
+ blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bool need_zeroout = false;
- bool use_fua = false;
int nr_pages, ret = 0;
- size_t copied = 0;
+ u64 copied = 0;
size_t orig_count;
- if (atomic && length != fs_block_size)
- return -EINVAL;
-
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
- if (iomap->type == IOMAP_UNWRITTEN) {
- dio->flags |= IOMAP_DIO_UNWRITTEN;
- need_zeroout = true;
- }
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ bio_opf |= REQ_OP_WRITE;
- if (iomap->flags & IOMAP_F_SHARED)
- dio->flags |= IOMAP_DIO_COW;
+ if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
+ /*
+ * Ensure that the mapping covers the full write
+ * length, otherwise it won't be submitted as a single
+ * bio, which is required to use hardware atomics.
+ */
+ if (length != iter->len)
+ return -EINVAL;
+ bio_opf |= REQ_ATOMIC;
+ }
+
+ if (iomap->type == IOMAP_UNWRITTEN) {
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ }
+
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+
+ if (iomap->flags & IOMAP_F_NEW)
+ need_zeroout = true;
+ else if (iomap->type == IOMAP_MAPPED &&
+ iomap_dio_can_use_fua(iomap, dio))
+ bio_opf |= REQ_FUA;
+
+ if (!(bio_opf & REQ_FUA))
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- if (iomap->flags & IOMAP_F_NEW) {
- need_zeroout = true;
- } else if (iomap->type == IOMAP_MAPPED) {
/*
- * Use a FUA write if we need datasync semantics, this is a pure
- * data IO that doesn't require any metadata updates (including
- * after IO completion such as unwritten extent conversion) and
- * the underlying device either supports FUA or doesn't have
- * a volatile write cache. This allows us to avoid cache flushes
- * on IO completion. If we can't use writethrough and need to
- * sync, disable in-task completions as dio completion will
- * need to call generic_write_sync() which will do a blocking
- * fsync / cache flush call.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional I/O at completion time.
+ *
+ * This rules out writes that need zeroing or extent conversion,
+ * extend the file size, or issue metadata I/O or cache flushes
+ * during completion processing.
*/
- if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
- (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
- use_fua = true;
- else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ if (need_zeroout || (pos >= i_size_read(inode)) ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
+ !(bio_opf & REQ_FUA)))
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ } else {
+ bio_opf |= REQ_OP_READ;
}
/*
@@ -355,18 +400,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
- * We can only do deferred completion for pure overwrites that
- * don't require additional IO at completion. This rules out
- * writes that need zeroing or extent conversion, extend
- * the file size, or issue journal IO or cache flushes
- * during completion processing.
- */
- if (need_zeroout ||
- ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
- ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
-
- /*
* The rules for polled IO completions follow the guidelines as the
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
@@ -383,8 +416,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
-
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
size_t n;
@@ -416,9 +447,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
- if (WARN_ON_ONCE(atomic && n != length)) {
+ if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
/*
- * This bio should have covered the complete length,
+ * An atomic write bio must cover the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
@@ -427,12 +458,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
bio_put(bio);
goto zero_tail;
}
- if (dio->flags & IOMAP_DIO_WRITE) {
+ if (dio->flags & IOMAP_DIO_WRITE)
task_io_account_write(n);
- } else {
- if (dio->flags & IOMAP_DIO_DIRTY)
- bio_set_pages_dirty(bio);
- }
+ else if (dio->flags & IOMAP_DIO_DIRTY)
+ bio_set_pages_dirty(bio);
dio->size += n;
copied += n;
@@ -467,30 +496,28 @@ out:
/* Undo iter limitation to current extent */
iov_iter_reexpand(dio->submit.iter, orig_count - copied);
if (copied)
- return copied;
+ return iomap_iter_advance(iter, &copied);
return ret;
}
-static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
dio->size += length;
if (!length)
return -EFAULT;
- return length;
+ return iomap_iter_advance(iter, &length);
}
-static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
- struct iomap_dio *dio)
+static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio)
{
const struct iomap *iomap = &iomi->iomap;
struct iov_iter *iter = dio->submit.iter;
void *inline_data = iomap_inline_data(iomap, iomi->pos);
loff_t length = iomap_length(iomi);
loff_t pos = iomi->pos;
- size_t copied;
+ u64 copied;
if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
return -EIO;
@@ -512,11 +539,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
dio->size += copied;
if (!copied)
return -EFAULT;
- return copied;
+ return iomap_iter_advance(iomi, &copied);
}
-static loff_t iomap_dio_iter(const struct iomap_iter *iter,
- struct iomap_dio *dio)
+static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
{
switch (iter->iomap.type) {
case IOMAP_HOLE:
@@ -610,9 +636,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
- if (iocb->ki_flags & IOCB_ATOMIC)
- iomi.flags |= IOMAP_ATOMIC;
-
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -647,6 +670,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
+
/* for data sync or sync, we need sync completion processing */
if (iocb_is_dsync(iocb)) {
dio->flags |= IOMAP_DIO_NEED_SYNC;
@@ -700,7 +726,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
blk_start_plug(&plug);
while ((ret = iomap_iter(&iomi, ops)) > 0) {
- iomi.processed = iomap_dio_iter(&iomi, dio);
+ iomi.status = iomap_dio_iter(&iomi, dio);
/*
* We can only poll for single bio I/Os.
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c
index 610ca6f1ec9b..d11dadff8286 100644
--- a/fs/iomap/fiemap.c
+++ b/fs/iomap/fiemap.c
@@ -2,9 +2,6 @@
/*
* Copyright (c) 2016-2021 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/fiemap.h>
#include <linux/pagemap.h>
@@ -39,24 +36,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
iomap->length, flags);
}
-static loff_t iomap_fiemap_iter(const struct iomap_iter *iter,
+static int iomap_fiemap_iter(struct iomap_iter *iter,
struct fiemap_extent_info *fi, struct iomap *prev)
{
int ret;
if (iter->iomap.type == IOMAP_HOLE)
- return iomap_length(iter);
+ goto advance;
ret = iomap_to_fiemap(fi, prev, 0);
*prev = iter->iomap;
- switch (ret) {
- case 0: /* success */
- return iomap_length(iter);
- case 1: /* extent array full */
- return 0;
- default: /* error */
+ if (ret < 0)
return ret;
- }
+ if (ret == 1) /* extent array full */
+ return 0;
+
+advance:
+ return iomap_iter_advance_full(iter);
}
int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
@@ -78,7 +74,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_fiemap_iter(&iter, fi, &prev);
+ iter.status = iomap_fiemap_iter(&iter, fi, &prev);
if (prev.type != IOMAP_HOLE) {
ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST);
@@ -114,7 +110,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno,
while ((ret = iomap_iter(&iter, ops)) > 0) {
if (iter.iomap.type == IOMAP_MAPPED)
bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift;
- /* leave iter.processed unset to abort loop */
+ /* leave iter.status unset to abort loop */
}
if (ret)
return 0;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..d05cb3aed96e
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+#define IOEND_BATCH_SIZE 4096
+
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
new file mode 100644
index 000000000000..b49fa75eab26
--- /dev/null
+++ b/fs/iomap/ioend.c
@@ -0,0 +1,434 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2016-2025 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+#include <linux/list_sort.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include "internal.h"
+#include "trace.h"
+
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+ struct bio *bio, loff_t file_offset, u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_error = 0;
+ ioend->io_parent = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_flags = ioend_flags;
+ ioend->io_inode = inode;
+ ioend->io_offset = file_offset;
+ ioend->io_size = bio->bi_iter.bi_size;
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ ioend->io_private = NULL;
+ return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+/*
+ * We're now finished for good with this ioend structure. Update the folio
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
+ */
+static u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
+{
+ struct inode *inode = ioend->io_inode;
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
+ u32 folio_count = 0;
+
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
+ }
+ }
+
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
+ }
+
+ bio_put(bio); /* frees the ioend */
+ return folio_count;
+}
+
+static void ioend_writeback_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * We cannot cancel the ioend directly in case of an error, so call the bio end
+ * I/O handler with the error status here to run the normal I/O completion
+ * handler.
+ */
+int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ if (!ioend->io_bio.bi_end_io)
+ ioend->io_bio.bi_end_io = ioend_writeback_end_bio;
+
+ if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+ error = -EIO;
+
+ if (error) {
+ ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&ioend->io_bio);
+ return error;
+ }
+
+ submit_bio(&ioend->io_bio);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_writeback_submit);
+
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ loff_t pos, u16 ioend_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wpc->wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_write_hint = wpc->inode->i_write_hint;
+ wbc_init_bio(wpc->wbc, bio);
+ wpc->nr_folios = 0;
+ return iomap_init_ioend(wpc->inode, bio, pos, ioend_flags);
+}
+
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (pos != ioend->io_offset + ioend->io_size)
+ return false;
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) != bio_end_sector(&ioend->io_bio))
+ return false;
+ /*
+ * Limit ioend bio chain lengths to minimise IO completion latency. This
+ * also prevents long tight loops ending page writeback on all the
+ * folios in the ioend.
+ */
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
+ return false;
+ return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
+ loff_t pos, loff_t end_pos, unsigned int dirty_len)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+ size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
+ unsigned int map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
+ int error;
+
+ trace_iomap_add_to_ioend(wpc->inode, pos, dirty_len, &wpc->iomap);
+
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ return -EIO;
+ case IOMAP_HOLE:
+ return map_len;
+ default:
+ break;
+ }
+
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (folio_test_dropbehind(folio))
+ ioend_flags |= IOMAP_IOEND_DONTCACHE;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+new_ioend:
+ if (ioend) {
+ error = wpc->ops->writeback_submit(wpc, 0);
+ if (error)
+ return error;
+ }
+ wpc->wb_ctx = ioend = iomap_alloc_ioend(wpc, pos, ioend_flags);
+ }
+
+ if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
+ goto new_ioend;
+
+ iomap_start_folio_write(wpc->inode, folio, map_len);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
+ ioend->io_size += map_len;
+ if (ioend->io_offset + ioend->io_size > end_pos)
+ ioend->io_size = end_pos - ioend->io_offset;
+
+ wbc_account_cgroup_owner(wpc->wbc, folio, map_len);
+ return map_len;
+}
+EXPORT_SYMBOL_GPL(iomap_add_to_ioend);
+
+static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ if (ioend->io_parent) {
+ struct bio *bio = &ioend->io_bio;
+
+ ioend = ioend->io_parent;
+ bio_put(bio);
+ }
+
+ if (error)
+ cmpxchg(&ioend->io_error, 0, error);
+
+ if (!atomic_dec_and_test(&ioend->io_remaining))
+ return 0;
+ if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+ return iomap_finish_ioend_direct(ioend);
+ return iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbound length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
+
+ list_replace_init(&ioend->io_list, &tmp);
+ completions = iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ completions += iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
+{
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
+ return false;
+ if (next->io_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
+ next->io_sector)
+ return false;
+ return true;
+}
+
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int iomap_ioend_compare(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
+
+/*
+ * Split up to the first @max_len bytes from @ioend if the ioend covers more
+ * than @max_len bytes.
+ *
+ * If @is_append is set, the split will be based on the hardware limits for
+ * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
+ * limits don't allow the entire @max_len length.
+ *
+ * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
+ * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to
+ * switch the operation after this call, but before submitting the bio.
+ */
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append)
+{
+ struct bio *bio = &ioend->io_bio;
+ struct iomap_ioend *split_ioend;
+ unsigned int nr_segs;
+ int sector_offset;
+ struct bio *split;
+
+ if (is_append) {
+ struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+ max_len = min(max_len,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+
+ sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
+ if (unlikely(sector_offset < 0))
+ return ERR_PTR(sector_offset);
+ if (!sector_offset)
+ return NULL;
+ } else {
+ if (bio->bi_iter.bi_size <= max_len)
+ return NULL;
+ sector_offset = max_len >> SECTOR_SHIFT;
+ }
+
+ /* ensure the split ioend is still block size aligned */
+ sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
+ i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
+
+ split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
+ if (IS_ERR(split))
+ return ERR_CAST(split);
+ split->bi_private = bio->bi_private;
+ split->bi_end_io = bio->bi_end_io;
+
+ split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
+ ioend->io_flags);
+ split_ioend->io_parent = ioend;
+
+ atomic_inc(&ioend->io_remaining);
+ ioend->io_offset += split_ioend->io_size;
+ ioend->io_size -= split_ioend->io_size;
+
+ split_ioend->io_sector = ioend->io_sector;
+ if (!is_append)
+ ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+ return split_ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
+
+static int __init iomap_ioend_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_ioend_init);
diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
index 3790918646af..cef77ca0c20b 100644
--- a/fs/iomap/iter.c
+++ b/fs/iomap/iter.c
@@ -3,44 +3,28 @@
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2021 Christoph Hellwig.
*/
-#include <linux/fs.h>
#include <linux/iomap.h>
#include "trace.h"
-/*
- * Advance to the next range we need to map.
- *
- * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
- * processed - it was aborted because the extent the iomap spanned may have been
- * changed during the operation. In this case, the iteration behaviour is to
- * remap the unprocessed range of the iter, and that means we may need to remap
- * even when we've made no progress (i.e. iter->processed = 0). Hence the
- * "finished iterating" case needs to distinguish between
- * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
- * need to remap the entire remaining range.
- */
-static inline int iomap_iter_advance(struct iomap_iter *iter)
+static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
{
- bool stale = iter->iomap.flags & IOMAP_F_STALE;
- int ret = 1;
-
- /* handle the previous iteration (if any) */
- if (iter->iomap.length) {
- if (iter->processed < 0)
- return iter->processed;
- if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
- return -EIO;
- iter->pos += iter->processed;
- iter->len -= iter->processed;
- if (!iter->len || (!iter->processed && !stale))
- ret = 0;
- }
-
- /* clear the per iteration state */
- iter->processed = 0;
+ iter->status = 0;
memset(&iter->iomap, 0, sizeof(iter->iomap));
memset(&iter->srcmap, 0, sizeof(iter->srcmap));
- return ret;
+}
+
+/*
+ * Advance the current iterator position and output the length remaining for the
+ * current mapping.
+ */
+int iomap_iter_advance(struct iomap_iter *iter, u64 *count)
+{
+ if (WARN_ON_ONCE(*count > iomap_length(iter)))
+ return -EIO;
+ iter->pos += *count;
+ iter->len -= *count;
+ *count = iomap_length(iter);
+ return 0;
}
static inline void iomap_iter_done(struct iomap_iter *iter)
@@ -50,6 +34,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
+ iter->iter_start_pos = iter->pos;
+
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
if (iter->srcmap.type != IOMAP_HOLE)
trace_iomap_iter_srcmap(iter->inode, &iter->srcmap);
@@ -67,26 +53,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
* function must be called in a loop that continues as long it returns a
* positive value. If 0 or a negative value is returned, the caller must not
* return to the loop body. Within a loop body, there are two ways to break out
- * of the loop body: leave @iter.processed unchanged, or set it to a negative
+ * of the loop body: leave @iter.status unchanged, or set it to a negative
* errno.
*/
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
{
+ bool stale = iter->iomap.flags & IOMAP_F_STALE;
+ ssize_t advanced;
+ u64 olen;
int ret;
- if (iter->iomap.length && ops->iomap_end) {
- ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
- iter->processed > 0 ? iter->processed : 0,
- iter->flags, &iter->iomap);
- if (ret < 0 && !iter->processed)
+ trace_iomap_iter(iter, ops, _RET_IP_);
+
+ if (!iter->iomap.length)
+ goto begin;
+
+ /*
+ * Calculate how far the iter was advanced and the original length bytes
+ * for ->iomap_end().
+ */
+ advanced = iter->pos - iter->iter_start_pos;
+ olen = iter->len + advanced;
+
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(iter->inode, iter->iter_start_pos,
+ iomap_length_trim(iter, iter->iter_start_pos,
+ olen),
+ advanced, iter->flags, &iter->iomap);
+ if (ret < 0 && !advanced)
return ret;
}
- trace_iomap_iter(iter, ops, _RET_IP_);
- ret = iomap_iter_advance(iter);
+ /* detect old return semantics where this would advance */
+ if (WARN_ON_ONCE(iter->status > 0))
+ iter->status = -EIO;
+
+ /*
+ * Use iter->len to determine whether to continue onto the next mapping.
+ * Explicitly terminate on error status or if the current iter has not
+ * advanced at all (i.e. no work was done for some reason) unless the
+ * mapping has been marked stale and needs to be reprocessed.
+ */
+ if (iter->status < 0)
+ ret = iter->status;
+ else if (iter->len == 0 || (!advanced && !stale))
+ ret = 0;
+ else
+ ret = 1;
+ iomap_iter_reset_iomap(iter);
if (ret <= 0)
return ret;
+begin:
ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
&iter->iomap, &iter->srcmap);
if (ret < 0)
diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c
index a845c012b50c..56db2dd4b10d 100644
--- a/fs/iomap/seek.c
+++ b/fs/iomap/seek.c
@@ -3,14 +3,10 @@
* Copyright (C) 2017 Red Hat, Inc.
* Copyright (c) 2018-2021 Christoph Hellwig.
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
+static int iomap_seek_hole_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
@@ -20,13 +16,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter,
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_HOLE);
if (*hole_pos == iter->pos + length)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
case IOMAP_HOLE:
*hole_pos = iter->pos;
return 0;
default:
- return length;
+ return iomap_iter_advance(iter, &length);
}
}
@@ -47,7 +43,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_hole_iter(&iter, &pos);
+ iter.status = iomap_seek_hole_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found hole before EOF */
@@ -56,19 +52,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
}
EXPORT_SYMBOL_GPL(iomap_seek_hole);
-static loff_t iomap_seek_data_iter(const struct iomap_iter *iter,
+static int iomap_seek_data_iter(struct iomap_iter *iter,
loff_t *hole_pos)
{
loff_t length = iomap_length(iter);
switch (iter->iomap.type) {
case IOMAP_HOLE:
- return length;
+ return iomap_iter_advance(iter, &length);
case IOMAP_UNWRITTEN:
*hole_pos = mapping_seek_hole_data(iter->inode->i_mapping,
iter->pos, iter->pos + length, SEEK_DATA);
if (*hole_pos < 0)
- return length;
+ return iomap_iter_advance(iter, &length);
return 0;
default:
*hole_pos = iter->pos;
@@ -93,7 +89,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops)
iter.len = size - pos;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_seek_data_iter(&iter, &pos);
+ iter.status = iomap_seek_data_iter(&iter, &pos);
if (ret < 0)
return ret;
if (iter.len) /* found data before EOF */
diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c
index b90d0eda9e51..0db77c449467 100644
--- a/fs/iomap/swapfile.c
+++ b/fs/iomap/swapfile.c
@@ -3,9 +3,6 @@
* Copyright (C) 2018 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <darrick.wong@oracle.com>
*/
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/swap.h>
@@ -94,7 +91,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str)
* swap only cares about contiguous page-aligned physical extents and makes no
* distinction between written and unwritten extents.
*/
-static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
+static int iomap_swapfile_iter(struct iomap_iter *iter,
struct iomap *iomap, struct iomap_swapfile_info *isi)
{
switch (iomap->type) {
@@ -132,7 +129,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter,
return error;
memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
}
- return iomap_length(iter);
+
+ return iomap_iter_advance_full(iter);
}
/*
@@ -166,7 +164,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
return ret;
while ((ret = iomap_iter(&iter, ops)) > 0)
- iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
+ iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi);
if (ret < 0)
return ret;
diff --git a/fs/iomap/trace.c b/fs/iomap/trace.c
index 728d5443daf5..da217246b1a9 100644
--- a/fs/iomap/trace.c
+++ b/fs/iomap/trace.c
@@ -3,7 +3,6 @@
* Copyright (c) 2019 Christoph Hellwig
*/
#include <linux/iomap.h>
-#include <linux/uio.h>
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 4118a42cdab0..6ad66e6ba653 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -79,7 +79,7 @@ DECLARE_EVENT_CLASS(iomap_range_class,
DEFINE_EVENT(iomap_range_class, name, \
TP_PROTO(struct inode *inode, loff_t off, u64 len),\
TP_ARGS(inode, off, len))
-DEFINE_RANGE_EVENT(iomap_writepage);
+DEFINE_RANGE_EVENT(iomap_writeback_folio);
DEFINE_RANGE_EVENT(iomap_release_folio);
DEFINE_RANGE_EVENT(iomap_invalidate_folio);
DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail);
@@ -99,7 +99,11 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }, \
- { IOMAP_ATOMIC, "ATOMIC" }
+ { IOMAP_OVERWRITE_ONLY, "OVERWRITE_ONLY" }, \
+ { IOMAP_UNSHARE, "UNSHARE" }, \
+ { IOMAP_DAX, "DAX" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }, \
+ { IOMAP_DONTCACHE, "DONTCACHE" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
@@ -107,7 +111,14 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_F_SHARED, "SHARED" }, \
{ IOMAP_F_MERGED, "MERGED" }, \
{ IOMAP_F_BUFFER_HEAD, "BH" }, \
- { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }
+ { IOMAP_F_XATTR, "XATTR" }, \
+ { IOMAP_F_BOUNDARY, "BOUNDARY" }, \
+ { IOMAP_F_ANON_WRITE, "ANON_WRITE" }, \
+ { IOMAP_F_ATOMIC_BIO, "ATOMIC_BIO" }, \
+ { IOMAP_F_PRIVATE, "PRIVATE" }, \
+ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }, \
+ { IOMAP_F_STALE, "STALE" }
+
#define IOMAP_DIO_STRINGS \
{IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \
@@ -138,7 +149,7 @@ DECLARE_EVENT_CLASS(iomap_class,
__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
),
TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx "
- "length 0x%llx type %s flags %s",
+ "length 0x%llx type %s (0x%x) flags %s (0x%x)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
MAJOR(__entry->bdev), MINOR(__entry->bdev),
@@ -146,7 +157,9 @@ DECLARE_EVENT_CLASS(iomap_class,
__entry->offset,
__entry->length,
__print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
- __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+ __entry->type,
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS),
+ __entry->flags)
)
#define DEFINE_IOMAP_EVENT(name) \
@@ -156,7 +169,7 @@ DEFINE_EVENT(iomap_class, name, \
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-TRACE_EVENT(iomap_writepage_map,
+TRACE_EVENT(iomap_add_to_ioend,
TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
struct iomap *iomap),
TP_ARGS(inode, pos, dirty_len, iomap),
@@ -185,7 +198,7 @@ TRACE_EVENT(iomap_writepage_map,
__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
),
TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
- "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+ "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x)",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
MAJOR(__entry->bdev), MINOR(__entry->bdev),
@@ -195,7 +208,9 @@ TRACE_EVENT(iomap_writepage_map,
__entry->offset,
__entry->length,
__print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
- __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+ __entry->type,
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS),
+ __entry->flags)
);
TRACE_EVENT(iomap_iter,
@@ -207,7 +222,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
- __field(s64, processed)
+ __field(int, status)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -217,17 +232,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
- __entry->processed = iter->processed;
+ __entry->status = iter->status;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
- __entry->processed,
+ __entry->status,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index eb2f8273e6f1..09df40b612fb 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -147,7 +147,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *file,
de = tmpde;
}
/* Basic sanity check, whether name doesn't exceed dir entry */
- if (de_len < de->name_len[0] +
+ if (de_len < sizeof(struct iso_directory_record) ||
+ de_len < de->name_len[0] +
sizeof(struct iso_directory_record)) {
printk(KERN_NOTICE "iso9660: Corrupted directory entry"
" in block %lu of inode %lu\n", block,
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index 35768a63fb1d..421d247fae52 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb,
return NULL;
return isofs_export_iget(sb,
- fh_len > 2 ? ifid->parent_block : 0,
+ fh_len > 3 ? ifid->parent_block : 0,
ifid->parent_offset,
fh_len > 4 ? ifid->parent_generation : 0);
}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 47038e660812..6f0e6b19383c 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -939,7 +939,7 @@ root_found:
sbi->s_check = opt->check;
if (table)
- s->s_d_op = &isofs_dentry_ops[table - 1];
+ set_default_d_op(s, &isofs_dentry_ops[table - 1]);
/* get the root dentry */
s->s_root = d_make_root(inode);
@@ -1275,6 +1275,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
unsigned long offset;
struct iso_inode_info *ei = ISOFS_I(inode);
int ret = -EIO;
+ struct timespec64 ts;
block = ei->i_iget5_block;
bh = sb_bread(inode->i_sb, block);
@@ -1387,8 +1388,10 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
- inode_set_mtime_to_ts(inode,
- inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0)));
+ ts = iso_date(de->date, high_sierra ? ISO_DATE_HIGH_SIERRA : 0);
+ inode_set_ctime_to_ts(inode, ts);
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
@@ -1437,9 +1440,16 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_op = &page_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_data.a_ops = &isofs_symlink_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
init_special_inode(inode, inode->i_mode, inode->i_rdev);
+ } else {
+ printk(KERN_DEBUG "ISOFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ ret = -EIO;
+ goto fail;
+ }
ret = 0;
out:
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 2d55207c9a99..506555837533 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -106,7 +106,9 @@ static inline unsigned int isonum_733(u8 *p)
/* Ignore bigendian datum due to broken mastering programs */
return get_unaligned_le32(p);
}
-extern int iso_date(u8 *, int);
+#define ISO_DATE_HIGH_SIERRA (1 << 0)
+#define ISO_DATE_LONG_FORM (1 << 1)
+struct timespec64 iso_date(u8 *p, int flags);
struct inode; /* To make gcc happy */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index dbf911126e61..576498245b9d 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -412,7 +412,12 @@ repeat:
}
}
break;
- case SIG('T', 'F'):
+ case SIG('T', 'F'): {
+ int flags, size, slen;
+
+ flags = rr->u.TF.flags & TF_LONG_FORM ? ISO_DATE_LONG_FORM : 0;
+ size = rr->u.TF.flags & TF_LONG_FORM ? 17 : 7;
+ slen = rr->len - 5;
/*
* Some RRIP writers incorrectly place ctime in the
* TF_CREATE field. Try to handle this correctly for
@@ -420,27 +425,28 @@ repeat:
*/
/* Rock ridge never appears on a High Sierra disk */
cnt = 0;
- if (rr->u.TF.flags & TF_CREATE) {
- inode_set_ctime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_CREATE) && size <= slen) {
+ inode_set_ctime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
- if (rr->u.TF.flags & TF_MODIFY) {
- inode_set_mtime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_MODIFY) && size <= slen) {
+ inode_set_mtime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
- if (rr->u.TF.flags & TF_ACCESS) {
- inode_set_atime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_ACCESS) && size <= slen) {
+ inode_set_atime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
- if (rr->u.TF.flags & TF_ATTRIBUTES) {
- inode_set_ctime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_ATTRIBUTES) && size <= slen) {
+ inode_set_ctime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
break;
+ }
case SIG('S', 'L'):
{
int slen;
diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h
index 7755e587f778..c0856fa9bb6a 100644
--- a/fs/isofs/rock.h
+++ b/fs/isofs/rock.h
@@ -65,13 +65,9 @@ struct RR_PL_s {
__u8 location[8];
};
-struct stamp {
- __u8 time[7]; /* actually 6 unsigned, 1 signed */
-} __attribute__ ((packed));
-
struct RR_TF_s {
__u8 flags;
- struct stamp times[]; /* Variable number of these beasts */
+ __u8 data[];
} __attribute__ ((packed));
/* Linux-specific extension for transparent decompression */
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index e88dba721661..42f479da0b28 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -16,29 +16,44 @@
* to GMT. Thus we should always be correct.
*/
-int iso_date(u8 *p, int flag)
+struct timespec64 iso_date(u8 *p, int flags)
{
int year, month, day, hour, minute, second, tz;
- int crtime;
+ struct timespec64 ts;
+
+ if (flags & ISO_DATE_LONG_FORM) {
+ year = (p[0] - '0') * 1000 +
+ (p[1] - '0') * 100 +
+ (p[2] - '0') * 10 +
+ (p[3] - '0') - 1900;
+ month = ((p[4] - '0') * 10 + (p[5] - '0'));
+ day = ((p[6] - '0') * 10 + (p[7] - '0'));
+ hour = ((p[8] - '0') * 10 + (p[9] - '0'));
+ minute = ((p[10] - '0') * 10 + (p[11] - '0'));
+ second = ((p[12] - '0') * 10 + (p[13] - '0'));
+ ts.tv_nsec = ((p[14] - '0') * 10 + (p[15] - '0')) * 10000000;
+ tz = p[16];
+ } else {
+ year = p[0];
+ month = p[1];
+ day = p[2];
+ hour = p[3];
+ minute = p[4];
+ second = p[5];
+ ts.tv_nsec = 0;
+ /* High sierra has no time zone */
+ tz = flags & ISO_DATE_HIGH_SIERRA ? 0 : p[6];
+ }
- year = p[0];
- month = p[1];
- day = p[2];
- hour = p[3];
- minute = p[4];
- second = p[5];
- if (flag == 0) tz = p[6]; /* High sierra has no time zone */
- else tz = 0;
-
if (year < 0) {
- crtime = 0;
+ ts.tv_sec = 0;
} else {
- crtime = mktime64(year+1900, month, day, hour, minute, second);
+ ts.tv_sec = mktime64(year+1900, month, day, hour, minute, second);
/* sign extend */
if (tz & 0x80)
tz |= (-1 << 8);
-
+
/*
* The timezone offset is unreliable on some disks,
* so we make a sanity check. In no case is it ever
@@ -65,7 +80,7 @@ int iso_date(u8 *p, int flag)
* for pointing out the sign error.
*/
if (-52 <= tz && tz <= 52)
- crtime -= tz * 15 * 60;
+ ts.tv_sec -= tz * 15 * 60;
}
- return crtime;
-}
+ return ts;
+}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index b3971e91e8eb..38861ca04899 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -285,6 +285,7 @@ restart:
retry:
if (batch_count)
__flush_batch(journal, &batch_count);
+ cond_resched();
spin_lock(&journal->j_list_lock);
goto restart;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index e8e80761ac73..7203d2d2624d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -57,8 +57,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
* So here, we have a buffer which has just come off the forget list. Look to
* see if we can strip all buffers from the backing page.
*
- * Called under lock_journal(), and possibly under journal_datalist_lock. The
- * caller provided us with a ref against the buffer, and we drop that here.
+ * Called under j_list_lock. The caller provided us with a ref against the
+ * buffer, and we drop that here.
*/
static void release_buffer_page(struct buffer_head *bh)
{
@@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
seq = cpu_to_be32(sequence);
addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, addr, bh->b_size);
+ csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(csum32, addr, bh->b_size);
kunmap_local(addr);
if (jbd2_has_feature_csum3(j))
@@ -738,10 +738,8 @@ start_journal_io:
err = journal_finish_inode_data_buffers(journal, commit_transaction);
if (err) {
printk(KERN_WARNING
- "JBD2: Detected IO errors while flushing file data "
- "on %s\n", journal->j_devname);
- if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
- jbd2_journal_abort(journal, err);
+ "JBD2: Detected IO errors %d while flushing file data on %s\n",
+ err, journal->j_devname);
err = 0;
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index d8084b31b361..d480b94117cd 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
-EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
+EXPORT_SYMBOL(jbd2_journal_blocks_per_folio);
EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
@@ -115,14 +115,14 @@ void __jbd2_debug(int level, const char *file, const char *func,
#endif
/* Checksumming functions */
-static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+static __be32 jbd2_superblock_csum(journal_superblock_t *sb)
{
__u32 csum;
__be32 old_csum;
old_csum = sb->s_checksum;
sb->s_checksum = 0;
- csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+ csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t));
sb->s_checksum = old_csum;
return cpu_to_be32(csum);
@@ -134,7 +134,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
static void commit_timeout(struct timer_list *t)
{
- journal_t *journal = from_timer(journal, t, j_commit_timer);
+ journal_t *journal = timer_container_of(journal, t, j_commit_timer);
wake_up_process(journal->j_task);
}
@@ -197,7 +197,7 @@ loop:
if (journal->j_commit_sequence != journal->j_commit_request) {
jbd2_debug(1, "OK, requests differ\n");
write_unlock(&journal->j_state_lock);
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
jbd2_journal_commit_transaction(journal);
write_lock(&journal->j_state_lock);
goto loop;
@@ -246,7 +246,7 @@ loop:
goto loop;
end_loop:
- del_timer_sync(&journal->j_commit_timer);
+ timer_delete_sync(&journal->j_commit_timer);
journal->j_task = NULL;
wake_up(&journal->j_wait_done_commit);
jbd2_debug(1, "Journal thread exiting.\n");
@@ -603,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
{
int ret = 0;
- transaction_t *commit_trans;
+ transaction_t *commit_trans, *running_trans;
if (!(journal->j_flags & JBD2_BARRIER))
return 0;
@@ -613,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid)
goto out;
commit_trans = journal->j_committing_transaction;
if (!commit_trans || commit_trans->t_tid != tid) {
+ running_trans = journal->j_running_transaction;
+ /*
+ * The query transaction hasn't started committing,
+ * it must still be running.
+ */
+ if (WARN_ON_ONCE(!running_trans ||
+ running_trans->t_tid != tid))
+ goto out;
+
+ running_trans->t_need_data_flush = 1;
ret = 1;
goto out;
}
@@ -718,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
}
journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
write_unlock(&journal->j_state_lock);
- jbd2_journal_lock_updates(journal);
return 0;
}
@@ -732,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
if (journal->j_fc_cleanup_callback)
journal->j_fc_cleanup_callback(journal, 0, tid);
- jbd2_journal_unlock_updates(journal);
write_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
if (fallback)
@@ -947,7 +955,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
* descriptor blocks we do need to generate bona fide buffers.
*
* After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying
- * the buffer's contents they really should run flush_dcache_page(bh->b_page).
+ * the buffer's contents they really should run flush_dcache_folio(bh->b_folio).
* But we don't bother doing that, so there will be coherency problems with
* mmaps of blockdevs which hold live JBD-controlled filesystems.
*/
@@ -992,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -1361,7 +1369,7 @@ static int journal_check_superblock(journal_t *journal)
return err;
}
- if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ if (jbd2_journal_has_csum_v2or3(journal) &&
jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
@@ -1369,14 +1377,14 @@ static int journal_check_superblock(journal_t *journal)
return err;
}
- if (jbd2_journal_has_csum_v2or3_feature(journal)) {
+ if (jbd2_journal_has_csum_v2or3(journal)) {
if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) {
printk(KERN_ERR "JBD2: Unknown checksum type\n");
return err;
}
/* Check superblock checksum */
- if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ if (sb->s_checksum != jbd2_superblock_csum(sb)) {
printk(KERN_ERR "JBD2: journal checksum error\n");
err = -EFSBADCRC;
return err;
@@ -1482,7 +1490,7 @@ static int journal_load_superblock(journal_t *journal)
journal->j_total_len = be32_to_cpu(sb->s_maxlen);
/* Precompute checksum seed for all metadata */
if (jbd2_journal_has_csum_v2or3(journal))
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
sizeof(sb->s_uuid));
/* After journal features are set, we can compute transaction limits */
jbd2_journal_init_transaction_limits(journal);
@@ -1811,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
set_buffer_uptodate(bh);
}
if (jbd2_journal_has_csum_v2or3(journal))
- sb->s_checksum = jbd2_superblock_csum(journal, sb);
+ sb->s_checksum = jbd2_superblock_csum(sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
submit_bh(REQ_OP_WRITE | write_flags, bh);
@@ -1869,7 +1877,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
/* Log is no longer empty */
write_lock(&journal->j_state_lock);
- WARN_ON(!sb->s_sequence);
journal->j_flags &= ~JBD2_FLUSHED;
write_unlock(&journal->j_state_lock);
@@ -1965,17 +1972,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
return err;
}
- if (block_start == ~0ULL) {
- block_start = phys_block;
- block_stop = block_start - 1;
- }
+ if (block_start == ~0ULL)
+ block_stop = block_start = phys_block;
/*
* last block not contiguous with current block,
* process last contiguous region and return to this block on
* next loop
*/
- if (phys_block != block_stop + 1) {
+ if (phys_block != block_stop) {
block--;
} else {
block_stop++;
@@ -1994,11 +1999,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
*/
byte_start = block_start * journal->j_blocksize;
byte_stop = block_stop * journal->j_blocksize;
- byte_count = (block_stop - block_start + 1) *
- journal->j_blocksize;
+ byte_count = (block_stop - block_start) * journal->j_blocksize;
truncate_inode_pages_range(journal->j_dev->bd_mapping,
- byte_start, byte_stop);
+ byte_start, byte_stop - 1);
if (flags & JBD2_JOURNAL_FLUSH_DISCARD) {
err = blkdev_issue_discard(journal->j_dev,
@@ -2013,7 +2017,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags)
}
if (unlikely(err != 0)) {
- pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu",
+ pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)",
err, block_start, block_stop);
return err;
}
@@ -2332,7 +2336,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
sizeof(sb->s_uuid));
}
@@ -2651,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal)
write_unlock(&journal->j_state_lock);
}
-int jbd2_journal_blocks_per_page(struct inode *inode)
+int jbd2_journal_blocks_per_folio(struct inode *inode)
{
- return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) -
+ inode->i_sb->s_blocksize_bits);
}
/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 9192be7c19d8..cac8c2cd4a92 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -39,7 +39,7 @@ struct recovery_info
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass);
-static int scan_revoke_records(journal_t *, struct buffer_head *,
+static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *,
tid_t, struct recovery_info *);
#ifdef __KERNEL__
@@ -65,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n)
*/
#define MAXBUF 8
-static int do_readahead(journal_t *journal, unsigned int start)
+static void do_readahead(journal_t *journal, unsigned int start)
{
- int err;
unsigned int max, nbufs, next;
unsigned long long blocknr;
struct buffer_head *bh;
@@ -85,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start)
nbufs = 0;
for (next = start; next < max; next++) {
- err = jbd2_journal_bmap(journal, next, &blocknr);
+ int err = jbd2_journal_bmap(journal, next, &blocknr);
if (err) {
printk(KERN_ERR "JBD2: bad block at offset %u\n",
@@ -94,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start)
}
bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
- if (!bh) {
- err = -ENOMEM;
+ if (!bh)
goto failed;
- }
if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
bufs[nbufs++] = bh;
@@ -112,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start)
if (nbufs)
bh_readahead_batch(nbufs, bufs, 0);
- err = 0;
failed:
if (nbufs)
journal_brelse_array(bufs, nbufs);
- return err;
}
#endif /* __KERNEL__ */
@@ -190,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
j->j_blocksize - sizeof(struct jbd2_journal_block_tail));
provided = tail->t_checksum;
tail->t_checksum = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize);
tail->t_checksum = provided;
return provided == cpu_to_be32(calculated);
@@ -287,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal,
int jbd2_journal_recover(journal_t *journal)
{
int err, err2;
- journal_superblock_t * sb;
-
struct recovery_info info;
memset(&info, 0, sizeof(info));
- sb = journal->j_superblock;
/*
* The journal superblock's s_start field (the current log head)
* is always zero if, and only if, the journal was cleanly
- * unmounted.
+ * unmounted. We use its in-memory version j_tail here because
+ * jbd2_journal_wipe() could have updated it without updating journal
+ * superblock.
*/
- if (!sb->s_start) {
+ if (!journal->j_tail) {
+ journal_superblock_t *sb = journal->j_superblock;
+
jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n",
be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head));
journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
@@ -327,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal)
journal->j_transaction_sequence, journal->j_head);
jbd2_journal_clear_revoke(journal);
+ /* Free revoke table allocated for replay */
+ if (journal->j_revoke != journal->j_revoke_table[0] &&
+ journal->j_revoke != journal->j_revoke_table[1]) {
+ jbd2_journal_destroy_revoke_table(journal->j_revoke);
+ journal->j_revoke = journal->j_revoke_table[1];
+ }
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
@@ -438,7 +440,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
h = buf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize);
h->h_chksum[0] = provided;
return provided == cpu_to_be32(calculated);
@@ -459,7 +461,7 @@ static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
h = tmpbuf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize);
kfree(tmpbuf);
return provided == cpu_to_be32(calculated);
@@ -476,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return 1;
seq = cpu_to_be32(sequence);
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
+ csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(csum32, buf, j->j_blocksize);
if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
@@ -612,6 +614,31 @@ static int do_one_pass(journal_t *journal,
first_commit_ID = next_commit_ID;
if (pass == PASS_SCAN)
info->start_transaction = first_commit_ID;
+ else if (pass == PASS_REVOKE) {
+ /*
+ * Would the default revoke table have too long hash chains
+ * during replay?
+ */
+ if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) {
+ unsigned int hash_size;
+
+ /*
+ * Aim for average chain length of 8, limit at 1M
+ * entries to avoid problems with malicious
+ * filesystems.
+ */
+ hash_size = min(roundup_pow_of_two(info->nr_revokes / 8),
+ 1U << 20);
+ journal->j_revoke =
+ jbd2_journal_init_revoke_table(hash_size);
+ if (!journal->j_revoke) {
+ printk(KERN_ERR
+ "JBD2: failed to allocate revoke table for replay with %u entries. "
+ "Journal replay may be slow.\n", hash_size);
+ journal->j_revoke = journal->j_revoke_table[1];
+ }
+ }
+ }
jbd2_debug(1, "Starting recovery pass %d\n", pass);
@@ -852,6 +879,13 @@ chksum_ok:
case JBD2_REVOKE_BLOCK:
/*
+ * If we aren't in the SCAN or REVOKE pass, then we can
+ * just skip over this block.
+ */
+ if (pass != PASS_REVOKE && pass != PASS_SCAN)
+ continue;
+
+ /*
* Check revoke block crc in pass_scan, if csum verify
* failed, check commit block time later.
*/
@@ -863,12 +897,7 @@ chksum_ok:
need_check_commit_time = true;
}
- /* If we aren't in the REVOKE pass, then we can
- * just skip over this block. */
- if (pass != PASS_REVOKE)
- continue;
-
- err = scan_revoke_records(journal, bh,
+ err = scan_revoke_records(journal, pass, bh,
next_commit_ID, info);
if (err)
goto failed;
@@ -922,8 +951,9 @@ chksum_ok:
/* Scan a revoke record, marking all blocks mentioned as revoked. */
-static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
- tid_t sequence, struct recovery_info *info)
+static int scan_revoke_records(journal_t *journal, enum passtype pass,
+ struct buffer_head *bh, tid_t sequence,
+ struct recovery_info *info)
{
jbd2_journal_revoke_header_t *header;
int offset, max;
@@ -944,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
if (jbd2_has_feature_64bit(journal))
record_len = 8;
+ if (pass == PASS_SCAN) {
+ info->nr_revokes += (max - offset) / record_len;
+ return 0;
+ }
+
while (offset + record_len <= max) {
unsigned long long blocknr;
int err;
@@ -956,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
err = jbd2_journal_set_revoke(journal, blocknr, sequence);
if (err)
return err;
- ++info->nr_revokes;
}
return 0;
}
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index ce63d5fde9c3..1467f6790747 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void)
return 0;
}
-static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
+struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
{
int shift = 0;
int tmp = hash_size;
@@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size)
table->hash_size = hash_size;
table->hash_shift = shift;
table->hash_table =
- kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
+ kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
kmem_cache_free(jbd2_revoke_table_cache, table);
table = NULL;
@@ -245,7 +245,7 @@ out:
return table;
}
-static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
+void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
{
int i;
struct list_head *hash_list;
@@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table)
J_ASSERT(list_empty(hash_list));
}
- kfree(table->hash_table);
+ kvfree(table->hash_table);
kmem_cache_free(jbd2_revoke_table_cache, table);
}
@@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
bh = bh_in;
if (!bh) {
- bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
/* If there is a different buffer_head lying around in
* memory anywhere... */
- bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
+ bh2 = __find_get_block_nonatomic(bdev, blocknr,
+ journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if (bh2 != bh && buffer_revokevalid(bh2))
@@ -420,12 +422,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
*/
-int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
struct jbd2_revoke_record_s *record;
journal_t *journal = handle->h_transaction->t_journal;
int need_cancel;
- int did_revoke = 0; /* akpm: debug */
struct buffer_head *bh = jh2bh(jh);
jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
@@ -450,7 +451,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
list_del(&record->hash);
spin_unlock(&journal->j_revoke_lock);
kmem_cache_free(jbd2_revoke_record_cache, record);
- did_revoke = 1;
}
}
@@ -466,18 +466,18 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
- bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
+ bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_buffer_revoked(bh2);
__brelse(bh2);
}
}
- return did_revoke;
}
/*
- * journal_clear_revoked_flag clears revoked flag of buffers in
+ * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in
* revoke table to reflect there is no revoked buffers in the next
* transaction which is going to be started.
*/
@@ -495,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
struct jbd2_revoke_record_s *record;
struct buffer_head *bh;
record = (struct jbd2_revoke_record_s *)list_entry;
- bh = __find_get_block(journal->j_fs_dev,
- record->blocknr,
- journal->j_blocksize);
+ bh = __find_get_block_nonatomic(journal->j_fs_dev,
+ record->blocknr,
+ journal->j_blocksize);
if (bh) {
clear_buffer_revoked(bh);
__brelse(bh);
@@ -506,9 +506,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal)
}
}
-/* journal_switch_revoke table select j_revoke for next transaction
- * we do not want to suspend any processing until all revokes are
- * written -bzzz
+/* jbd2_journal_switch_revoke_table table select j_revoke for next
+ * transaction we do not want to suspend any processing until all
+ * revokes are written -bzzz
*/
void jbd2_journal_switch_revoke_table(journal_t *journal)
{
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 66513c18ca29..c7867139af69 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -92,7 +92,6 @@ static void jbd2_get_transaction(journal_t *journal,
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
INIT_LIST_HEAD(&transaction->t_inode_list);
- INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires);
@@ -114,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal,
*/
/*
- * Update transaction's maximum wait time, if debugging is enabled.
- *
* t_max_wait is carefully updated here with use of atomic compare exchange.
* Note that there could be multiplre threads trying to do this simultaneously
* hence using cmpxchg to avoid any use of locks in this case.
- * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
*/
static inline void update_t_max_wait(transaction_t *transaction,
unsigned long ts)
@@ -1513,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
jh->b_next_transaction == transaction);
spin_unlock(&jh->b_state_lock);
}
- if (jh->b_modified == 1) {
+ if (data_race(jh->b_modified == 1)) {
/* If it's in our transaction it must be in BJ_Metadata list. */
if (data_race(jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)) {
@@ -1532,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
goto out;
}
- journal = transaction->t_journal;
spin_lock(&jh->b_state_lock);
if (is_handle_aborted(handle)) {
@@ -1547,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
goto out_unlock_bh;
}
+ journal = transaction->t_journal;
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
@@ -2079,21 +2076,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
jh->b_transaction = NULL;
}
-void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh)
-{
- struct buffer_head *bh = jh2bh(jh);
-
- /* Get reference so that buffer cannot be freed before we unlock it */
- get_bh(bh);
- spin_lock(&jh->b_state_lock);
- spin_lock(&journal->j_list_lock);
- __jbd2_journal_unfile_buffer(jh);
- spin_unlock(&journal->j_list_lock);
- spin_unlock(&jh->b_state_lock);
- jbd2_journal_put_journal_head(jh);
- __brelse(bh);
-}
-
/**
* jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
@@ -2192,7 +2174,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction)
/*
* We don't want to write the buffer anymore, clear the
* bit so that we don't confuse checks in
- * __journal_file_buffer
+ * __jbd2_journal_file_buffer
*/
clear_buffer_dirty(bh);
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 2b2938970da3..dd91f725ded6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -32,8 +32,8 @@ static int jffs2_link (struct dentry *,struct inode *,struct dentry *);
static int jffs2_unlink (struct inode *,struct dentry *);
static int jffs2_symlink (struct mnt_idmap *, struct inode *,
struct dentry *, const char *);
-static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *,
- umode_t);
+static struct dentry *jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *,
+ umode_t);
static int jffs2_rmdir (struct inode *,struct dentry *);
static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *,
umode_t,dev_t);
@@ -446,8 +446,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i,
}
-static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
- struct dentry *dentry, umode_t mode)
+static struct dentry *jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
+ struct dentry *dentry, umode_t mode)
{
struct jffs2_inode_info *f, *dir_f;
struct jffs2_sb_info *c;
@@ -464,7 +464,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
ri = jffs2_alloc_raw_inode();
if (!ri)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
c = JFFS2_SB_INFO(dir_i->i_sb);
@@ -477,7 +477,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
if (ret) {
jffs2_free_raw_inode(ri);
- return ret;
+ return ERR_PTR(ret);
}
inode = jffs2_new_inode(dir_i, mode, ri);
@@ -485,7 +485,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
if (IS_ERR(inode)) {
jffs2_free_raw_inode(ri);
jffs2_complete_reservation(c);
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
}
inode->i_op = &jffs2_dir_inode_operations;
@@ -584,11 +584,11 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i,
jffs2_complete_reservation(c);
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
fail:
iget_failed(inode);
- return ret;
+ return ERR_PTR(ret);
}
static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ef3a1e1b6cb0..fda9f4d6093f 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -425,7 +425,9 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
.totlen = cpu_to_je32(c->cleanmarker_size)
};
- jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ if (ret)
+ goto filebad;
marker.hdr_crc = cpu_to_je32(crc32(0, &marker, sizeof(struct jffs2_unknown_node)-4));
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 13c18ccc13b0..dd3dff95cb24 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -21,12 +21,14 @@
#include <linux/jffs2.h>
#include "nodelist.h"
-static int jffs2_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata);
-static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata);
+static int jffs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata);
+static int jffs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata);
static int jffs2_read_folio(struct file *filp, struct folio *folio);
int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
@@ -54,7 +56,7 @@ const struct file_operations jffs2_file_operations =
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.unlocked_ioctl=jffs2_ioctl,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.fsync = jffs2_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
@@ -121,9 +123,10 @@ static int jffs2_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int jffs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
struct folio *folio;
struct inode *inode = mapping->host;
@@ -235,9 +238,10 @@ out_err:
return ret;
}
-static int jffs2_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int jffs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
/* Actually commit the write from the page cache page we're looking at.
* For now, we write the full page out each time. It sucks, but it's simple
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 29671e33a171..62879c218d4b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -256,7 +256,9 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
jffs2_dbg(1, "%s(): Skipping %d bytes in nextblock to ensure page alignment\n",
__func__, skip);
- jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
+ if (ret)
+ goto out;
jffs2_scan_dirty_space(c, c->nextblock, skip);
}
#endif
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 4fe64519870f..d83372d3e1a0 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -858,7 +858,10 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
spin_unlock(&c->erase_completion_lock);
jeb = c->nextblock;
- jffs2_prealloc_raw_node_refs(c, jeb, 1);
+ ret = jffs2_prealloc_raw_node_refs(c, jeb, 1);
+
+ if (ret)
+ goto out;
if (!c->summary->sum_num || !c->summary->sum_list_head) {
JFFS2_WARNING("Empty summary info!!!\n");
@@ -872,6 +875,8 @@ int jffs2_sum_write_sumnode(struct jffs2_sb_info *c)
datasize += padsize;
ret = jffs2_sum_write_data(c, jeb, infosize, datasize, padsize);
+
+out:
spin_lock(&c->erase_completion_lock);
return ret;
}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 4061e0ba7010..bb815a002984 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -584,7 +584,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
size_t retlen;
/* Nothing to do if not write-buffering the flash. In particular, we shouldn't
- del_timer() the timer we never initialised. */
+ call timer_delete() on the timer we never initialised. */
if (!jffs2_is_writebuffered(c))
return 0;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 01b6912e60f8..2a4a288b821c 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -44,6 +44,9 @@ static int jfs_open(struct inode *inode, struct file *file)
{
int rc;
+ if (S_ISREG(inode->i_mode) && inode->i_size < 0)
+ return -EIO;
+
if ((rc = dquot_file_open(inode, file)))
return rc;
@@ -143,7 +146,7 @@ const struct file_operations jfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.fsync = jfs_fsync,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 07cfdc440596..fcedeb514e14 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -145,9 +145,9 @@ void jfs_evict_inode(struct inode *inode)
if (!inode->i_nlink && !is_bad_inode(inode)) {
dquot_initialize(inode);
+ truncate_inode_pages_final(&inode->i_data);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
- truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
@@ -290,9 +290,10 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int jfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int jfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -303,13 +304,14 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int jfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int jfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
jfs_write_failed(mapping, pos + len);
return ret;
@@ -369,7 +371,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length)
ASSERT(length >= 0);
- if (test_cflag(COMMIT_Nolink, ip)) {
+ if (test_cflag(COMMIT_Nolink, ip) || isReadOnly(ip)) {
xtTruncate(0, ip, length, COMMIT_WMAP);
return;
}
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index f7bd7e8f5be4..563f148be8af 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -57,7 +57,7 @@ static long jfs_map_ext2(unsigned long flags, int from)
return mapped;
}
-int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct jfs_inode_info *jfs_inode = JFS_IP(d_inode(dentry));
unsigned int flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
@@ -71,7 +71,7 @@ int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int jfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct jfs_inode_info *jfs_inode = JFS_IP(inode);
diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c
index 5f4b305030ad..4b660296caf3 100644
--- a/fs/jfs/jfs_discard.c
+++ b/fs/jfs/jfs_discard.c
@@ -86,7 +86,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
down_read(&sb->s_umount);
bmp = JFS_SBI(ip->i_sb)->bmap;
- if (minlen > bmp->db_agsize ||
+ if (bmp == NULL ||
+ minlen > bmp->db_agsize ||
start >= bmp->db_mapsize ||
range->len < sb->s_blocksize) {
up_read(&sb->s_umount);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index f9009e4f9ffd..cdfa699cd7c8 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -178,41 +178,30 @@ int dbMount(struct inode *ipbmap)
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
-
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
- if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
- bmp->db_l2nbperpage < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
- if (!bmp->db_numag || bmp->db_numag > MAXAG) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
- if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
- bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
-
bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
- if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG ||
- bmp->db_agl2size < 0) {
- err = -EINVAL;
- goto err_release_metapage;
- }
- if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ if ((bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) ||
+ (bmp->db_l2nbperpage < 0) ||
+ !bmp->db_numag || (bmp->db_numag > MAXAG) ||
+ (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) ||
+ (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) ||
+ (bmp->db_agheight < 0) || (bmp->db_agheight > (L2LPERCTL >> 1)) ||
+ (bmp->db_agwidth < 1) || (bmp->db_agwidth > (LPERCTL / MAXAG)) ||
+ (bmp->db_agwidth > (1 << (L2LPERCTL - (bmp->db_agheight << 1)))) ||
+ (bmp->db_agstart < 0) ||
+ (bmp->db_agstart > (CTLTREESIZE - 1 - bmp->db_agwidth * (MAXAG - 1))) ||
+ (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) ||
+ (bmp->db_agl2size < 0) ||
+ ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
err = -EINVAL;
goto err_release_metapage;
}
@@ -1400,6 +1389,12 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
(1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
+ if (ti < 0 || ti >= le32_to_cpu(dcp->nleafs)) {
+ jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n");
+ release_metapage(mp);
+ return -EIO;
+ }
+
/* dmap control page trees fan-out by 4 and a single allocation
* group may be described by 1 or 2 subtrees within the ag level
* dmap control page, depending upon the ag size. examine the ag's
@@ -1820,8 +1815,10 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
return -EIO;
dp = (struct dmap *) mp->data;
- if (dp->tree.budmin < 0)
+ if (dp->tree.budmin < 0) {
+ release_metapage(mp);
return -EIO;
+ }
/* try to allocate the blocks.
*/
@@ -3403,7 +3400,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
oldl2agsize = bmp->db_agl2size;
bmp->db_agl2size = l2agsize;
- bmp->db_agsize = 1 << l2agsize;
+ bmp->db_agsize = (s64)1 << l2agsize;
/* compute new number of AG */
agno = bmp->db_numag;
@@ -3666,8 +3663,8 @@ void dbFinalizeBmap(struct inode *ipbmap)
* system size is not a multiple of the group size).
*/
inactfree = (inactags && ag_rem) ?
- ((inactags - 1) << bmp->db_agl2size) + ag_rem
- : inactags << bmp->db_agl2size;
+ (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem
+ : ((s64)inactags << bmp->db_agl2size);
/* determine how many free blocks are in the active
* allocation groups plus the average number of free blocks
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 8f85177f284b..ab11849cf9cc 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -117,7 +117,8 @@ do { \
if (!(RC)) { \
if (((P)->header.nextindex > \
(((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \
- ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \
+ ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \
+ ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \
BT_PUTPAGE(MP); \
jfs_error((IP)->i_sb, \
"DT_GETPAGE: dtree page corrupt\n"); \
@@ -2612,7 +2613,7 @@ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
* fsck.jfs should really fix this, but it currently does not.
* Called from jfs_readdir when bad index is detected.
*/
-static void add_missing_indices(struct inode *inode, s64 bn)
+static int add_missing_indices(struct inode *inode, s64 bn)
{
struct ldtentry *d;
struct dt_lock *dtlck;
@@ -2621,7 +2622,7 @@ static void add_missing_indices(struct inode *inode, s64 bn)
struct lv *lv;
struct metapage *mp;
dtpage_t *p;
- int rc;
+ int rc = 0;
s8 *stbl;
tid_t tid;
struct tlock *tlck;
@@ -2646,6 +2647,16 @@ static void add_missing_indices(struct inode *inode, s64 bn)
stbl = DT_GETSTBL(p);
for (i = 0; i < p->header.nextindex; i++) {
+ if (stbl[i] < 0) {
+ jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld",
+ i, stbl[i], (long)inode->i_ino, (long long)bn);
+ rc = -EIO;
+
+ DT_PUTPAGE(mp);
+ txAbort(tid, 0);
+ goto end;
+ }
+
d = (struct ldtentry *) &p->slot[stbl[i]];
index = le32_to_cpu(d->index);
if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
@@ -2663,6 +2674,7 @@ static void add_missing_indices(struct inode *inode, s64 bn)
(void) txCommit(tid, 1, &inode, 0);
end:
txEnd(tid);
+ return rc;
}
/*
@@ -3016,7 +3028,8 @@ skip_one:
}
if (fix_page) {
- add_missing_indices(ip, bn);
+ if ((rc = add_missing_indices(ip, bn)))
+ goto out;
page_fixed = 1;
}
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 63d21822d309..46529bcc8297 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -74,6 +74,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
int rc;
int xflag;
+ if (isReadOnly(ip)) {
+ jfs_error(ip->i_sb, "read-only filesystem\n");
+ return -EIO;
+ }
+
/* This blocks if we are low on resources */
txBeginAnon(ip->i_sb);
@@ -253,6 +258,11 @@ int extRecord(struct inode *ip, xad_t * xp)
{
int rc;
+ if (isReadOnly(ip)) {
+ jfs_error(ip->i_sb, "read-only filesystem\n");
+ return -EIO;
+ }
+
txBeginAnon(ip->i_sb);
mutex_lock(&JFS_IP(ip)->commit_mutex);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index a360b24ed320..ecb8e05b8b84 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -102,7 +102,7 @@ int diMount(struct inode *ipimap)
* allocate/initialize the in-memory inode map control structure
*/
/* allocate the in-memory inode map control structure. */
- imap = kmalloc(sizeof(struct inomap), GFP_KERNEL);
+ imap = kzalloc(sizeof(struct inomap), GFP_KERNEL);
if (imap == NULL)
return -ENOMEM;
@@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
dp += inum % 8; /* 8 inodes per 4K page */
/* copy on-disk inode to in-memory inode */
- if ((copy_from_dinode(dp, ip)) != 0) {
+ if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) {
/* handle bad return by returning NULL for ip */
set_nlink(ip, 1); /* Don't want iput() deleting it */
iput(ip);
@@ -3029,14 +3029,23 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno,
*
* RETURN VALUES:
* 0 - success
- * -ENOMEM - insufficient memory
+ * -EINVAL - unexpected inode type
*/
static int copy_from_dinode(struct dinode * dip, struct inode *ip)
{
struct jfs_inode_info *jfs_ip = JFS_IP(ip);
struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
+ int fileset = le32_to_cpu(dip->di_fileset);
+
+ switch (fileset) {
+ case AGGR_RESERVED_I: case AGGREGATE_I: case BMAP_I:
+ case LOG_I: case BADBLOCK_I: case FILESYSTEM_I:
+ break;
+ default:
+ return -EINVAL;
+ }
- jfs_ip->fileset = le32_to_cpu(dip->di_fileset);
+ jfs_ip->fileset = fileset;
jfs_ip->mode2 = le32_to_cpu(dip->di_mode);
jfs_set_inode_flags(ip);
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index ea80661597ac..2c6c81c8cb9f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -9,9 +9,9 @@ struct fid;
extern struct inode *ialloc(struct inode *, umode_t);
extern int jfs_fsync(struct file *, loff_t, loff_t, int);
-extern int jfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+extern int jfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
extern int jfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
extern struct inode *jfs_iget(struct super_block *, unsigned long);
extern int jfs_commit_inode(struct inode *, int);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index df575a873ec6..b98cf3bb6c1f 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -15,6 +15,7 @@
#include <linux/mempool.h>
#include <linux/seq_file.h>
#include <linux/writeback.h>
+#include <linux/migrate.h>
#include "jfs_incore.h"
#include "jfs_superblock.h"
#include "jfs_filsys.h"
@@ -151,7 +152,59 @@ static inline void dec_io(struct folio *folio, blk_status_t status,
handler(folio, anchor->status);
}
+#ifdef CONFIG_MIGRATION
+static int __metapage_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ struct meta_anchor *src_anchor = src->private;
+ struct metapage *mps[MPS_PER_PAGE] = {0};
+ struct metapage *mp;
+ int i, rc;
+
+ for (i = 0; i < MPS_PER_PAGE; i++) {
+ mp = src_anchor->mp[i];
+ if (mp && metapage_locked(mp))
+ return -EAGAIN;
+ }
+
+ rc = filemap_migrate_folio(mapping, dst, src, mode);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ for (i = 0; i < MPS_PER_PAGE; i++) {
+ mp = src_anchor->mp[i];
+ if (!mp)
+ continue;
+ if (unlikely(insert_metapage(dst, mp))) {
+ /* If error, roll-back previosly inserted pages */
+ for (int j = 0 ; j < i; j++) {
+ if (mps[j])
+ remove_metapage(dst, mps[j]);
+ }
+ return -EAGAIN;
+ }
+ mps[i] = mp;
+ }
+
+ /* Update the metapage and remove it from src */
+ for (i = 0; i < MPS_PER_PAGE; i++) {
+ mp = mps[i];
+ if (mp) {
+ int page_offset = mp->data - folio_address(src);
+
+ mp->data = folio_address(dst) + page_offset;
+ mp->folio = dst;
+ remove_metapage(src, mp);
+ }
+ }
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif /* CONFIG_MIGRATION */
+
#else
+
static inline struct metapage *folio_to_mp(struct folio *folio, int offset)
{
return folio->private;
@@ -175,6 +228,35 @@ static inline void remove_metapage(struct folio *folio, struct metapage *mp)
#define inc_io(folio) do {} while(0)
#define dec_io(folio, status, handler) handler(folio, status)
+#ifdef CONFIG_MIGRATION
+static int __metapage_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ struct metapage *mp;
+ int page_offset;
+ int rc;
+
+ mp = folio_to_mp(src, 0);
+ if (metapage_locked(mp))
+ return -EAGAIN;
+
+ rc = filemap_migrate_folio(mapping, dst, src, mode);
+ if (rc != MIGRATEPAGE_SUCCESS)
+ return rc;
+
+ if (unlikely(insert_metapage(dst, mp)))
+ return -EAGAIN;
+
+ page_offset = mp->data - folio_address(src);
+ mp->data = folio_address(dst) + page_offset;
+ mp->folio = dst;
+ remove_metapage(src, mp);
+
+ return MIGRATEPAGE_SUCCESS;
+}
+#endif /* CONFIG_MIGRATION */
+
#endif
static inline struct metapage *alloc_metapage(gfp_t gfp_mask)
@@ -339,7 +421,7 @@ static void metapage_write_end_io(struct bio *bio)
}
static int metapage_write_folio(struct folio *folio,
- struct writeback_control *wbc, void *unused)
+ struct writeback_control *wbc)
{
struct bio *bio = NULL;
int block_offset; /* block offset of mp within page */
@@ -468,10 +550,12 @@ static int metapage_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct blk_plug plug;
+ struct folio *folio = NULL;
int err;
blk_start_plug(&plug);
- err = write_cache_pages(mapping, wbc, metapage_write_folio, NULL);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err)))
+ err = metapage_write_folio(folio, wbc);
blk_finish_plug(&plug);
return err;
@@ -554,6 +638,29 @@ static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask)
return ret;
}
+#ifdef CONFIG_MIGRATION
+/*
+ * metapage_migrate_folio - Migration function for JFS metapages
+ */
+static int metapage_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ int expected_count;
+
+ if (!src->private)
+ return filemap_migrate_folio(mapping, dst, src, mode);
+
+ /* Check whether page does not have extra refs before we do more work */
+ expected_count = folio_expected_ref_count(src) + 1;
+ if (folio_ref_count(src) != expected_count)
+ return -EAGAIN;
+ return __metapage_migrate_folio(mapping, dst, src, mode);
+}
+#else
+#define metapage_migrate_folio NULL
+#endif /* CONFIG_MIGRATION */
+
static void metapage_invalidate_folio(struct folio *folio, size_t offset,
size_t length)
{
@@ -570,6 +677,7 @@ const struct address_space_operations jfs_metapage_aops = {
.release_folio = metapage_release_folio,
.invalidate_folio = metapage_invalidate_folio,
.dirty_folio = filemap_dirty_folio,
+ .migrate_folio = metapage_migrate_folio,
};
struct metapage *__get_metapage(struct inode *inode, unsigned long lblock,
@@ -707,7 +815,7 @@ static int metapage_write_one(struct folio *folio)
if (folio_clear_dirty_for_io(folio)) {
folio_get(folio);
- ret = metapage_write_folio(folio, &wbc, NULL);
+ ret = metapage_write_folio(folio, &wbc);
if (ret == 0)
folio_wait_writeback(folio);
folio_put(folio);
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5ee618d17e77..28c3cf960c6f 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -49,26 +49,6 @@
#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot)
-/* get page buffer for specified block address */
-/* ToDo: Replace this ugly macro with a function */
-#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \
-do { \
- BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \
- if (!(RC)) { \
- if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \
- (le16_to_cpu((P)->header.nextindex) > \
- le16_to_cpu((P)->header.maxentry)) || \
- (le16_to_cpu((P)->header.maxentry) > \
- (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \
- jfs_error((IP)->i_sb, \
- "XT_GETPAGE: xtree page corrupt\n"); \
- BT_PUTPAGE(MP); \
- MP = NULL; \
- RC = -EIO; \
- } \
- } \
-} while (0)
-
/* for consistency */
#define XT_PUTPAGE(MP) BT_PUTPAGE(MP)
@@ -115,6 +95,42 @@ static int xtSplitRoot(tid_t tid, struct inode *ip,
struct xtsplit * split, struct metapage ** rmpp);
/*
+ * xt_getpage()
+ *
+ * function: get the page buffer for a specified block address.
+ *
+ * parameters:
+ * ip - pointer to the inode
+ * bn - block number (s64) of the xtree page to be retrieved;
+ * mp - pointer to a metapage pointer where the page buffer is returned;
+ *
+ * returns:
+ * A pointer to the xtree page (xtpage_t) on success, -EIO on error.
+ */
+
+static inline xtpage_t *xt_getpage(struct inode *ip, s64 bn, struct metapage **mp)
+{
+ xtpage_t *p;
+ int rc;
+
+ BT_GETPAGE(ip, bn, *mp, xtpage_t, PSIZE, p, rc, i_xtroot);
+
+ if (rc)
+ return ERR_PTR(rc);
+ if ((le16_to_cpu(p->header.nextindex) < XTENTRYSTART) ||
+ (le16_to_cpu(p->header.nextindex) >
+ le16_to_cpu(p->header.maxentry)) ||
+ (le16_to_cpu(p->header.maxentry) >
+ ((bn == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) {
+ jfs_error(ip->i_sb, "xt_getpage: xtree page corrupt\n");
+ BT_PUTPAGE(*mp);
+ *mp = NULL;
+ return ERR_PTR(-EIO);
+ }
+ return p;
+}
+
+/*
* xtLookup()
*
* function: map a single page into a physical extent;
@@ -216,7 +232,6 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
int *cmpp, struct btstack * btstack, int flag)
{
struct jfs_inode_info *jfs_ip = JFS_IP(ip);
- int rc = 0;
int cmp = 1; /* init for empty page */
s64 bn; /* block number */
struct metapage *mp; /* page buffer */
@@ -252,9 +267,9 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
*/
for (bn = 0;;) {
/* get/pin the page to search */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/* try sequential access heuristics with the previous
* access entry in target leaf page:
@@ -807,10 +822,10 @@ xtSplitUp(tid_t tid,
* insert router entry in parent for new right child page <rp>
*/
/* get/pin the parent page <sp> */
- XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc);
- if (rc) {
+ sp = xt_getpage(ip, parent->bn, &smp);
+ if (IS_ERR(sp)) {
XT_PUTPAGE(rcmp);
- return rc;
+ return PTR_ERR(sp);
}
/*
@@ -1062,10 +1077,10 @@ xtSplitPage(tid_t tid, struct inode *ip,
* update previous pointer of old next/right page of <sp>
*/
if (nextbn != 0) {
- XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc);
- if (rc) {
+ p = xt_getpage(ip, nextbn, &mp);
+ if (IS_ERR(p)) {
XT_PUTPAGE(rmp);
- goto clean_up;
+ return PTR_ERR(p);
}
BT_MARK_DIRTY(mp, ip);
@@ -1417,9 +1432,9 @@ int xtExtend(tid_t tid, /* transaction id */
return rc;
/* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/*
* if leaf root has been split, original root has been
* copied to new child page, i.e., original entry now
@@ -1433,9 +1448,9 @@ int xtExtend(tid_t tid, /* transaction id */
XT_PUTPAGE(mp);
/* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1711,9 +1726,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
return rc;
/* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/*
* if leaf root has been split, original root has been
* copied to new child page, i.e., original entry now
@@ -1727,9 +1742,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
XT_PUTPAGE(mp);
/* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1788,9 +1803,9 @@ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad)
XT_PUTPAGE(mp);
/* get new right page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1864,9 +1879,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
return rc;
/* get back old page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/*
* if leaf root has been split, original root has been
@@ -1881,9 +1896,9 @@ printf("xtUpdate.updateLeft.split p:0x%p\n", p);
XT_PUTPAGE(mp);
/* get new child page */
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
BT_MARK_DIRTY(mp, ip);
if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -2187,7 +2202,6 @@ void xtInitRoot(tid_t tid, struct inode *ip)
*/
s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
{
- int rc = 0;
s64 teof;
struct metapage *mp;
xtpage_t *p;
@@ -2268,9 +2282,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
* first access of each page:
*/
getPage:
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/* process entries backward from last index */
index = le16_to_cpu(p->header.nextindex) - 1;
@@ -2506,9 +2520,9 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
/* get back the parent page */
bn = parent->bn;
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
index = parent->index;
@@ -2791,9 +2805,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
* first access of each page:
*/
getPage:
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
/* process entries backward from last index */
index = le16_to_cpu(p->header.nextindex) - 1;
@@ -2836,9 +2850,9 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
/* get back the parent page */
bn = parent->bn;
- XT_GETPAGE(ip, bn, mp, PSIZE, p, rc);
- if (rc)
- return rc;
+ p = xt_getpage(ip, bn, &mp);
+ if (IS_ERR(p))
+ return PTR_ERR(p);
index = parent->index;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index fc8ede43afde..65a218eba8fa 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -187,13 +187,13 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip,
* dentry - dentry of child directory
* mode - create mode (rwxrwxrwx).
*
- * RETURN: Errors from subroutines
+ * RETURN: ERR_PTR() of errors from subroutines.
*
* note:
* EACCES: user needs search+write permission on the parent directory
*/
-static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
- struct dentry *dentry, umode_t mode)
+static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
+ struct dentry *dentry, umode_t mode)
{
int rc = 0;
tid_t tid; /* transaction id */
@@ -308,7 +308,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip,
out1:
jfs_info("jfs_mkdir: rc:%d", rc);
- return rc;
+ return ERR_PTR(rc);
}
/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 223d9ac59839..3cfb86c5a36e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -389,8 +389,8 @@ static int jfs_reconfigure(struct fs_context *fc)
if (!ctx->newLVSize) {
ctx->newLVSize = sb_bdev_nr_blocks(sb);
- if (ctx->newLVSize == 0)
- pr_err("JFS: Cannot determine volume size\n");
+ if (ctx->newLVSize == 0)
+ pr_err("JFS: Cannot determine volume size\n");
}
rc = jfs_extendfs(sb, ctx->newLVSize, 0);
@@ -542,7 +542,7 @@ static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_magic = JFS_SUPER_MAGIC;
if (sbi->mntflag & JFS_OS2)
- sb->s_d_op = &jfs_ci_dentry_operations;
+ set_default_d_op(sb, &jfs_ci_dentry_operations);
inode = jfs_iget(sb, ROOT_I);
if (IS_ERR(inode)) {
@@ -766,7 +766,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
}
lock_buffer(bh);
memcpy(bh->b_data+offset, data, tocopy);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
unlock_buffer(bh);
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 24afbae87225..11d7f74d207b 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
size_check:
if (EALIST_SIZE(ea_buf->xattr) != ea_size) {
- int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
-
- printk(KERN_ERR "ea_get: invalid extended attribute\n");
- print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
- ea_buf->xattr, size, 1);
+ if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) {
+ printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n",
+ EALIST_SIZE(ea_buf->xattr));
+ } else {
+ int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr));
+
+ printk(KERN_ERR "ea_get: invalid extended attribute\n");
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1,
+ ea_buf->xattr, size, 1);
+ }
ea_release(inode, ea_buf);
rc = -EIO;
goto clean_up;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 5f0f8b95f44c..a670ba3e565e 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,6 @@
#include "kernfs-internal.h"
-static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */
/*
* Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
* call pr_cont() while holding rename_lock. Because sometimes pr_cont()
@@ -27,7 +26,6 @@ static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */
*/
static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
-static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -51,22 +49,14 @@ static bool kernfs_lockdep(struct kernfs_node *kn)
#endif
}
-static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
-{
- if (!kn)
- return strscpy(buf, "(null)", buflen);
-
- return strscpy(buf, kn->parent ? kn->name : "/", buflen);
-}
-
/* kernfs_node_depth - compute depth from @from to @to */
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
{
size_t depth = 0;
- while (to->parent && to != from) {
+ while (rcu_dereference(to->__parent) && to != from) {
depth++;
- to = to->parent;
+ to = rcu_dereference(to->__parent);
}
return depth;
}
@@ -84,18 +74,18 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
db = kernfs_depth(rb->kn, b);
while (da > db) {
- a = a->parent;
+ a = rcu_dereference(a->__parent);
da--;
}
while (db > da) {
- b = b->parent;
+ b = rcu_dereference(b->__parent);
db--;
}
/* worst case b and a will be the same at root */
while (b != a) {
- b = b->parent;
- a = a->parent;
+ b = rcu_dereference(b->__parent);
+ a = rcu_dereference(a->__parent);
}
return a;
@@ -168,10 +158,13 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
+ const char *name;
+
for (kn = kn_to, j = 0; j < i; j++)
- kn = kn->parent;
+ kn = rcu_dereference(kn->__parent);
- len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
+ name = rcu_dereference(kn->name);
+ len += scnprintf(buf + len, buflen - len, "/%s", name);
}
return len;
@@ -195,13 +188,18 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
*/
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
{
- unsigned long flags;
- int ret;
+ struct kernfs_node *kn_parent;
- read_lock_irqsave(&kernfs_rename_lock, flags);
- ret = kernfs_name_locked(kn, buf, buflen);
- read_unlock_irqrestore(&kernfs_rename_lock, flags);
- return ret;
+ if (!kn)
+ return strscpy(buf, "(null)", buflen);
+
+ guard(rcu)();
+ /*
+ * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and
+ * the parent is either existing or not.
+ */
+ kn_parent = rcu_dereference(kn->__parent);
+ return strscpy(buf, kn_parent ? rcu_dereference(kn->name) : "/", buflen);
}
/**
@@ -223,13 +221,17 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
char *buf, size_t buflen)
{
- unsigned long flags;
- int ret;
+ struct kernfs_root *root;
- read_lock_irqsave(&kernfs_rename_lock, flags);
- ret = kernfs_path_from_node_locked(to, from, buf, buflen);
- read_unlock_irqrestore(&kernfs_rename_lock, flags);
- return ret;
+ guard(rcu)();
+ if (to) {
+ root = kernfs_root(to);
+ if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) {
+ guard(read_lock_irqsave)(&root->kernfs_rename_lock);
+ return kernfs_path_from_node_locked(to, from, buf, buflen);
+ }
+ }
+ return kernfs_path_from_node_locked(to, from, buf, buflen);
}
EXPORT_SYMBOL_GPL(kernfs_path_from_node);
@@ -292,12 +294,14 @@ out:
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
struct kernfs_node *parent;
+ struct kernfs_root *root;
unsigned long flags;
- read_lock_irqsave(&kernfs_rename_lock, flags);
- parent = kn->parent;
+ root = kernfs_root(kn);
+ read_lock_irqsave(&root->kernfs_rename_lock, flags);
+ parent = kernfs_parent(kn);
kernfs_get(parent);
- read_unlock_irqrestore(&kernfs_rename_lock, flags);
+ read_unlock_irqrestore(&root->kernfs_rename_lock, flags);
return parent;
}
@@ -336,13 +340,13 @@ static int kernfs_name_compare(unsigned int hash, const char *name,
return -1;
if (ns > kn->ns)
return 1;
- return strcmp(name, kn->name);
+ return strcmp(name, kernfs_rcu_name(kn));
}
static int kernfs_sd_compare(const struct kernfs_node *left,
const struct kernfs_node *right)
{
- return kernfs_name_compare(left->hash, left->name, left->ns, right);
+ return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right);
}
/**
@@ -360,8 +364,12 @@ static int kernfs_sd_compare(const struct kernfs_node *left,
*/
static int kernfs_link_sibling(struct kernfs_node *kn)
{
- struct rb_node **node = &kn->parent->dir.children.rb_node;
struct rb_node *parent = NULL;
+ struct kernfs_node *kn_parent;
+ struct rb_node **node;
+
+ kn_parent = kernfs_parent(kn);
+ node = &kn_parent->dir.children.rb_node;
while (*node) {
struct kernfs_node *pos;
@@ -380,13 +388,13 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
/* add new node and rebalance the tree */
rb_link_node(&kn->rb, parent, node);
- rb_insert_color(&kn->rb, &kn->parent->dir.children);
+ rb_insert_color(&kn->rb, &kn_parent->dir.children);
/* successfully added, account subdir number */
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs++;
- kernfs_inc_rev(kn->parent);
+ kn_parent->dir.subdirs++;
+ kernfs_inc_rev(kn_parent);
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
return 0;
@@ -407,16 +415,19 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
*/
static bool kernfs_unlink_sibling(struct kernfs_node *kn)
{
+ struct kernfs_node *kn_parent;
+
if (RB_EMPTY_NODE(&kn->rb))
return false;
+ kn_parent = kernfs_parent(kn);
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
if (kernfs_type(kn) == KERNFS_DIR)
- kn->parent->dir.subdirs--;
- kernfs_inc_rev(kn->parent);
+ kn_parent->dir.subdirs--;
+ kernfs_inc_rev(kn_parent);
up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
- rb_erase(&kn->rb, &kn->parent->dir.children);
+ rb_erase(&kn->rb, &kn_parent->dir.children);
RB_CLEAR_NODE(&kn->rb);
return true;
}
@@ -533,7 +544,8 @@ static void kernfs_free_rcu(struct rcu_head *rcu)
{
struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
- kfree_const(kn->name);
+ /* If the whole node goes away, then name can't be used outside */
+ kfree_const(rcu_access_pointer(kn->name));
if (kn->iattr) {
simple_xattrs_free(&kn->iattr->xattrs, NULL);
@@ -562,18 +574,19 @@ void kernfs_put(struct kernfs_node *kn)
* Moving/renaming is always done while holding reference.
* kn->parent won't change beneath us.
*/
- parent = kn->parent;
+ parent = kernfs_parent(kn);
WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
"kernfs_put: %s/%s: released with incorrect active_ref %d\n",
- parent ? parent->name : "", kn->name, atomic_read(&kn->active));
+ parent ? rcu_dereference(parent->name) : "",
+ rcu_dereference(kn->name), atomic_read(&kn->active));
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- spin_lock(&kernfs_idr_lock);
+ spin_lock(&root->kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
- spin_unlock(&kernfs_idr_lock);
+ spin_unlock(&root->kernfs_idr_lock);
call_rcu(&kn->rcu, kernfs_free_rcu);
@@ -626,13 +639,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
goto err_out1;
idr_preload(GFP_KERNEL);
- spin_lock(&kernfs_idr_lock);
+ spin_lock(&root->kernfs_idr_lock);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
if (ret >= 0 && ret < root->last_id_lowbits)
root->id_highbits++;
id_highbits = root->id_highbits;
root->last_id_lowbits = ret;
- spin_unlock(&kernfs_idr_lock);
+ spin_unlock(&root->kernfs_idr_lock);
idr_preload_end();
if (ret < 0)
goto err_out2;
@@ -643,7 +656,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
RB_CLEAR_NODE(&kn->rb);
- kn->name = name;
+ rcu_assign_pointer(kn->name, name);
kn->mode = mode;
kn->flags = flags;
@@ -668,9 +681,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
return kn;
err_out3:
- spin_lock(&kernfs_idr_lock);
+ spin_lock(&root->kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
- spin_unlock(&kernfs_idr_lock);
+ spin_unlock(&root->kernfs_idr_lock);
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
@@ -701,7 +714,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
name, mode, uid, gid, flags);
if (kn) {
kernfs_get(parent);
- kn->parent = parent;
+ rcu_assign_pointer(kn->__parent, parent);
}
return kn;
}
@@ -769,18 +782,20 @@ err_unlock:
*/
int kernfs_add_one(struct kernfs_node *kn)
{
- struct kernfs_node *parent = kn->parent;
- struct kernfs_root *root = kernfs_root(parent);
+ struct kernfs_root *root = kernfs_root(kn);
struct kernfs_iattrs *ps_iattr;
+ struct kernfs_node *parent;
bool has_ns;
int ret;
down_write(&root->kernfs_rwsem);
+ parent = kernfs_parent(kn);
ret = -EINVAL;
has_ns = kernfs_ns_enabled(parent);
if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
- has_ns ? "required" : "invalid", parent->name, kn->name))
+ has_ns ? "required" : "invalid",
+ kernfs_rcu_name(parent), kernfs_rcu_name(kn)))
goto out_unlock;
if (kernfs_type(parent) != KERNFS_DIR)
@@ -790,7 +805,7 @@ int kernfs_add_one(struct kernfs_node *kn)
if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR))
goto out_unlock;
- kn->hash = kernfs_name_hash(kn->name, kn->ns);
+ kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns);
ret = kernfs_link_sibling(kn);
if (ret)
@@ -846,7 +861,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
if (has_ns != (bool)ns) {
WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
- has_ns ? "required" : "invalid", parent->name, name);
+ has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name);
return NULL;
}
@@ -949,6 +964,11 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
return kn;
}
+unsigned int kernfs_root_flags(struct kernfs_node *kn)
+{
+ return kernfs_root(kn)->flags;
+}
+
/**
* kernfs_create_root - create a new kernfs hierarchy
* @scops: optional syscall operations for the hierarchy
@@ -969,10 +989,12 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
return ERR_PTR(-ENOMEM);
idr_init(&root->ino_idr);
+ spin_lock_init(&root->kernfs_idr_lock);
init_rwsem(&root->kernfs_rwsem);
init_rwsem(&root->kernfs_iattr_rwsem);
init_rwsem(&root->kernfs_supers_rwsem);
INIT_LIST_HEAD(&root->supers);
+ rwlock_init(&root->kernfs_rename_lock);
/*
* On 64bit ino setups, id is ino. On 32bit, low 32bits are ino.
@@ -1112,7 +1134,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent,
static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name,
struct dentry *dentry, unsigned int flags)
{
- struct kernfs_node *kn;
+ struct kernfs_node *kn, *parent;
struct kernfs_root *root;
if (flags & LOOKUP_RCU)
@@ -1120,8 +1142,6 @@ static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name,
/* Negative hashed dentry? */
if (d_really_is_negative(dentry)) {
- struct kernfs_node *parent;
-
/* If the kernfs parent node has changed discard and
* proceed to ->lookup.
*
@@ -1163,16 +1183,17 @@ static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name,
if (!kernfs_active(kn))
goto out_bad;
+ parent = kernfs_parent(kn);
/* The kernfs node has been moved? */
- if (kernfs_dentry_node(dentry->d_parent) != kn->parent)
+ if (kernfs_dentry_node(dentry->d_parent) != parent)
goto out_bad;
/* The kernfs node has been renamed */
- if (strcmp(dentry->d_name.name, kn->name) != 0)
+ if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0)
goto out_bad;
/* The kernfs node has been moved to a different namespace */
- if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+ if (parent && kernfs_ns_enabled(parent) &&
kernfs_info(dentry->d_sb)->ns != kn->ns)
goto out_bad;
@@ -1230,24 +1251,24 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir,
return d_splice_alias(inode, dentry);
}
-static int kernfs_iop_mkdir(struct mnt_idmap *idmap,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode)
{
struct kernfs_node *parent = dir->i_private;
struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
int ret;
if (!scops || !scops->mkdir)
- return -EPERM;
+ return ERR_PTR(-EPERM);
if (!kernfs_get_active(parent))
- return -ENODEV;
+ return ERR_PTR(-ENODEV);
ret = scops->mkdir(parent, dentry->d_name.name, mode);
kernfs_put_active(parent);
- return ret;
+ return ERR_PTR(ret);
}
static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
@@ -1365,7 +1386,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
return kernfs_leftmost_descendant(rb_to_kn(rbn));
/* no sibling left, visit parent */
- return pos->parent;
+ return kernfs_parent(pos);
}
static void kernfs_activate_one(struct kernfs_node *kn)
@@ -1377,7 +1398,7 @@ static void kernfs_activate_one(struct kernfs_node *kn)
if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING)))
return;
- WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb));
+ WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb));
WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
atomic_sub(KN_DEACTIVATED_BIAS, &kn->active);
@@ -1447,7 +1468,7 @@ void kernfs_show(struct kernfs_node *kn, bool show)
static void __kernfs_remove(struct kernfs_node *kn)
{
- struct kernfs_node *pos;
+ struct kernfs_node *pos, *parent;
/* Short-circuit if non-root @kn has already finished removal. */
if (!kn)
@@ -1459,10 +1480,10 @@ static void __kernfs_remove(struct kernfs_node *kn)
* This is for kernfs_remove_self() which plays with active ref
* after removal.
*/
- if (kn->parent && RB_EMPTY_NODE(&kn->rb))
+ if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb))
return;
- pr_debug("kernfs %s: removing\n", kn->name);
+ pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn));
/* prevent new usage by marking all nodes removing and deactivating */
pos = NULL;
@@ -1485,14 +1506,14 @@ static void __kernfs_remove(struct kernfs_node *kn)
kernfs_get(pos);
kernfs_drain(pos);
-
+ parent = kernfs_parent(pos);
/*
* kernfs_unlink_sibling() succeeds once per node. Use it
* to decide who's responsible for cleanups.
*/
- if (!pos->parent || kernfs_unlink_sibling(pos)) {
+ if (!parent || kernfs_unlink_sibling(pos)) {
struct kernfs_iattrs *ps_iattr =
- pos->parent ? pos->parent->iattr : NULL;
+ parent ? parent->iattr : NULL;
/* update timestamps on the parent */
down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
@@ -1561,8 +1582,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn)
* invoked before finishing the kernfs operation. Note that while this
* function restores the active reference, it doesn't and can't actually
* restore the active protection - @kn may already or be in the process of
- * being removed. Once kernfs_break_active_protection() is invoked, that
- * protection is irreversibly gone for the kernfs operation instance.
+ * being drained and removed. Once kernfs_break_active_protection() is
+ * invoked, that protection is irreversibly gone for the kernfs operation
+ * instance.
*
* While this function may be called at any point after
* kernfs_break_active_protection() is invoked, its most useful location
@@ -1718,11 +1740,11 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
{
struct kernfs_node *old_parent;
struct kernfs_root *root;
- const char *old_name = NULL;
+ const char *old_name;
int error;
/* can't move or rename root */
- if (!kn->parent)
+ if (!rcu_access_pointer(kn->__parent))
return -EINVAL;
root = kernfs_root(kn);
@@ -1733,9 +1755,19 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
(new_parent->flags & KERNFS_EMPTY_DIR))
goto out;
+ old_parent = kernfs_parent(kn);
+ if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) {
+ error = -EINVAL;
+ if (WARN_ON_ONCE(old_parent != new_parent))
+ goto out;
+ }
+
error = 0;
- if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
- (strcmp(kn->name, new_name) == 0))
+ old_name = kernfs_rcu_name(kn);
+ if (!new_name)
+ new_name = old_name;
+ if ((old_parent == new_parent) && (kn->ns == new_ns) &&
+ (strcmp(old_name, new_name) == 0))
goto out; /* nothing to rename */
error = -EEXIST;
@@ -1743,7 +1775,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
goto out;
/* rename kernfs_node */
- if (strcmp(kn->name, new_name) != 0) {
+ if (strcmp(old_name, new_name) != 0) {
error = -ENOMEM;
new_name = kstrdup_const(new_name, GFP_KERNEL);
if (!new_name)
@@ -1756,27 +1788,32 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
* Move to the appropriate place in the appropriate directories rbtree.
*/
kernfs_unlink_sibling(kn);
- kernfs_get(new_parent);
- /* rename_lock protects ->parent and ->name accessors */
- write_lock_irq(&kernfs_rename_lock);
+ /* rename_lock protects ->parent accessors */
+ if (old_parent != new_parent) {
+ kernfs_get(new_parent);
+ write_lock_irq(&root->kernfs_rename_lock);
- old_parent = kn->parent;
- kn->parent = new_parent;
+ rcu_assign_pointer(kn->__parent, new_parent);
- kn->ns = new_ns;
- if (new_name) {
- old_name = kn->name;
- kn->name = new_name;
- }
+ kn->ns = new_ns;
+ if (new_name)
+ rcu_assign_pointer(kn->name, new_name);
- write_unlock_irq(&kernfs_rename_lock);
+ write_unlock_irq(&root->kernfs_rename_lock);
+ kernfs_put(old_parent);
+ } else {
+ /* name assignment is RCU protected, parent is the same */
+ kn->ns = new_ns;
+ if (new_name)
+ rcu_assign_pointer(kn->name, new_name);
+ }
- kn->hash = kernfs_name_hash(kn->name, kn->ns);
+ kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns);
kernfs_link_sibling(kn);
- kernfs_put(old_parent);
- kfree_const(old_name);
+ if (new_name && !is_kernel_rodata((unsigned long)old_name))
+ kfree_rcu_mightsleep(old_name);
error = 0;
out:
@@ -1795,7 +1832,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns,
{
if (pos) {
int valid = kernfs_active(pos) &&
- pos->parent == parent && hash == pos->hash;
+ rcu_access_pointer(pos->__parent) == parent &&
+ hash == pos->hash;
kernfs_put(pos);
if (!valid)
pos = NULL;
@@ -1860,7 +1898,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
pos;
pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
- const char *name = pos->name;
+ const char *name = kernfs_rcu_name(pos);
unsigned int type = fs_umode_to_dtype(pos->mode);
int len = strlen(name);
ino_t ino = kernfs_ino(pos);
@@ -1869,10 +1907,10 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
file->private_data = pos;
kernfs_get(pos);
- up_read(&root->kernfs_rwsem);
- if (!dir_emit(ctx, name, len, ino, type))
+ if (!dir_emit(ctx, name, len, ino, type)) {
+ up_read(&root->kernfs_rwsem);
return 0;
- down_read(&root->kernfs_rwsem);
+ }
}
up_read(&root->kernfs_rwsem);
file->private_data = NULL;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 8502ef68459b..a6c692cac616 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -778,8 +778,9 @@ bool kernfs_should_drain_open_files(struct kernfs_node *kn)
/*
* @kn being deactivated guarantees that @kn->attr.open can't change
* beneath us making the lockless test below safe.
+ * Callers post kernfs_unbreak_active_protection may be counted in
+ * kn->active by now, do not WARN_ON because of them.
*/
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
rcu_read_lock();
on = rcu_dereference(kn->attr.open);
@@ -911,9 +912,11 @@ repeat:
/* kick fsnotify */
down_read(&root->kernfs_supers_rwsem);
+ down_read(&root->kernfs_rwsem);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
struct kernfs_node *parent;
struct inode *p_inode = NULL;
+ const char *kn_name;
struct inode *inode;
struct qstr name;
@@ -927,7 +930,8 @@ repeat:
if (!inode)
continue;
- name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
+ kn_name = kernfs_rcu_name(kn);
+ name = QSTR(kn_name);
parent = kernfs_get_parent(kn);
if (parent) {
p_inode = ilookup(info->sb, kernfs_ino(parent));
@@ -947,6 +951,7 @@ repeat:
iput(inode);
}
+ up_read(&root->kernfs_rwsem);
up_read(&root->kernfs_supers_rwsem);
kernfs_put(kn);
goto repeat;
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b83054da68b3..457f91c412d4 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,45 +24,46 @@ static const struct inode_operations kernfs_iops = {
.listxattr = kernfs_iop_listxattr,
};
-static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
+static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc)
{
- static DEFINE_MUTEX(iattr_mutex);
- struct kernfs_iattrs *ret;
+ struct kernfs_iattrs *ret __free(kfree) = NULL;
+ struct kernfs_iattrs *attr;
- mutex_lock(&iattr_mutex);
+ attr = READ_ONCE(kn->iattr);
+ if (attr || !alloc)
+ return attr;
- if (kn->iattr || !alloc)
- goto out_unlock;
-
- kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
- if (!kn->iattr)
- goto out_unlock;
+ ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
+ if (!ret)
+ return NULL;
/* assign default attributes */
- kn->iattr->ia_uid = GLOBAL_ROOT_UID;
- kn->iattr->ia_gid = GLOBAL_ROOT_GID;
-
- ktime_get_real_ts64(&kn->iattr->ia_atime);
- kn->iattr->ia_mtime = kn->iattr->ia_atime;
- kn->iattr->ia_ctime = kn->iattr->ia_atime;
-
- simple_xattrs_init(&kn->iattr->xattrs);
- atomic_set(&kn->iattr->nr_user_xattrs, 0);
- atomic_set(&kn->iattr->user_xattr_size, 0);
-out_unlock:
- ret = kn->iattr;
- mutex_unlock(&iattr_mutex);
- return ret;
+ ret->ia_uid = GLOBAL_ROOT_UID;
+ ret->ia_gid = GLOBAL_ROOT_GID;
+
+ ktime_get_real_ts64(&ret->ia_atime);
+ ret->ia_mtime = ret->ia_atime;
+ ret->ia_ctime = ret->ia_atime;
+
+ simple_xattrs_init(&ret->xattrs);
+ atomic_set(&ret->nr_user_xattrs, 0);
+ atomic_set(&ret->user_xattr_size, 0);
+
+ /* If someone raced us, recognize it. */
+ if (!try_cmpxchg(&kn->iattr, &attr, ret))
+ return READ_ONCE(kn->iattr);
+
+ return no_free_ptr(ret);
}
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
- return __kernfs_iattrs(kn, 1);
+ return __kernfs_iattrs(kn, true);
}
static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
{
- return __kernfs_iattrs(kn, 0);
+ return __kernfs_iattrs(kn, false);
}
int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
@@ -166,9 +167,10 @@ static inline void set_inode_attr(struct inode *inode,
static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
{
- struct kernfs_iattrs *attrs = kn->iattr;
+ struct kernfs_iattrs *attrs;
inode->i_mode = kn->mode;
+ attrs = kernfs_iattrs_noalloc(kn);
if (attrs)
/*
* kernfs_node has non-default attributes get them from
@@ -306,7 +308,9 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
const void *value, size_t size, int flags)
{
struct simple_xattr *old_xattr;
- struct kernfs_iattrs *attrs = kernfs_iattrs(kn);
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
@@ -345,8 +349,9 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
- atomic_t *sz = &kn->iattr->user_xattr_size;
- atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
+ atomic_t *sz = &attr->user_xattr_size;
+ atomic_t *nr = &attr->nr_user_xattrs;
struct simple_xattr *old_xattr;
int ret;
@@ -384,8 +389,9 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
- atomic_t *sz = &kn->iattr->user_xattr_size;
- atomic_t *nr = &kn->iattr->nr_user_xattrs;
+ struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn);
+ atomic_t *sz = &attr->user_xattr_size;
+ atomic_t *nr = &attr->nr_user_xattrs;
struct simple_xattr *old_xattr;
old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index b42ee6547cdc..6061b6f70d2a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -38,6 +38,7 @@ struct kernfs_root {
/* private fields, do not use outside kernfs proper */
struct idr ino_idr;
+ spinlock_t kernfs_idr_lock; /* root->ino_idr */
u32 last_id_lowbits;
u32 id_highbits;
struct kernfs_syscall_ops *syscall_ops;
@@ -50,6 +51,9 @@ struct kernfs_root {
struct rw_semaphore kernfs_iattr_rwsem;
struct rw_semaphore kernfs_supers_rwsem;
+ /* kn->parent and kn->name */
+ rwlock_t kernfs_rename_lock;
+
struct rcu_head rcu;
};
@@ -64,11 +68,14 @@ struct kernfs_root {
*
* Return: the kernfs_root @kn belongs to.
*/
-static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
+static inline struct kernfs_root *kernfs_root(const struct kernfs_node *kn)
{
+ const struct kernfs_node *knp;
/* if parent exists, it's always a dir; otherwise, @sd is a dir */
- if (kn->parent)
- kn = kn->parent;
+ guard(rcu)();
+ knp = rcu_dereference(kn->__parent);
+ if (knp)
+ kn = knp;
return kn->dir.root;
}
@@ -97,6 +104,38 @@ struct kernfs_super_info {
};
#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
+static inline bool kernfs_root_is_locked(const struct kernfs_node *kn)
+{
+ return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem);
+}
+
+static inline bool kernfs_rename_is_locked(const struct kernfs_node *kn)
+{
+ return lockdep_is_held(&kernfs_root(kn)->kernfs_rename_lock);
+}
+
+static inline const char *kernfs_rcu_name(const struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn));
+}
+
+static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
+{
+ /*
+ * The kernfs_node::__parent remains valid within a RCU section. The kn
+ * can be reparented (and renamed) which changes the entry. This can be
+ * avoided by locking kernfs_root::kernfs_rwsem or
+ * kernfs_root::kernfs_rename_lock.
+ * Both locks can be used to obtain a reference on __parent. Once the
+ * reference count reaches 0 then the node is about to be freed
+ * and can not be renamed (or become a different parent) anymore.
+ */
+ return rcu_dereference_check(kn->__parent,
+ kernfs_root_is_locked(kn) ||
+ kernfs_rename_is_locked(kn) ||
+ !atomic_read(&kn->count));
+}
+
static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry)
{
if (d_really_is_negative(dentry))
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 1358c21837f1..e384a69fbece 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -62,6 +62,21 @@ const struct super_operations kernfs_sops = {
.show_options = kernfs_sop_show_options,
.show_path = kernfs_sop_show_path,
+
+ /*
+ * sysfs is built on top of kernfs and sysfs provides the power
+ * management infrastructure to support suspend/hibernate by
+ * writing to various files in /sys/power/. As filesystems may
+ * be automatically frozen during suspend/hibernate implementing
+ * freeze/thaw support for kernfs generically will cause
+ * deadlocks as the suspending/hibernation initiating task will
+ * hold a VFS lock that it will then wait upon to be released.
+ * If freeze/thaw for kernfs is needed talk to the VFS.
+ */
+ .freeze_fs = NULL,
+ .unfreeze_fs = NULL,
+ .freeze_super = NULL,
+ .thaw_super = NULL,
};
static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
@@ -145,8 +160,10 @@ static struct dentry *kernfs_fh_to_parent(struct super_block *sb,
static struct dentry *kernfs_get_parent_dentry(struct dentry *child)
{
struct kernfs_node *kn = kernfs_dentry_node(child);
+ struct kernfs_root *root = kernfs_root(kn);
- return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent));
+ guard(rwsem_read)(&root->kernfs_rwsem);
+ return d_obtain_alias(kernfs_get_inode(child->d_sb, kernfs_parent(kn)));
}
static const struct export_operations kernfs_export_ops = {
@@ -186,10 +203,10 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
return NULL;
}
- while (child->parent != parent) {
- if (!child->parent)
+ while (kernfs_parent(child) != parent) {
+ child = kernfs_parent(child);
+ if (!child)
return NULL;
- child = child->parent;
}
return child;
@@ -207,16 +224,27 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
{
struct dentry *dentry;
struct kernfs_node *knparent;
+ struct kernfs_root *root;
BUG_ON(sb->s_op != &kernfs_sops);
dentry = dget(sb->s_root);
/* Check if this is the root kernfs_node */
- if (!kn->parent)
+ if (!rcu_access_pointer(kn->__parent))
return dentry;
- knparent = find_next_ancestor(kn, NULL);
+ root = kernfs_root(kn);
+ /*
+ * As long as kn is valid, its parent can not vanish. This is cgroup's
+ * kn so it can't have its parent replaced. Therefore it is safe to use
+ * the ancestor node outside of the RCU or locked section.
+ */
+ if (WARN_ON_ONCE(!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)))
+ return ERR_PTR(-EINVAL);
+ scoped_guard(rcu) {
+ knparent = find_next_ancestor(kn, NULL);
+ }
if (WARN_ON(!knparent)) {
dput(dentry);
return ERR_PTR(-EINVAL);
@@ -225,17 +253,26 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
do {
struct dentry *dtmp;
struct kernfs_node *kntmp;
+ const char *name;
if (kn == knparent)
return dentry;
- kntmp = find_next_ancestor(kn, knparent);
- if (WARN_ON(!kntmp)) {
+
+ scoped_guard(rwsem_read, &root->kernfs_rwsem) {
+ kntmp = find_next_ancestor(kn, knparent);
+ if (WARN_ON(!kntmp)) {
+ dput(dentry);
+ return ERR_PTR(-EINVAL);
+ }
+ name = kstrdup(kernfs_rcu_name(kntmp), GFP_KERNEL);
+ }
+ if (!name) {
dput(dentry);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(-ENOMEM);
}
- dtmp = lookup_positive_unlocked(kntmp->name, dentry,
- strlen(kntmp->name));
+ dtmp = lookup_noperm_positive_unlocked(&QSTR(name), dentry);
dput(dentry);
+ kfree(name);
if (IS_ERR(dtmp))
return dtmp;
knparent = kntmp;
@@ -281,7 +318,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k
return -ENOMEM;
}
sb->s_root = root;
- sb->s_d_op = &kernfs_dops;
+ set_default_d_op(sb, &kernfs_dops);
return 0;
}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index 45371a70caa7..0bd8a2143723 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -62,10 +62,10 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
/* go up to the root, stop at the base */
base = parent;
- while (base->parent) {
- kn = target->parent;
- while (kn->parent && base != kn)
- kn = kn->parent;
+ while (kernfs_parent(base)) {
+ kn = kernfs_parent(target);
+ while (kernfs_parent(kn) && base != kn)
+ kn = kernfs_parent(kn);
if (base == kn)
break;
@@ -75,14 +75,14 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
strcpy(s, "../");
s += 3;
- base = base->parent;
+ base = kernfs_parent(base);
}
/* determine end of target string for reverse fillup */
kn = target;
- while (kn->parent && kn != base) {
- len += strlen(kn->name) + 1;
- kn = kn->parent;
+ while (kernfs_parent(kn) && kn != base) {
+ len += strlen(kernfs_rcu_name(kn)) + 1;
+ kn = kernfs_parent(kn);
}
/* check limits */
@@ -94,15 +94,16 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
/* reverse fillup of target string from target to base */
kn = target;
- while (kn->parent && kn != base) {
- int slen = strlen(kn->name);
+ while (kernfs_parent(kn) && kn != base) {
+ const char *name = kernfs_rcu_name(kn);
+ int slen = strlen(name);
len -= slen;
- memcpy(s + len, kn->name, slen);
+ memcpy(s + len, name, slen);
if (len)
s[--len] = '/';
- kn = kn->parent;
+ kn = kernfs_parent(kn);
}
return 0;
@@ -111,12 +112,13 @@ static int kernfs_get_target_path(struct kernfs_node *parent,
static int kernfs_getlink(struct inode *inode, char *path)
{
struct kernfs_node *kn = inode->i_private;
- struct kernfs_node *parent = kn->parent;
+ struct kernfs_node *parent;
struct kernfs_node *target = kn->symlink.target_kn;
- struct kernfs_root *root = kernfs_root(parent);
+ struct kernfs_root *root = kernfs_root(kn);
int error;
down_read(&root->kernfs_rwsem);
+ parent = kernfs_parent(kn);
error = kernfs_get_target_path(parent, target, path);
up_read(&root->kernfs_rwsem);
diff --git a/fs/libfs.c b/fs/libfs.c
index 8444f5cc4064..ce8c496a6940 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -62,11 +62,6 @@ int always_delete_dentry(const struct dentry *dentry)
}
EXPORT_SYMBOL(always_delete_dentry);
-const struct dentry_operations simple_dentry_operations = {
- .d_delete = always_delete_dentry,
-};
-EXPORT_SYMBOL(simple_dentry_operations);
-
/*
* Lookup the data. This is trivial - if the dentry didn't already
* exist, we know it is negative. Set d_op to delete negative dentries.
@@ -75,9 +70,11 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
{
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- if (!dentry->d_sb->s_d_op)
- d_set_d_op(dentry, &simple_dentry_operations);
-
+ if (!dentry->d_op && !(dentry->d_flags & DCACHE_DONTCACHE)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags |= DCACHE_DONTCACHE;
+ spin_unlock(&dentry->d_lock);
+ }
if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
return NULL;
@@ -496,7 +493,7 @@ offset_dir_lookup(struct dentry *parent, loff_t offset)
found = find_positive_dentry(parent, NULL, false);
else {
rcu_read_lock();
- child = mas_find(&mas, DIR_OFFSET_MAX);
+ child = mas_find_rev(&mas, DIR_OFFSET_MIN);
found = find_positive_dentry(parent, child, false);
rcu_read_unlock();
}
@@ -583,7 +580,7 @@ const struct file_operations simple_offset_dir_operations = {
.fsync = noop_fsync,
};
-static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
+struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
struct dentry *child = NULL, *d;
@@ -603,16 +600,18 @@ static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev
dput(prev);
return child;
}
+EXPORT_SYMBOL(find_next_child);
-void simple_recursive_removal(struct dentry *dentry,
- void (*callback)(struct dentry *))
+static void __simple_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *),
+ bool locked)
{
struct dentry *this = dget(dentry);
while (true) {
struct dentry *victim = NULL, *child;
struct inode *inode = this->d_inode;
- inode_lock(inode);
+ inode_lock_nested(inode, I_MUTEX_CHILD);
if (d_is_dir(this))
inode->i_flags |= S_DEAD;
while ((child = find_next_child(this, victim)) == NULL) {
@@ -624,15 +623,13 @@ void simple_recursive_removal(struct dentry *dentry,
victim = this;
this = this->d_parent;
inode = this->d_inode;
- inode_lock(inode);
+ if (!locked || victim != dentry)
+ inode_lock_nested(inode, I_MUTEX_CHILD);
if (simple_positive(victim)) {
d_invalidate(victim); // avoid lost mounts
- if (d_is_dir(victim))
- fsnotify_rmdir(inode, victim);
- else
- fsnotify_unlink(inode, victim);
if (callback)
callback(victim);
+ fsnotify_delete(inode, d_inode(victim), victim);
dput(victim); // unpin it
}
if (victim == dentry) {
@@ -640,7 +637,8 @@ void simple_recursive_removal(struct dentry *dentry,
inode_set_ctime_current(inode));
if (d_is_dir(dentry))
drop_nlink(inode);
- inode_unlock(inode);
+ if (!locked)
+ inode_unlock(inode);
dput(dentry);
return;
}
@@ -649,8 +647,22 @@ void simple_recursive_removal(struct dentry *dentry,
this = child;
}
}
+
+void simple_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *))
+{
+ return __simple_recursive_removal(dentry, callback, false);
+}
EXPORT_SYMBOL(simple_recursive_removal);
+/* caller holds parent directory with I_MUTEX_PARENT */
+void locked_recursive_removal(struct dentry *dentry,
+ void (*callback)(struct dentry *))
+{
+ return __simple_recursive_removal(dentry, callback, true);
+}
+EXPORT_SYMBOL(locked_recursive_removal);
+
static const struct super_operations simple_super_operations = {
.statfs = simple_statfs,
};
@@ -683,7 +695,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
s->s_root = d_make_root(root);
if (!s->s_root)
return -ENOMEM;
- s->s_d_op = ctx->dops;
+ set_default_d_op(s, ctx->dops);
return 0;
}
@@ -909,7 +921,7 @@ static int simple_read_folio(struct file *file, struct folio *folio)
return 0;
}
-int simple_write_begin(struct file *file, struct address_space *mapping,
+int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -934,7 +946,7 @@ EXPORT_SYMBOL(simple_write_begin);
/**
* simple_write_end - .write_end helper for non-block-device FSes
- * @file: See .write_end of address_space_operations
+ * @iocb: kernel I/O control block
* @mapping: "
* @pos: "
* @len: "
@@ -945,7 +957,8 @@ EXPORT_SYMBOL(simple_write_begin);
* simple_write_end does the minimum needed for updating a folio after
* writing is done. It has the same API signature as the .write_end of
* address_space_operations vector. So it can just be set onto .write_end for
- * FSes that don't need any other processing. i_mutex is assumed to be held.
+ * FSes that don't need any other processing. i_rwsem is assumed to be held
+ * exclusively.
* Block based filesystems should use generic_write_end().
* NOTE: Even though i_size might get updated by this function, mark_inode_dirty
* is not called, so a filesystem that actually does store data in .write_inode
@@ -954,9 +967,10 @@ EXPORT_SYMBOL(simple_write_begin);
*
* Use *ONLY* with simple_read_folio()
*/
-static int simple_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int simple_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
@@ -972,7 +986,7 @@ static int simple_write_end(struct file *file, struct address_space *mapping,
}
/*
* No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
+ * cannot change under us because we hold the i_rwsem.
*/
if (last_pos > inode->i_size)
i_size_write(inode, last_pos);
@@ -1582,13 +1596,17 @@ EXPORT_SYMBOL(generic_file_fsync);
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
u64 last_fs_block = num_blocks - 1;
- u64 last_fs_page =
- last_fs_block >> (PAGE_SHIFT - blocksize_bits);
+ u64 last_fs_page, max_bytes;
+
+ if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes))
+ return -EFBIG;
+
+ last_fs_page = (max_bytes >> PAGE_SHIFT) - 1;
if (unlikely(num_blocks == 0))
return 0;
- if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
+ if (blocksize_bits < 9)
return -EINVAL;
if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
@@ -1647,10 +1665,14 @@ struct inode *alloc_anon_inode(struct super_block *s)
* that it already _is_ on the dirty list.
*/
inode->i_state = I_DIRTY;
+ /*
+ * Historically anonymous inodes don't have a type at all and
+ * userspace has come to rely on this.
+ */
inode->i_mode = S_IRUSR | S_IWUSR;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
simple_inode_init_ts(inode);
return inode;
}
@@ -1943,22 +1965,22 @@ static const struct dentry_operations generic_encrypted_dentry_ops = {
* @sb: superblock to be configured
*
* Filesystems supporting casefolding and/or fscrypt can call this
- * helper at mount-time to configure sb->s_d_op to best set of dentry
- * operations required for the enabled features. The helper must be
- * called after these have been configured, but before the root dentry
- * is created.
+ * helper at mount-time to configure default dentry_operations to the
+ * best set of dentry operations required for the enabled features.
+ * The helper must be called after these have been configured, but
+ * before the root dentry is created.
*/
void generic_set_sb_d_ops(struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
if (sb->s_encoding) {
- sb->s_d_op = &generic_ci_dentry_ops;
+ set_default_d_op(sb, &generic_ci_dentry_ops);
return;
}
#endif
#ifdef CONFIG_FS_ENCRYPTION
if (sb->s_cop) {
- sb->s_d_op = &generic_encrypted_dentry_ops;
+ set_default_d_op(sb, &generic_encrypted_dentry_ops);
return;
}
#endif
@@ -2113,7 +2135,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
}
EXPORT_SYMBOL(simple_inode_init_ts);
-static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
+struct dentry *stashed_dentry_get(struct dentry **stashed)
{
struct dentry *dentry;
@@ -2121,6 +2143,8 @@ static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
dentry = rcu_dereference(*stashed);
if (!dentry)
return NULL;
+ if (IS_ERR(dentry))
+ return dentry;
if (!lockref_get_not_dead(&dentry->d_lockref))
return NULL;
return dentry;
@@ -2153,7 +2177,6 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
/* Notice when this is changed. */
WARN_ON_ONCE(!S_ISREG(inode->i_mode));
- WARN_ON_ONCE(!IS_IMMUTABLE(inode));
dentry = d_alloc_anon(sb);
if (!dentry) {
@@ -2169,8 +2192,7 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
return dentry;
}
-static struct dentry *stash_dentry(struct dentry **stashed,
- struct dentry *dentry)
+struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry)
{
guard(rcu)();
for (;;) {
@@ -2211,14 +2233,16 @@ static struct dentry *stash_dentry(struct dentry **stashed,
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
struct path *path)
{
- struct dentry *dentry;
+ struct dentry *dentry, *res;
const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
/* See if dentry can be reused. */
- path->dentry = get_stashed_dentry(stashed);
- if (path->dentry) {
+ res = stashed_dentry_get(stashed);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+ if (res) {
sops->put_data(data);
- goto out_path;
+ goto make_path;
}
/* Allocate a new dentry. */
@@ -2227,14 +2251,22 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
return PTR_ERR(dentry);
/* Added a new dentry. @data is now owned by the filesystem. */
- path->dentry = stash_dentry(stashed, dentry);
- if (path->dentry != dentry)
+ if (sops->stash_dentry)
+ res = sops->stash_dentry(stashed, dentry);
+ else
+ res = stash_dentry(stashed, dentry);
+ if (IS_ERR(res)) {
+ dput(dentry);
+ return PTR_ERR(res);
+ }
+ if (res != dentry)
dput(dentry);
-out_path:
- WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
- WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+make_path:
+ path->dentry = res;
path->mnt = mntget(mnt);
+ VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+ VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
return 0;
}
@@ -2256,3 +2288,28 @@ void stashed_dentry_prune(struct dentry *dentry)
*/
cmpxchg(stashed, dentry, NULL);
}
+
+/* parent must be held exclusive */
+struct dentry *simple_start_creating(struct dentry *parent, const char *name)
+{
+ struct dentry *dentry;
+ struct inode *dir = d_inode(parent);
+
+ inode_lock(dir);
+ if (unlikely(IS_DEADDIR(dir))) {
+ inode_unlock(dir);
+ return ERR_PTR(-ENOENT);
+ }
+ dentry = lookup_noperm(&QSTR(name), parent);
+ if (IS_ERR(dentry)) {
+ inode_unlock(dir);
+ return dentry;
+ }
+ if (dentry->d_inode) {
+ dput(dentry);
+ inode_unlock(dir);
+ return ERR_PTR(-EEXIST);
+ }
+ return dentry;
+}
+EXPORT_SYMBOL(simple_start_creating);
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index fe3e23dd29c3..51bbe22d21e3 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -8,6 +8,6 @@ ccflags-y += -I$(src) # needed for trace events
obj-$(CONFIG_LOCKD) += lockd.o
lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
- svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o
+ svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o
lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
lockd-$(CONFIG_PROC_FS) += procfs.o
diff --git a/fs/lockd/netlink.c b/fs/lockd/netlink.c
new file mode 100644
index 000000000000..6e00b02cad90
--- /dev/null
+++ b/fs/lockd/netlink.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/lockd.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink.h"
+
+#include <uapi/linux/lockd_netlink.h>
+
+/* LOCKD_CMD_SERVER_SET - do */
+static const struct nla_policy lockd_server_set_nl_policy[LOCKD_A_SERVER_UDP_PORT + 1] = {
+ [LOCKD_A_SERVER_GRACETIME] = { .type = NLA_U32, },
+ [LOCKD_A_SERVER_TCP_PORT] = { .type = NLA_U16, },
+ [LOCKD_A_SERVER_UDP_PORT] = { .type = NLA_U16, },
+};
+
+/* Ops table for lockd */
+static const struct genl_split_ops lockd_nl_ops[] = {
+ {
+ .cmd = LOCKD_CMD_SERVER_SET,
+ .doit = lockd_nl_server_set_doit,
+ .policy = lockd_server_set_nl_policy,
+ .maxattr = LOCKD_A_SERVER_UDP_PORT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = LOCKD_CMD_SERVER_GET,
+ .doit = lockd_nl_server_get_doit,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+struct genl_family lockd_nl_family __ro_after_init = {
+ .name = LOCKD_FAMILY_NAME,
+ .version = LOCKD_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = lockd_nl_ops,
+ .n_split_ops = ARRAY_SIZE(lockd_nl_ops),
+};
diff --git a/fs/lockd/netlink.h b/fs/lockd/netlink.h
new file mode 100644
index 000000000000..1920543a7955
--- /dev/null
+++ b/fs/lockd/netlink.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/lockd.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_LOCKD_GEN_H
+#define _LINUX_LOCKD_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/lockd_netlink.h>
+
+int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info);
+int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info);
+
+extern struct genl_family lockd_nl_family;
+
+#endif /* _LINUX_LOCKD_GEN_H */
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 17432c445fe6..88e8e2a97397 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -10,6 +10,9 @@ struct lockd_net {
unsigned int nlmsvc_users;
unsigned long next_gc;
unsigned long nrhosts;
+ u32 gracetime;
+ u16 tcp_port;
+ u16 udp_port;
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 2c8eedc6c2cc..e80262a51884 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -41,6 +41,7 @@
#include "netns.h"
#include "procfs.h"
+#include "netlink.h"
#define NLMDBG_FACILITY NLMDBG_SVC
#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -83,8 +84,14 @@ static const int nlm_port_min = 0, nlm_port_max = 65535;
static struct ctl_table_header * nlm_sysctl_table;
#endif
-static unsigned long get_lockd_grace_period(void)
+static unsigned long get_lockd_grace_period(struct net *net)
{
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+
+ /* Return the net-ns specific grace period, if there is one */
+ if (ln->gracetime)
+ return ln->gracetime * HZ;
+
/* Note: nlm_timeout should always be nonzero */
if (nlm_grace_period)
return roundup(nlm_grace_period, nlm_timeout) * HZ;
@@ -103,7 +110,7 @@ static void grace_ender(struct work_struct *grace)
static void set_grace_period(struct net *net)
{
- unsigned long grace_period = get_lockd_grace_period();
+ unsigned long grace_period = get_lockd_grace_period(net);
struct lockd_net *ln = net_generic(net, lockd_net_id);
locks_start_grace(net, &ln->lockd_manager);
@@ -166,15 +173,16 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
static int create_lockd_family(struct svc_serv *serv, struct net *net,
const int family, const struct cred *cred)
{
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
int err;
- err = create_lockd_listener(serv, "udp", net, family, nlm_udpport,
- cred);
+ err = create_lockd_listener(serv, "udp", net, family,
+ ln->udp_port ? ln->udp_port : nlm_udpport, cred);
if (err < 0)
return err;
- return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport,
- cred);
+ return create_lockd_listener(serv, "tcp", net, family,
+ ln->tcp_port ? ln->tcp_port : nlm_tcpport, cred);
}
/*
@@ -459,9 +467,10 @@ static const struct ctl_table nlm_sysctls[] = {
{
.procname = "nsm_local_state",
.data = &nsm_local_state,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(nsm_local_state),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec,
+ .extra1 = SYSCTL_ZERO,
},
};
@@ -588,6 +597,10 @@ static int __init init_nlm(void)
if (err)
goto err_pernet;
+ err = genl_register_family(&lockd_nl_family);
+ if (err)
+ goto err_netlink;
+
err = lockd_create_procfs();
if (err)
goto err_procfs;
@@ -595,6 +608,8 @@ static int __init init_nlm(void)
return 0;
err_procfs:
+ genl_unregister_family(&lockd_nl_family);
+err_netlink:
unregister_pernet_subsys(&lockd_net_ops);
err_pernet:
#ifdef CONFIG_SYSCTL
@@ -608,6 +623,7 @@ static void __exit exit_nlm(void)
{
/* FIXME: delete all NLM clients */
nlm_shutdown_hosts();
+ genl_unregister_family(&lockd_nl_family);
lockd_remove_procfs();
unregister_pernet_subsys(&lockd_net_ops);
#ifdef CONFIG_SYSCTL
@@ -710,3 +726,94 @@ static struct svc_program nlmsvc_program = {
.pg_init_request = svc_generic_init_request,
.pg_rpcbind_set = svc_generic_rpcbind_set,
};
+
+/**
+ * lockd_nl_server_set_doit - set the lockd server parameters via netlink
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * This updates the per-net values. When updating the values in the init_net
+ * namespace, also update the "legacy" global values.
+ *
+ * Return 0 on success or a negative errno.
+ */
+int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+ const struct nlattr *attr;
+
+ if (GENL_REQ_ATTR_CHECK(info, LOCKD_A_SERVER_GRACETIME))
+ return -EINVAL;
+
+ if (info->attrs[LOCKD_A_SERVER_GRACETIME] ||
+ info->attrs[LOCKD_A_SERVER_TCP_PORT] ||
+ info->attrs[LOCKD_A_SERVER_UDP_PORT]) {
+ attr = info->attrs[LOCKD_A_SERVER_GRACETIME];
+ if (attr) {
+ u32 gracetime = nla_get_u32(attr);
+
+ if (gracetime > nlm_grace_period_max)
+ return -EINVAL;
+
+ ln->gracetime = gracetime;
+
+ if (net == &init_net)
+ nlm_grace_period = gracetime;
+ }
+
+ attr = info->attrs[LOCKD_A_SERVER_TCP_PORT];
+ if (attr) {
+ ln->tcp_port = nla_get_u16(attr);
+ if (net == &init_net)
+ nlm_tcpport = ln->tcp_port;
+ }
+
+ attr = info->attrs[LOCKD_A_SERVER_UDP_PORT];
+ if (attr) {
+ ln->udp_port = nla_get_u16(attr);
+ if (net == &init_net)
+ nlm_udpport = ln->udp_port;
+ }
+ }
+ return 0;
+}
+
+/**
+ * lockd_nl_server_get_doit - get lockd server parameters via netlink
+ * @skb: reply buffer
+ * @info: netlink metadata and command arguments
+ *
+ * Return 0 on success or a negative errno.
+ */
+int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
+ void *hdr;
+ int err;
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(skb, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_free_msg;
+ }
+
+ err = nla_put_u32(skb, LOCKD_A_SERVER_GRACETIME, ln->gracetime) ||
+ nla_put_u16(skb, LOCKD_A_SERVER_TCP_PORT, ln->tcp_port) ||
+ nla_put_u16(skb, LOCKD_A_SERVER_UDP_PORT, ln->udp_port);
+ if (err)
+ goto err_free_msg;
+
+ genlmsg_end(skb, hdr);
+
+ return genlmsg_reply(skb, info);
+err_free_msg:
+ nlmsg_free(skb);
+
+ return err;
+}
diff --git a/fs/locks.c b/fs/locks.c
index 1619cddfa7a4..559f02aa4172 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -712,7 +712,7 @@ static void __locks_wake_up_blocks(struct file_lock_core *blocker)
fl->fl_lmops && fl->fl_lmops->lm_notify)
fl->fl_lmops->lm_notify(fl);
else
- locks_wake_up(fl);
+ locks_wake_up_waiter(waiter);
/*
* The setting of flc_blocker to NULL marks the "done"
@@ -1794,7 +1794,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **pr
/*
* In the delegation case we need mutual exclusion with
- * a number of operations that take the i_mutex. We trylock
+ * a number of operations that take the i_rwsem. We trylock
* because delegations are an optional optimization, and if
* there's some chance of a conflict--we'd rather not
* bother, maybe that's a sign this just isn't a good file to
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index dd2a425b41f0..19052fc47e9e 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -45,7 +45,7 @@ static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct address_space *mapping = folio->mapping;
struct inode *dir = mapping->host;
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 906d192ab7f3..dca7ac71f049 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -17,7 +17,7 @@ const struct file_operations minix_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f007e389d5d2..df9d11479caf 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -442,9 +442,10 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int minix_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int minix_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 5d9c1406fe27..8938536d8d3c 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -104,15 +104,15 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir,
return add_nondir(dentry, inode);
}
-static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode * inode;
int err;
inode = minix_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode_inc_link_count(dir);
minix_set_inode(inode, 0);
@@ -128,7 +128,7 @@ static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir,
d_instantiate(dentry, inode);
out:
- return err;
+ return ERR_PTR(err);
out_fail:
inode_dec_link_count(inode);
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 7b1df8cc2821..a37991fdb194 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -6,6 +6,7 @@
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/seq_file.h>
#include "internal.h"
@@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap)
free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);
+
+int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map)
+{
+ struct uid_gid_map *map, *map_up;
+ u32 idx, nr_mappings;
+
+ if (!is_valid_mnt_idmap(idmap))
+ return 0;
+
+ /*
+ * Idmappings are shown relative to the caller's idmapping.
+ * This is both the most intuitive and most useful solution.
+ */
+ if (uid_map) {
+ map = &idmap->uid_map;
+ map_up = &current_user_ns()->uid_map;
+ } else {
+ map = &idmap->gid_map;
+ map_up = &current_user_ns()->gid_map;
+ }
+
+ for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) {
+ uid_t lower;
+ struct uid_gid_extent *extent;
+
+ if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+ extent = &map->extent[idx];
+ else
+ extent = &map->forward[idx];
+
+ /*
+ * Verify that the whole range of the mapping can be
+ * resolved in the caller's idmapping. If it cannot be
+ * resolved skip the mapping.
+ */
+ lower = map_id_range_up(map_up, extent->lower_first, extent->count);
+ if (lower == (uid_t) -1)
+ continue;
+
+ seq_printf(seq, "%u %u %u", extent->first, lower, extent->count);
+
+ seq->count++; /* mappings are separated by \0 */
+ if (seq_has_overflowed(seq))
+ return -EAGAIN;
+
+ nr_mappings++;
+ }
+
+ return nr_mappings;
+}
diff --git a/fs/mount.h b/fs/mount.h
index ffb613cdfeee..97737051a8b9 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -5,6 +5,8 @@
#include <linux/ns_common.h>
#include <linux/fs_pin.h>
+extern struct list_head notify_list;
+
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
@@ -20,7 +22,12 @@ struct mnt_namespace {
wait_queue_head_t poll;
struct rcu_head mnt_ns_rcu;
};
+ u64 seq_origin; /* Sequence number of origin mount namespace */
u64 event;
+#ifdef CONFIG_FSNOTIFY
+ __u32 n_fsnotify_mask;
+ struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
+#endif
unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */
@@ -37,7 +44,6 @@ struct mountpoint {
struct hlist_node m_hash;
struct dentry *m_dentry;
struct hlist_head m_list;
- int m_count;
};
struct mount {
@@ -63,8 +69,8 @@ struct mount {
struct list_head mnt_list;
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
- struct list_head mnt_slave_list;/* list of slave mounts */
- struct list_head mnt_slave; /* slave list entry */
+ struct hlist_head mnt_slave_list;/* list of slave mounts */
+ struct hlist_node mnt_slave; /* slave list entry */
struct mount *mnt_master; /* slave is on master->mnt_slave_list */
struct mnt_namespace *mnt_ns; /* containing namespace */
struct mountpoint *mnt_mp; /* where is it mounted */
@@ -72,19 +78,38 @@ struct mount {
struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
struct hlist_node mnt_umount;
};
- struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
+ struct list_head to_notify; /* need to queue notification */
+ struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */
#endif
+ int mnt_t_flags; /* namespace_sem-protected flags */
int mnt_id; /* mount identifier, reused */
u64 mnt_id_unique; /* mount ID unique until reboot */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
struct hlist_head mnt_stuck_children;
+ struct mount *overmount; /* mounted on ->mnt_root */
} __randomize_layout;
+enum {
+ T_SHARED = 1, /* mount is shared */
+ T_UNBINDABLE = 2, /* mount is unbindable */
+ T_MARKED = 4, /* internal mark for propagate_... */
+ T_UMOUNT_CANDIDATE = 8, /* for propagate_umount */
+
+ /*
+ * T_SHARED_MASK is the set of flags that should be cleared when a
+ * mount becomes shared. Currently, this is only the flag that says a
+ * mount cannot be bind mounted, since this is how we create a mount
+ * that shares events with another mount. If you add a new T_*
+ * flag, consider how it interacts with shared mounts.
+ */
+ T_SHARED_MASK = T_UNBINDABLE,
+};
+
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
static inline struct mount *real_mount(struct vfsmount *mnt)
@@ -92,7 +117,7 @@ static inline struct mount *real_mount(struct vfsmount *mnt)
return container_of(mnt, struct mount, mnt);
}
-static inline int mnt_has_parent(struct mount *mnt)
+static inline int mnt_has_parent(const struct mount *mnt)
{
return mnt != mnt->mnt_parent;
}
@@ -137,8 +162,8 @@ struct proc_mounts {
extern const struct seq_operations mounts_op;
-extern bool __is_local_mountpoint(struct dentry *dentry);
-static inline bool is_local_mountpoint(struct dentry *dentry)
+extern bool __is_local_mountpoint(const struct dentry *dentry);
+static inline bool is_local_mountpoint(const struct dentry *dentry)
{
if (!d_mountpoint(dentry))
return false;
@@ -151,12 +176,24 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0;
}
+static inline bool anon_ns_root(const struct mount *m)
+{
+ struct mnt_namespace *ns = READ_ONCE(m->mnt_ns);
+
+ return !IS_ERR_OR_NULL(ns) && is_anon_ns(ns) && m == ns->root;
+}
+
static inline bool mnt_ns_attached(const struct mount *mnt)
{
return !RB_EMPTY_NODE(&mnt->mnt_node);
}
-static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
+{
+ return RB_EMPTY_ROOT(&ns->mounts);
+}
+
+static inline void move_from_ns(struct mount *mnt)
{
struct mnt_namespace *ns = mnt->mnt_ns;
WARN_ON(!mnt_ns_attached(mnt));
@@ -166,7 +203,6 @@ static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
ns->mnt_first_node = rb_next(&mnt->mnt_node);
rb_erase(&mnt->mnt_node, &ns->mounts);
RB_CLEAR_NODE(&mnt->mnt_node);
- list_add_tail(&mnt->mnt_list, dt_list);
}
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
@@ -177,3 +213,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
return container_of(ns, struct mnt_namespace, ns);
}
+
+#ifdef CONFIG_FSNOTIFY
+static inline void mnt_notify_add(struct mount *m)
+{
+ /* Optimize the case where there are no watches */
+ if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) ||
+ (m->prev_ns && m->prev_ns->n_fsnotify_marks))
+ list_add_tail(&m->to_notify, &notify_list);
+ else
+ m->prev_ns = m->mnt_ns;
+}
+#else
+static inline void mnt_notify_add(struct mount *m)
+{
+}
+#endif
+
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 82aecf372743..c5fd821fd30e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
* don't make any buffers if there is only one buffer on
* the folio and the folio just needs to be set up to date
*/
- if (inode->i_blkbits == PAGE_SHIFT &&
+ if (inode->i_blkbits == folio_shift(folio) &&
buffer_uptodate(bh)) {
folio_mark_uptodate(folio);
return;
@@ -153,7 +153,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
struct folio *folio = args->folio;
struct inode *inode = folio->mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
const unsigned blocksize = 1 << blkbits;
struct buffer_head *map_bh = &args->map_bh;
sector_t block_in_file;
@@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
sector_t last_block_in_file;
sector_t first_block;
unsigned page_block;
- unsigned first_hole = blocks_per_page;
+ unsigned first_hole = blocks_per_folio;
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
@@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
unsigned relative_block;
gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- /* MAX_BUF_PER_PAGE, for example */
- VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
if (args->is_readahead) {
opf |= REQ_RAHEAD;
gfp |= __GFP_NORETRY | __GFP_NOWARN;
@@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (folio_buffers(folio))
goto confused;
- block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
- last_block = block_in_file + args->nr_pages * blocks_per_page;
+ block_in_file = folio_pos(folio) >> blkbits;
+ last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits);
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
@@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
clear_buffer_mapped(map_bh);
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
* Then do more get_blocks calls until we are done with this folio.
*/
map_bh->b_folio = folio;
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
map_bh->b_state = 0;
map_bh->b_size = 0;
@@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (!buffer_mapped(map_bh)) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
@@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
goto confused;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -260,7 +257,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
if (relative_block == nblocks) {
clear_buffer_mapped(map_bh);
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -268,8 +265,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
bdev = map_bh->b_bdev;
}
- if (first_hole != blocks_per_page) {
- folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
+ if (first_hole != blocks_per_folio) {
+ folio_zero_segment(folio, first_hole << blkbits, folio_size(folio));
if (first_hole == 0) {
folio_mark_uptodate(folio);
folio_unlock(folio);
@@ -303,10 +300,10 @@ alloc_new:
relative_block = block_in_file - args->first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
- (first_hole != blocks_per_page))
+ (first_hole != blocks_per_folio))
args->bio = mpage_bio_submit_read(args->bio);
else
- args->last_block_in_bio = first_block + blocks_per_page - 1;
+ args->last_block_in_bio = first_block + blocks_per_folio - 1;
out:
return args->bio;
@@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
{
struct mpage_readpage_args args = {
.folio = folio,
- .nr_pages = 1,
+ .nr_pages = folio_nr_pages(folio),
.get_block = get_block,
};
@@ -448,20 +445,19 @@ static void clean_buffers(struct folio *folio, unsigned first_unmapped)
try_to_free_buffers(folio);
}
-static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
- void *data)
+static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio,
+ struct mpage_data *mpd)
{
- struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+ const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
sector_t last_block;
sector_t block_in_file;
sector_t first_block;
unsigned page_block;
- unsigned first_unmapped = blocks_per_page;
+ unsigned first_unmapped = blocks_per_folio;
struct block_device *bdev = NULL;
int boundary = 0;
sector_t boundary_block = 0;
@@ -486,12 +482,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
*/
if (buffer_dirty(bh))
goto confused;
- if (first_unmapped == blocks_per_page)
+ if (first_unmapped == blocks_per_folio)
first_unmapped = page_block;
continue;
}
- if (first_unmapped != blocks_per_page)
+ if (first_unmapped != blocks_per_folio)
goto confused; /* hole -> non-hole */
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
@@ -527,7 +523,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
* The page has no buffers: map it to disk
*/
BUG_ON(!folio_test_uptodate(folio));
- block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
+ block_in_file = folio_pos(folio) >> blkbits;
/*
* Whole page beyond EOF? Skip allocating blocks to avoid leaking
* space.
@@ -536,7 +532,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
goto page_is_mapped;
last_block = (i_size - 1) >> blkbits;
map_bh.b_folio = folio;
- for (page_block = 0; page_block < blocks_per_page; ) {
+ for (page_block = 0; page_block < blocks_per_folio; ) {
map_bh.b_state = 0;
map_bh.b_size = 1 << blkbits;
@@ -618,14 +614,14 @@ alloc_new:
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
folio_unlock(folio);
- if (boundary || (first_unmapped != blocks_per_page)) {
+ if (boundary || (first_unmapped != blocks_per_folio)) {
bio = mpage_bio_submit_write(bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
}
} else {
- mpd->last_block_in_bio = first_block + blocks_per_page - 1;
+ mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
}
goto out;
@@ -659,14 +655,16 @@ mpage_writepages(struct address_space *mapping,
struct mpage_data mpd = {
.get_block = get_block,
};
+ struct folio *folio = NULL;
struct blk_plug plug;
- int ret;
+ int error;
blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = mpage_write_folio(wbc, folio, &mpd);
if (mpd.bio)
mpage_bio_submit_write(mpd.bio);
blk_finish_plug(&plug);
- return ret;
+ return error;
}
EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/namei.c b/fs/namei.c
index 3ab9440c5b93..cd43ff89fbaa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,6 +125,13 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
+static inline void initname(struct filename *name, const char __user *uptr)
+{
+ name->uptr = uptr;
+ name->aname = NULL;
+ atomic_set(&name->refcnt, 1);
+}
+
struct filename *
getname_flags(const char __user *filename, int flags)
{
@@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
-
- atomic_set(&result->refcnt, 1);
- result->uptr = filename;
- result->aname = NULL;
+ initname(result, filename);
audit_getname(result);
return result;
}
@@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags)
return getname_flags(filename, flags);
}
-struct filename *getname(const char __user * filename)
-{
- return getname_flags(filename, 0);
-}
-
struct filename *__getname_maybe_null(const char __user *pathname)
{
struct filename *name;
@@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- result->uptr = NULL;
- result->aname = NULL;
- atomic_set(&result->refcnt, 1);
+ initname(result, NULL);
audit_getname(result);
-
return result;
}
EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
+ int refcnt;
+
if (IS_ERR_OR_NULL(name))
return;
- if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
- return;
+ refcnt = atomic_read(&name->refcnt);
+ if (refcnt != 1) {
+ if (WARN_ON_ONCE(!refcnt))
+ return;
- if (!atomic_dec_and_test(&name->refcnt))
- return;
+ if (!atomic_dec_and_test(&name->refcnt))
+ return;
+ }
if (name->name != name->iname) {
__putname(name->name);
@@ -570,14 +571,14 @@ int inode_permission(struct mnt_idmap *idmap,
int retval;
retval = sb_permission(inode->i_sb, inode, mask);
- if (retval)
+ if (unlikely(retval))
return retval;
if (unlikely(mask & MAY_WRITE)) {
/*
* Nobody gets write access to an immutable file.
*/
- if (IS_IMMUTABLE(inode))
+ if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
/*
@@ -585,16 +586,16 @@ int inode_permission(struct mnt_idmap *idmap,
* written back improperly if their true value is unknown
* to the vfs.
*/
- if (HAS_UNMAPPED_ID(idmap, inode))
+ if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
return -EACCES;
}
retval = do_inode_permission(idmap, inode, mask);
- if (retval)
+ if (unlikely(retval))
return retval;
retval = devcgroup_inode_permission(inode, mask);
- if (retval)
+ if (unlikely(retval))
return retval;
return security_inode_permission(inode, mask);
@@ -1011,10 +1012,10 @@ static int set_root(struct nameidata *nd)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
nd->root = fs->root;
nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
} else {
get_fs_root(fs, &nd->root);
nd->state |= ND_ROOT_GRABBED;
@@ -1468,7 +1469,7 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
int ret = 0;
while (flags & DCACHE_MANAGED_DENTRY) {
- /* Allow the filesystem to manage the transit without i_mutex
+ /* Allow the filesystem to manage the transit without i_rwsem
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
ret = path->dentry->d_op->d_manage(path, false);
@@ -1670,19 +1671,22 @@ static struct dentry *lookup_dcache(const struct qstr *name,
* dentries - as the matter of fact, this only gets called
* when directory is guaranteed to have no in-lookup children
* at all.
+ * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
+ * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
*/
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
- struct dentry *base,
- unsigned int flags)
+ struct dentry *base, unsigned int flags)
{
- struct dentry *dentry = lookup_dcache(name, base, flags);
+ struct dentry *dentry;
struct dentry *old;
- struct inode *dir = base->d_inode;
+ struct inode *dir;
+ dentry = lookup_dcache(name, base, flags);
if (dentry)
- return dentry;
+ goto found;
/* Don't create child dentry for a dead directory. */
+ dir = base->d_inode;
if (unlikely(IS_DEADDIR(dir)))
return ERR_PTR(-ENOENT);
@@ -1695,6 +1699,17 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name,
dput(dentry);
dentry = old;
}
+found:
+ if (IS_ERR(dentry))
+ return dentry;
+ if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
+ dput(dentry);
+ return ERR_PTR(-ENOENT);
+ }
+ if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
+ dput(dentry);
+ return ERR_PTR(-EEXIST);
+ }
return dentry;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
@@ -1891,13 +1906,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
return ERR_PTR(-ELOOP);
- if (!(nd->flags & LOOKUP_RCU)) {
+ if (unlikely(atime_needs_update(&last->link, inode))) {
+ if (nd->flags & LOOKUP_RCU) {
+ if (!try_to_unlazy(nd))
+ return ERR_PTR(-ECHILD);
+ }
touch_atime(&last->link);
cond_resched();
- } else if (atime_needs_update(&last->link, inode)) {
- if (!try_to_unlazy(nd))
- return ERR_PTR(-ECHILD);
- touch_atime(&last->link);
}
error = security_inode_follow_link(link->dentry, inode,
@@ -2410,9 +2425,12 @@ static int link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
- while (*name=='/')
- name++;
- if (!*name) {
+ if (*name == '/') {
+ do {
+ name++;
+ } while (unlikely(*name == '/'));
+ }
+ if (unlikely(!*name)) {
nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
return 0;
}
@@ -2425,7 +2443,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
idmap = mnt_idmap(nd->path.mnt);
err = may_lookup(idmap, nd);
- if (err)
+ if (unlikely(err))
return err;
nd->last.name = name;
@@ -2553,11 +2571,11 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
unsigned seq;
do {
- seq = read_seqcount_begin(&fs->seq);
+ seq = read_seqbegin(&fs->seq);
nd->path = fs->pwd;
nd->inode = nd->path.dentry->d_inode;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
- } while (read_seqcount_retry(&fs->seq, seq));
+ } while (read_seqretry(&fs->seq, seq));
} else {
get_fs_pwd(current->fs, &nd->path);
nd->inode = nd->path.dentry->d_inode;
@@ -2728,23 +2746,48 @@ static int filename_parentat(int dfd, struct filename *name,
/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
+ struct path parent_path __free(path_put) = {};
struct dentry *d;
struct qstr last;
int type, error;
- error = filename_parentat(dfd, name, 0, path, &last, &type);
+ error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);
if (error)
return ERR_PTR(error);
- if (unlikely(type != LAST_NORM)) {
- path_put(path);
+ if (unlikely(type != LAST_NORM))
return ERR_PTR(-EINVAL);
+ inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
+ if (IS_ERR(d)) {
+ inode_unlock(parent_path.dentry->d_inode);
+ return d;
}
- inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, path->dentry, 0);
+ path->dentry = no_free_ptr(parent_path.dentry);
+ path->mnt = no_free_ptr(parent_path.mnt);
+ return d;
+}
+
+struct dentry *kern_path_locked_negative(const char *name, struct path *path)
+{
+ struct path parent_path __free(path_put) = {};
+ struct filename *filename __free(putname) = getname_kernel(name);
+ struct dentry *d;
+ struct qstr last;
+ int type, error;
+
+ error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type);
+ if (error)
+ return ERR_PTR(error);
+ if (unlikely(type != LAST_NORM))
+ return ERR_PTR(-EINVAL);
+ inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE);
if (IS_ERR(d)) {
- inode_unlock(path->dentry->d_inode);
- path_put(path);
+ inode_unlock(parent_path.dentry->d_inode);
+ return d;
}
+ path->dentry = no_free_ptr(parent_path.dentry);
+ path->mnt = no_free_ptr(parent_path.mnt);
return d;
}
@@ -2820,13 +2863,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
}
EXPORT_SYMBOL(vfs_path_lookup);
-static int lookup_one_common(struct mnt_idmap *idmap,
- const char *name, struct dentry *base, int len,
- struct qstr *this)
+static int lookup_noperm_common(struct qstr *qname, struct dentry *base)
{
- this->name = name;
- this->len = len;
- this->hash = full_name_hash(base, name, len);
+ const char *name = qname->name;
+ u32 len = qname->len;
+
+ qname->hash = full_name_hash(base, name, len);
if (!len)
return -EACCES;
@@ -2843,140 +2885,136 @@ static int lookup_one_common(struct mnt_idmap *idmap,
* to use its own hash..
*/
if (base->d_flags & DCACHE_OP_HASH) {
- int err = base->d_op->d_hash(base, this);
+ int err = base->d_op->d_hash(base, qname);
if (err < 0)
return err;
}
+ return 0;
+}
+static int lookup_one_common(struct mnt_idmap *idmap,
+ struct qstr *qname, struct dentry *base)
+{
+ int err;
+ err = lookup_noperm_common(qname, base);
+ if (err < 0)
+ return err;
return inode_permission(idmap, base->d_inode, MAY_EXEC);
}
/**
- * try_lookup_one_len - filesystem helper to lookup single pathname component
- * @name: pathname component to lookup
+ * try_lookup_noperm - filesystem helper to lookup single pathname component
+ * @name: qstr storing pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* Look up a dentry by name in the dcache, returning NULL if it does not
- * currently exist. The function does not try to create a dentry.
+ * currently exist. The function does not try to create a dentry and if one
+ * is found it doesn't try to revalidate it.
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * not be called by generic code. It does no permission checking.
+ *
+ * No locks need be held - only a counted reference to @base is needed.
*
- * The caller must hold base->i_mutex.
*/
-struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base)
{
- struct qstr this;
int err;
- WARN_ON_ONCE(!inode_is_locked(base->d_inode));
-
- err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
+ err = lookup_noperm_common(name, base);
if (err)
return ERR_PTR(err);
- return lookup_dcache(&this, base, 0);
+ return d_lookup(base, name);
}
-EXPORT_SYMBOL(try_lookup_one_len);
+EXPORT_SYMBOL(try_lookup_noperm);
/**
- * lookup_one_len - filesystem helper to lookup single pathname component
- * @name: pathname component to lookup
+ * lookup_noperm - filesystem helper to lookup single pathname component
+ * @name: qstr storing pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * not be called by generic code. It does no permission checking.
*
- * The caller must hold base->i_mutex.
+ * The caller must hold base->i_rwsem.
*/
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_noperm(struct qstr *name, struct dentry *base)
{
struct dentry *dentry;
- struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
+ err = lookup_noperm_common(name, base);
if (err)
return ERR_PTR(err);
- dentry = lookup_dcache(&this, base, 0);
- return dentry ? dentry : __lookup_slow(&this, base, 0);
+ dentry = lookup_dcache(name, base, 0);
+ return dentry ? dentry : __lookup_slow(name, base, 0);
}
-EXPORT_SYMBOL(lookup_one_len);
+EXPORT_SYMBOL(lookup_noperm);
/**
- * lookup_one - filesystem helper to lookup single pathname component
+ * lookup_one - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
- * @name: pathname component to lookup
+ * @name: qstr holding pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * This can be used for in-kernel filesystem clients such as file servers.
*
- * The caller must hold base->i_mutex.
+ * The caller must hold base->i_rwsem.
*/
-struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
+ struct dentry *base)
{
struct dentry *dentry;
- struct qstr this;
int err;
WARN_ON_ONCE(!inode_is_locked(base->d_inode));
- err = lookup_one_common(idmap, name, base, len, &this);
+ err = lookup_one_common(idmap, name, base);
if (err)
return ERR_PTR(err);
- dentry = lookup_dcache(&this, base, 0);
- return dentry ? dentry : __lookup_slow(&this, base, 0);
+ dentry = lookup_dcache(name, base, 0);
+ return dentry ? dentry : __lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_one);
/**
- * lookup_one_unlocked - filesystem helper to lookup single pathname component
+ * lookup_one_unlocked - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
- * @name: pathname component to lookup
+ * @name: qstr olding pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * This can be used for in-kernel filesystem clients such as file servers.
*
- * Unlike lookup_one_len, it should be called without the parent
- * i_mutex held, and will take the i_mutex itself if necessary.
+ * Unlike lookup_one, it should be called without the parent
+ * i_rwsem held, and will take the i_rwsem itself if necessary.
*/
-struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
- const char *name, struct dentry *base,
- int len)
+struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
+ struct dentry *base)
{
- struct qstr this;
int err;
struct dentry *ret;
- err = lookup_one_common(idmap, name, base, len, &this);
+ err = lookup_one_common(idmap, name, base);
if (err)
return ERR_PTR(err);
- ret = lookup_dcache(&this, base, 0);
+ ret = lookup_dcache(name, base, 0);
if (!ret)
- ret = lookup_slow(&this, base, 0);
+ ret = lookup_slow(name, base, 0);
return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);
/**
- * lookup_one_positive_unlocked - filesystem helper to lookup single
- * pathname component
+ * lookup_one_positive_unlocked - lookup single pathname component
* @idmap: idmap of the mount the lookup is performed from
- * @name: pathname component to lookup
+ * @name: qstr holding pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
* known positive or ERR_PTR(). This is what most of the users want.
@@ -2985,16 +3023,15 @@ EXPORT_SYMBOL(lookup_one_unlocked);
* time, so callers of lookup_one_unlocked() need to be very careful; pinned
* positives have >d_inode stable, so this one avoids such problems.
*
- * Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * This can be used for in-kernel filesystem clients such as file servers.
*
- * The helper should be called without i_mutex held.
+ * The helper should be called without i_rwsem held.
*/
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
- const char *name,
- struct dentry *base, int len)
+ struct qstr *name,
+ struct dentry *base)
{
- struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);
+ struct dentry *ret = lookup_one_unlocked(idmap, name, base);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
dput(ret);
@@ -3005,38 +3042,56 @@ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
EXPORT_SYMBOL(lookup_one_positive_unlocked);
/**
- * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
- * @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code.
+ * not be called by generic code. It does no permission checking.
*
- * Unlike lookup_one_len, it should be called without the parent
- * i_mutex held, and will take the i_mutex itself if necessary.
+ * Unlike lookup_noperm(), it should be called without the parent
+ * i_rwsem held, and will take the i_rwsem itself if necessary.
+ *
+ * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already
+ * existed.
*/
-struct dentry *lookup_one_len_unlocked(const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
- return lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
+ struct dentry *ret;
+ int err;
+
+ err = lookup_noperm_common(name, base);
+ if (err)
+ return ERR_PTR(err);
+
+ ret = lookup_dcache(name, base, 0);
+ if (!ret)
+ ret = lookup_slow(name, base, 0);
+ return ret;
}
-EXPORT_SYMBOL(lookup_one_len_unlocked);
+EXPORT_SYMBOL(lookup_noperm_unlocked);
/*
- * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
+ * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
* on negatives. Returns known positive or ERR_PTR(); that's what
* most of the users want. Note that pinned negative with unlocked parent
- * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
+ * _can_ become positive at any time, so callers of lookup_noperm_unlocked()
* need to be very careful; pinned positives have ->d_inode stable, so
* this one avoids such problems.
*/
-struct dentry *lookup_positive_unlocked(const char *name,
- struct dentry *base, int len)
+struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
+ struct dentry *base)
{
- return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
+ struct dentry *ret;
+
+ ret = lookup_noperm_unlocked(name, base);
+ if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
}
-EXPORT_SYMBOL(lookup_positive_unlocked);
+EXPORT_SYMBOL(lookup_noperm_positive_unlocked);
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
@@ -3415,6 +3470,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path,
if ((acc_mode & MAY_EXEC) && path_noexec(path))
return -EACCES;
break;
+ default:
+ VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode);
}
error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
@@ -3995,7 +4052,7 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- fput(file);
+ fput_close(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -4078,27 +4135,13 @@ static struct dentry *filename_create(int dfd, struct filename *name,
* '/', and a directory wasn't requested.
*/
if (last.name[last.len] && !want_dir)
- create_flags = 0;
+ create_flags &= ~LOOKUP_CREATE;
inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
dentry = lookup_one_qstr_excl(&last, path->dentry,
reval_flag | create_flags);
if (IS_ERR(dentry))
goto unlock;
- error = -EEXIST;
- if (d_is_positive(dentry))
- goto fail;
-
- /*
- * Special case - lookup gave negative, but... we had foo/bar/
- * From the vfs_mknod() POV we just have a negative dentry -
- * all is fine. Let's be bastards - you had / on the end, you've
- * been asking for (non-existent) directory. -ENOENT for you.
- */
- if (unlikely(!create_flags)) {
- error = -ENOENT;
- goto fail;
- }
if (unlikely(err2)) {
error = err2;
goto fail;
@@ -4129,7 +4172,8 @@ EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
{
- dput(dentry);
+ if (!IS_ERR(dentry))
+ dput(dentry);
inode_unlock(path->dentry->d_inode);
mnt_drop_write(path->mnt);
path_put(path);
@@ -4275,7 +4319,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
}
/**
- * vfs_mkdir - create directory
+ * vfs_mkdir - create directory returning correct dentry if possible
* @idmap: idmap of the mount the inode was found from
* @dir: inode of the parent directory
* @dentry: dentry of the child directory
@@ -4288,32 +4332,51 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
* raw inode simply pass @nop_mnt_idmap.
+ *
+ * In the event that the filesystem does not use the *@dentry but leaves it
+ * negative or unhashes it and possibly splices a different one returning it,
+ * the original dentry is dput() and the alternate is returned.
+ *
+ * In case of an error the dentry is dput() and an ERR_PTR() is returned.
*/
-int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
unsigned max_links = dir->i_sb->s_max_links;
+ struct dentry *de;
error = may_create(idmap, dir, dentry);
if (error)
- return error;
+ goto err;
+ error = -EPERM;
if (!dir->i_op->mkdir)
- return -EPERM;
+ goto err;
mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
- return error;
+ goto err;
+ error = -EMLINK;
if (max_links && dir->i_nlink >= max_links)
- return -EMLINK;
+ goto err;
- error = dir->i_op->mkdir(idmap, dir, dentry, mode);
- if (!error)
- fsnotify_mkdir(dir, dentry);
- return error;
+ de = dir->i_op->mkdir(idmap, dir, dentry, mode);
+ error = PTR_ERR(de);
+ if (IS_ERR(de))
+ goto err;
+ if (de) {
+ dput(dentry);
+ dentry = de;
+ }
+ fsnotify_mkdir(dir, dentry);
+ return dentry;
+
+err:
+ dput(dentry);
+ return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_mkdir);
@@ -4333,8 +4396,10 @@ retry:
error = security_path_mkdir(&path, dentry,
mode_strip_umask(path.dentry->d_inode, mode));
if (!error) {
- error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
+ dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
dentry, mode);
+ if (IS_ERR(dentry))
+ error = PTR_ERR(dentry);
}
done_path_create(&path, dentry);
if (retry_estale(error, lookup_flags)) {
@@ -4445,10 +4510,6 @@ retry:
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto exit3;
- if (!dentry->d_inode) {
- error = -ENOENT;
- goto exit4;
- }
error = security_path_rmdir(&path, dentry);
if (error)
goto exit4;
@@ -4481,13 +4542,13 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* @dentry: victim
* @delegated_inode: returns victim inode, if the inode is delegated.
*
- * The caller must hold dir->i_mutex.
+ * The caller must hold dir->i_rwsem exclusively.
*
* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
* return a reference to the inode in delegated_inode. The caller
* should then break the delegation on that inode and retry. Because
* breaking a delegation may take a long time, the caller should drop
- * dir->i_mutex before doing so.
+ * dir->i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -4546,7 +4607,7 @@ EXPORT_SYMBOL(vfs_unlink);
/*
* Make sure that the actual truncation of the file will occur outside its
- * directory's i_mutex. Truncate can take a long time if there is a lot of
+ * directory's i_rwsem. Truncate can take a long time if there is a lot of
* writeout happening, and we don't want to prevent access to the directory
* while waiting on the I/O.
*/
@@ -4579,7 +4640,7 @@ retry_deleg:
if (!IS_ERR(dentry)) {
/* Why not before? Because we want correct error value */
- if (last.name[last.len] || d_is_negative(dentry))
+ if (last.name[last.len])
goto slashes;
inode = dentry->d_inode;
ihold(inode);
@@ -4613,9 +4674,7 @@ exit1:
return error;
slashes:
- if (d_is_negative(dentry))
- error = -ENOENT;
- else if (d_is_dir(dentry))
+ if (d_is_dir(dentry))
error = -EISDIR;
else
error = -ENOTDIR;
@@ -4726,13 +4785,13 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* @new_dentry: where to create the new link
* @delegated_inode: returns inode needing a delegation break
*
- * The caller must hold dir->i_mutex
+ * The caller must hold dir->i_rwsem exclusively.
*
* If vfs_link discovers a delegation on the to-be-linked file in need
* of breaking, it will return -EWOULDBLOCK and return a reference to the
* inode in delegated_inode. The caller should then break the delegation
* and retry. Because breaking a delegation may take a long time, the
- * caller should drop the i_mutex before doing so.
+ * caller should drop the i_rwsem before doing so.
*
* Alternatively, a caller may pass NULL for delegated_inode. This may
* be appropriate for callers that expect the underlying filesystem not
@@ -4928,7 +4987,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* c) we may have to lock up to _four_ objects - parents and victim (if it exists),
* and source (if it's a non-directory or a subdirectory that moves to
* different parent).
- * And that - after we got ->i_mutex on parents (until then we don't know
+ * And that - after we got ->i_rwsem on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
* only under ->s_vfs_rename_mutex _and_ that parent of the object we
@@ -4940,15 +4999,16 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* has no more than 1 dentry. If "hybrid" objects will ever appear,
* we'd better make sure that there's no link(2) for them.
* d) conversion from fhandle to dentry may come in the wrong moment - when
- * we are removing the target. Solution: we will have to grab ->i_mutex
+ * we are removing the target. Solution: we will have to grab ->i_rwsem
* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
- * ->i_mutex on parents, which works but leads to some truly excessive
+ * ->i_rwsem on parents, which works but leads to some truly excessive
* locking].
*/
int vfs_rename(struct renamedata *rd)
{
int error;
- struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
+ struct inode *old_dir = d_inode(rd->old_parent);
+ struct inode *new_dir = d_inode(rd->new_parent);
struct dentry *old_dentry = rd->old_dentry;
struct dentry *new_dentry = rd->new_dentry;
struct inode **delegated_inode = rd->delegated_inode;
@@ -5115,7 +5175,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
struct qstr old_last, new_last;
int old_type, new_type;
struct inode *delegated_inode = NULL;
- unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
+ unsigned int lookup_flags = 0, target_flags =
+ LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
bool should_retry = false;
int error = -EINVAL;
@@ -5128,6 +5189,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
if (flags & RENAME_EXCHANGE)
target_flags = 0;
+ if (flags & RENAME_NOREPLACE)
+ target_flags |= LOOKUP_EXCL;
retry:
error = filename_parentat(olddfd, from, lookup_flags, &old_path,
@@ -5169,23 +5232,12 @@ retry_deleg:
error = PTR_ERR(old_dentry);
if (IS_ERR(old_dentry))
goto exit3;
- /* source must exist */
- error = -ENOENT;
- if (d_is_negative(old_dentry))
- goto exit4;
new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
lookup_flags | target_flags);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry))
goto exit4;
- error = -EEXIST;
- if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
- goto exit5;
if (flags & RENAME_EXCHANGE) {
- error = -ENOENT;
- if (d_is_negative(new_dentry))
- goto exit5;
-
if (!d_is_dir(new_dentry)) {
error = -ENOTDIR;
if (new_last.name[new_last.len])
@@ -5215,10 +5267,10 @@ retry_deleg:
if (error)
goto exit5;
- rd.old_dir = old_path.dentry->d_inode;
+ rd.old_parent = old_path.dentry;
rd.old_dentry = old_dentry;
rd.old_mnt_idmap = mnt_idmap(old_path.mnt);
- rd.new_dir = new_path.dentry->d_inode;
+ rd.new_parent = new_path.dentry;
rd.new_dentry = new_dentry;
rd.new_mnt_idmap = mnt_idmap(new_path.mnt);
rd.delegated_inode = &delegated_inode;
@@ -5356,38 +5408,76 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
EXPORT_SYMBOL(vfs_get_link);
/* get the link contents into pagecache */
-const char *page_get_link(struct dentry *dentry, struct inode *inode,
- struct delayed_call *callback)
+static char *__page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
{
- char *kaddr;
- struct page *page;
+ struct folio *folio;
struct address_space *mapping = inode->i_mapping;
if (!dentry) {
- page = find_get_page(mapping, 0);
- if (!page)
+ folio = filemap_get_folio(mapping, 0);
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
- page = read_mapping_page(mapping, 0, NULL);
- if (IS_ERR(page))
- return (char*)page;
+ folio = read_mapping_folio(mapping, 0, NULL);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- set_delayed_call(callback, page_put_link, page);
+ set_delayed_call(callback, page_put_link, folio);
BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
- kaddr = page_address(page);
- nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
- return kaddr;
+ return folio_address(folio);
}
+const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ return __page_get_link(dentry, inode, callback);
+}
+EXPORT_SYMBOL_GPL(page_get_link_raw);
+
+/**
+ * page_get_link() - An implementation of the get_link inode_operation.
+ * @dentry: The directory entry which is the symlink.
+ * @inode: The inode for the symlink.
+ * @callback: Used to drop the reference to the symlink.
+ *
+ * Filesystems which store their symlinks in the page cache should use
+ * this to implement the get_link() member of their inode_operations.
+ *
+ * Return: A pointer to the NUL-terminated symlink.
+ */
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *callback)
+{
+ char *kaddr = __page_get_link(dentry, inode, callback);
+
+ if (!IS_ERR(kaddr))
+ nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
+ return kaddr;
+}
EXPORT_SYMBOL(page_get_link);
+/**
+ * page_put_link() - Drop the reference to the symlink.
+ * @arg: The folio which contains the symlink.
+ *
+ * This is used internally by page_get_link(). It is exported for use
+ * by filesystems which need to implement a variant of page_get_link()
+ * themselves. Despite the apparent symmetry, filesystems which use
+ * page_get_link() do not need to call page_put_link().
+ *
+ * The argument, while it has a void pointer type, must be a pointer to
+ * the folio which was retrieved from the page cache. The delayed_call
+ * infrastructure is used to drop the reference count once the caller
+ * is done with the symlink.
+ */
void page_put_link(void *arg)
{
- put_page(arg);
+ folio_put(arg);
}
EXPORT_SYMBOL(page_put_link);
diff --git a/fs/namespace.c b/fs/namespace.c
index a3ed3f2980cb..ae6d1312b184 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -79,17 +79,26 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
+static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+#ifdef CONFIG_FSNOTIFY
+LIST_HEAD(notify_list); /* protected by namespace_sem */
+#endif
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
+enum mount_kattr_flags_t {
+ MOUNT_KATTR_RECURSE = (1 << 0),
+ MOUNT_KATTR_IDMAP_REPLACE = (1 << 1),
+};
+
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
unsigned int propagation;
unsigned int lookup_flags;
- bool recurse;
+ enum mount_kattr_flags_t kflags;
struct user_namespace *mnt_userns;
struct mnt_idmap *mnt_idmap;
};
@@ -163,6 +172,7 @@ static void mnt_ns_release(struct mnt_namespace *ns)
{
/* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) {
+ fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
}
@@ -346,12 +356,13 @@ static struct mount *alloc_vfsmnt(const char *name)
if (err)
goto out_free_cache;
- if (name) {
+ if (name)
mnt->mnt_devname = kstrdup_const(name,
GFP_KERNEL_ACCOUNT);
- if (!mnt->mnt_devname)
- goto out_free_id;
- }
+ else
+ mnt->mnt_devname = "none";
+ if (!mnt->mnt_devname)
+ goto out_free_id;
#ifdef CONFIG_SMP
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
@@ -370,10 +381,9 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_expire);
INIT_LIST_HEAD(&mnt->mnt_share);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
- INIT_LIST_HEAD(&mnt->mnt_slave);
+ INIT_HLIST_HEAD(&mnt->mnt_slave_list);
+ INIT_HLIST_NODE(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
- INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
RB_CLEAR_NODE(&mnt->mnt_node);
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
@@ -778,15 +788,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
- smp_mb(); // see mntput_no_expire()
+ smp_mb(); // see mntput_no_expire() and do_umount()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
- if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
- mnt_add_count(mnt, -1);
- return 1;
- }
lock_mount_hash();
- if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+ if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
mnt_add_count(mnt, -1);
unlock_mount_hash();
return 1;
@@ -888,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path)
* namespace not just a mount that happens to have some specified
* parent mount.
*/
-bool __is_local_mountpoint(struct dentry *dentry)
+bool __is_local_mountpoint(const struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
struct mount *mnt, *n;
@@ -905,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry)
return is_covered;
}
-static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
+struct pinned_mountpoint {
+ struct hlist_node node;
+ struct mountpoint *mp;
+};
+
+static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
{
struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
- mp->m_count++;
- return mp;
+ hlist_add_head(&m->node, &mp->m_list);
+ m->mp = mp;
+ return true;
}
}
- return NULL;
+ return false;
}
-static struct mountpoint *get_mountpoint(struct dentry *dentry)
+static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
{
- struct mountpoint *mp, *new = NULL;
+ struct mountpoint *mp __free(kfree) = NULL;
+ bool found;
int ret;
if (d_mountpoint(dentry)) {
/* might be worth a WARN_ON() */
if (d_unlinked(dentry))
- return ERR_PTR(-ENOENT);
+ return -ENOENT;
mountpoint:
read_seqlock_excl(&mount_lock);
- mp = lookup_mountpoint(dentry);
+ found = lookup_mountpoint(dentry, m);
read_sequnlock_excl(&mount_lock);
- if (mp)
- goto done;
+ if (found)
+ return 0;
}
- if (!new)
- new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
- if (!new)
- return ERR_PTR(-ENOMEM);
-
+ if (!mp)
+ mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+ if (!mp)
+ return -ENOMEM;
/* Exactly one processes may set d_mounted */
ret = d_set_mounted(dentry);
@@ -950,34 +962,28 @@ mountpoint:
goto mountpoint;
/* The dentry is not available as a mountpoint? */
- mp = ERR_PTR(ret);
if (ret)
- goto done;
+ return ret;
/* Add the new mountpoint to the hash table */
read_seqlock_excl(&mount_lock);
- new->m_dentry = dget(dentry);
- new->m_count = 1;
- hlist_add_head(&new->m_hash, mp_hash(dentry));
- INIT_HLIST_HEAD(&new->m_list);
+ mp->m_dentry = dget(dentry);
+ hlist_add_head(&mp->m_hash, mp_hash(dentry));
+ INIT_HLIST_HEAD(&mp->m_list);
+ hlist_add_head(&m->node, &mp->m_list);
+ m->mp = no_free_ptr(mp);
read_sequnlock_excl(&mount_lock);
-
- mp = new;
- new = NULL;
-done:
- kfree(new);
- return mp;
+ return 0;
}
/*
* vfsmount lock must be held. Additionally, the caller is responsible
* for serializing calls for given disposal list.
*/
-static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
+static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)
{
- if (!--mp->m_count) {
+ if (hlist_empty(&mp->m_list)) {
struct dentry *dentry = mp->m_dentry;
- BUG_ON(!hlist_empty(&mp->m_list));
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
@@ -987,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
}
}
-/* called with namespace_lock and vfsmount lock */
-static void put_mountpoint(struct mountpoint *mp)
+/*
+ * locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
+ */
+static void unpin_mountpoint(struct pinned_mountpoint *m)
{
- __put_mountpoint(mp, &ex_mountpoints);
+ if (m->mp) {
+ hlist_del(&m->node);
+ maybe_free_mountpoint(m->mp, &ex_mountpoints);
+ }
}
static inline int check_mnt(struct mount *mnt)
@@ -998,6 +1009,17 @@ static inline int check_mnt(struct mount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
+static inline bool check_anonymous_mnt(struct mount *mnt)
+{
+ u64 seq;
+
+ if (!is_anon_ns(mnt->mnt_ns))
+ return false;
+
+ seq = mnt->mnt_ns->seq_origin;
+ return !seq || (seq == current->nsproxy->mnt_ns->seq);
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1021,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
}
/*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock]
*/
-static struct mountpoint *unhash_mnt(struct mount *mnt)
+static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
{
struct mountpoint *mp;
+ struct mount *parent = mnt->mnt_parent;
+ if (unlikely(parent->overmount == mnt))
+ parent->overmount = NULL;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
@@ -1033,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
hlist_del_init(&mnt->mnt_mp_list);
mp = mnt->mnt_mp;
mnt->mnt_mp = NULL;
- return mp;
+ maybe_free_mountpoint(mp, shrink_list);
}
/*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
*/
static void umount_mnt(struct mount *mnt)
{
- put_mountpoint(unhash_mnt(mnt));
+ __umount_mnt(mnt, &ex_mountpoints);
}
/*
@@ -1051,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt,
struct mountpoint *mp,
struct mount *child_mnt)
{
- mp->m_count++;
- mnt_add_count(mnt, 1); /* essentially, that's mntget */
child_mnt->mnt_mountpoint = mp->m_dentry;
child_mnt->mnt_parent = mnt;
child_mnt->mnt_mp = mp;
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
-/**
- * mnt_set_mountpoint_beneath - mount a mount beneath another one
- *
- * @new_parent: the source mount
- * @top_mnt: the mount beneath which @new_parent is mounted
- * @new_mp: the new mountpoint of @top_mnt on @new_parent
- *
- * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
- * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
- * @new_mp. And mount @new_parent on the old parent and old
- * mountpoint of @top_mnt.
- *
- * Context: This function expects namespace_lock() and lock_mount_hash()
- * to have been acquired in that order.
- */
-static void mnt_set_mountpoint_beneath(struct mount *new_parent,
- struct mount *top_mnt,
- struct mountpoint *new_mp)
-{
- struct mount *old_top_parent = top_mnt->mnt_parent;
- struct mountpoint *old_top_mp = top_mnt->mnt_mp;
-
- mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
- mnt_change_mountpoint(new_parent, new_mp, top_mnt);
-}
-
-
-static void __attach_mnt(struct mount *mnt, struct mount *parent)
+static void make_visible(struct mount *mnt)
{
+ struct mount *parent = mnt->mnt_parent;
+ if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
+ parent->overmount = mnt;
hlist_add_head_rcu(&mnt->mnt_hash,
m_hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
@@ -1099,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
* @parent: the parent
* @mnt: the new mount
* @mp: the new mountpoint
- * @beneath: whether to mount @mnt beneath or on top of @parent
*
- * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
+ * Mount @mnt at @mp on @parent. Then attach @mnt
* to @parent's child mount list and to @mount_hashtable.
*
- * If @beneath is true, remove @mnt from its current parent and
- * mountpoint and mount it on @mp on @parent, and mount @parent on the
- * old parent and old mountpoint of @mnt. Finally, attach @parent to
- * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
- *
- * Note, when __attach_mnt() is called @mnt->mnt_parent already points
+ * Note, when make_visible() is called @mnt->mnt_parent already points
* to the correct parent.
*
* Context: This function expects namespace_lock() and lock_mount_hash()
* to have been acquired in that order.
*/
static void attach_mnt(struct mount *mnt, struct mount *parent,
- struct mountpoint *mp, bool beneath)
+ struct mountpoint *mp)
{
- if (beneath)
- mnt_set_mountpoint_beneath(mnt, parent, mp);
- else
- mnt_set_mountpoint(parent, mp, mnt);
- /*
- * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
- * beneath @parent then @mnt will need to be attached to
- * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
- * isn't the same mount as @parent.
- */
- __attach_mnt(mnt, mnt->mnt_parent);
+ mnt_set_mountpoint(parent, mp, mnt);
+ make_visible(mnt);
}
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
struct mountpoint *old_mp = mnt->mnt_mp;
- struct mount *old_parent = mnt->mnt_parent;
list_del_init(&mnt->mnt_child);
hlist_del_init(&mnt->mnt_mp_list);
hlist_del_init_rcu(&mnt->mnt_hash);
- attach_mnt(mnt, parent, mp, false);
+ attach_mnt(mnt, parent, mp);
- put_mountpoint(old_mp);
- mnt_add_count(old_parent, -1);
+ maybe_free_mountpoint(old_mp, &ex_mountpoints);
}
static inline struct mount *node_to_mount(struct rb_node *node)
@@ -1176,32 +1158,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
ns->mnt_first_node = &mnt->mnt_node;
rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts);
-}
-
-/*
- * vfsmount lock must be held for write
- */
-static void commit_tree(struct mount *mnt)
-{
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
- LIST_HEAD(head);
- struct mnt_namespace *n = parent->mnt_ns;
-
- BUG_ON(parent == mnt);
-
- list_add_tail(&head, &mnt->mnt_list);
- while (!list_empty(&head)) {
- m = list_first_entry(&head, typeof(*m), mnt_list);
- list_del(&m->mnt_list);
-
- mnt_add_to_ns(n, m);
- }
- n->nr_mounts += n->pending_mounts;
- n->pending_mounts = 0;
- __attach_mnt(mnt, parent);
- touch_mnt_namespace(n);
+ mnt_notify_add(mnt);
}
static struct mount *next_mnt(struct mount *p, struct mount *root)
@@ -1230,6 +1188,24 @@ static struct mount *skip_mnt_tree(struct mount *p)
return p;
}
+/*
+ * vfsmount lock must be held for write
+ */
+static void commit_tree(struct mount *mnt)
+{
+ struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
+
+ if (!mnt_ns_attached(mnt)) {
+ for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
+ mnt_add_to_ns(n, m);
+ n->nr_mounts += n->pending_mounts;
+ n->pending_mounts = 0;
+ }
+
+ make_visible(mnt);
+ touch_mnt_namespace(n);
+}
+
/**
* vfs_create_mount - Create a mount for a configured superblock
* @fc: The configuration context with the superblock attached
@@ -1246,7 +1222,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (!fc->root)
return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(fc->source ?: "none");
+ mnt = alloc_vfsmnt(fc->source);
if (!mnt)
return ERR_PTR(-ENOMEM);
@@ -1277,6 +1253,15 @@ struct vfsmount *fc_mount(struct fs_context *fc)
}
EXPORT_SYMBOL(fc_mount);
+struct vfsmount *fc_mount_longterm(struct fs_context *fc)
+{
+ struct vfsmount *mnt = fc_mount(fc);
+ if (!IS_ERR(mnt))
+ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+ return mnt;
+}
+EXPORT_SYMBOL(fc_mount_longterm);
+
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
int flags, const char *name,
void *data)
@@ -1307,21 +1292,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
-struct vfsmount *
-vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
- const char *name, void *data)
-{
- /* Until it is worked out how to pass the user namespace
- * through from the parent mount to the submount don't support
- * unprivileged mounts with submounts.
- */
- if (mountpoint->d_sb->s_user_ns != &init_user_ns)
- return ERR_PTR(-EPERM);
-
- return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
-}
-EXPORT_SYMBOL_GPL(vfs_submount);
-
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
int flag)
{
@@ -1333,7 +1303,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
if (!mnt)
return ERR_PTR(-ENOMEM);
- if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
+ mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
+ ~MNT_INTERNAL_FLAGS;
+
+ if (flag & (CL_SLAVE | CL_PRIVATE))
mnt->mnt_group_id = 0; /* not a peer of original */
else
mnt->mnt_group_id = old->mnt_group_id;
@@ -1344,8 +1317,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt.mnt_flags = old->mnt.mnt_flags;
- mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+ if (mnt->mnt_group_id)
+ set_mnt_shared(mnt);
atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1358,30 +1331,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
unlock_mount_hash();
- if ((flag & CL_SLAVE) ||
- ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
- list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+ if (flag & CL_PRIVATE) // we are done with it
+ return mnt;
+
+ if (peers(mnt, old))
+ list_add(&mnt->mnt_share, &old->mnt_share);
+
+ if ((flag & CL_SLAVE) && old->mnt_group_id) {
+ hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
mnt->mnt_master = old;
- CLEAR_MNT_SHARED(mnt);
- } else if (!(flag & CL_PRIVATE)) {
- if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
- list_add(&mnt->mnt_share, &old->mnt_share);
- if (IS_MNT_SLAVE(old))
- list_add(&mnt->mnt_slave, &old->mnt_slave);
+ } else if (IS_MNT_SLAVE(old)) {
+ hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);
mnt->mnt_master = old->mnt_master;
- } else {
- CLEAR_MNT_SHARED(mnt);
- }
- if (flag & CL_MAKE_SHARED)
- set_mnt_shared(mnt);
-
- /* stick the duplicate mount on the same expiry list
- * as the original if that was on one */
- if (flag & CL_EXPIRE) {
- if (!list_empty(&old->mnt_expire))
- list_add(&mnt->mnt_expire, &old->mnt_expire);
}
-
return mnt;
out_free:
@@ -1474,11 +1436,13 @@ static void mntput_no_expire(struct mount *mnt)
rcu_read_unlock();
list_del(&mnt->mnt_instance);
+ if (unlikely(!list_empty(&mnt->mnt_expire)))
+ list_del(&mnt->mnt_expire);
if (unlikely(!list_empty(&mnt->mnt_mounts))) {
struct mount *p, *tmp;
list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
- __put_mountpoint(unhash_mnt(p), &list);
+ __umount_mnt(p, &list);
hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
}
}
@@ -1675,23 +1639,19 @@ const struct seq_operations mounts_op = {
int may_umount_tree(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
- int actual_refs = 0;
- int minimum_refs = 0;
- struct mount *p;
- BUG_ON(!m);
+ bool busy = false;
/* write lock needed for mnt_get_count */
lock_mount_hash();
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- actual_refs += mnt_get_count(p);
- minimum_refs += 2;
+ for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
+ if (mnt_get_count(p) > (p == mnt ? 2 : 1)) {
+ busy = true;
+ break;
+ }
}
unlock_mount_hash();
- if (actual_refs > minimum_refs)
- return 0;
-
- return 1;
+ return !busy;
}
EXPORT_SYMBOL(may_umount_tree);
@@ -1723,17 +1683,80 @@ int may_umount(struct vfsmount *mnt)
EXPORT_SYMBOL(may_umount);
+#ifdef CONFIG_FSNOTIFY
+static void mnt_notify(struct mount *p)
+{
+ if (!p->prev_ns && p->mnt_ns) {
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ } else if (p->prev_ns && !p->mnt_ns) {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ } else if (p->prev_ns == p->mnt_ns) {
+ fsnotify_mnt_move(p->mnt_ns, &p->mnt);
+ } else {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ }
+ p->prev_ns = p->mnt_ns;
+}
+
+static void notify_mnt_list(void)
+{
+ struct mount *m, *tmp;
+ /*
+ * Notify about mounts that were added/reparented/detached/remain
+ * connected after unmount.
+ */
+ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
+ mnt_notify(m);
+ list_del_init(&m->to_notify);
+ }
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return !list_empty(&notify_list);
+}
+#else
+static void notify_mnt_list(void)
+{
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return false;
+}
+#endif
+
+static void free_mnt_ns(struct mnt_namespace *);
static void namespace_unlock(void)
{
struct hlist_head head;
struct hlist_node *p;
struct mount *m;
+ struct mnt_namespace *ns = emptied_ns;
LIST_HEAD(list);
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
+ emptied_ns = NULL;
- up_write(&namespace_sem);
+ if (need_notify_mnt_list()) {
+ /*
+ * No point blocking out concurrent readers while notifications
+ * are sent. This will also allow statmount()/listmount() to run
+ * concurrently.
+ */
+ downgrade_write(&namespace_sem);
+ notify_mnt_list();
+ up_read(&namespace_sem);
+ } else {
+ up_write(&namespace_sem);
+ }
+ if (unlikely(ns)) {
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+ free_mnt_ns(ns);
+ }
shrink_dentry_list(&list);
@@ -1753,6 +1776,8 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
+DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
+
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
@@ -1804,9 +1829,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT;
if (mnt_ns_attached(p))
- move_from_ns(p, &tmp_list);
- else
- list_move(&p->mnt_list, &tmp_list);
+ move_from_ns(p);
+ list_add_tail(&p->mnt_list, &tmp_list);
}
/* Hide the mounts from mnt_mounts */
@@ -1835,7 +1859,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
disconnect = disconnect_mount(p, how);
if (mnt_has_parent(p)) {
- mnt_add_count(p->mnt_parent, -1);
if (!disconnect) {
/* Don't forget about p */
list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
@@ -1846,6 +1869,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
+
+ /*
+ * At this point p->mnt_ns is NULL, notification will be queued
+ * only if
+ *
+ * - p->prev_ns is non-NULL *and*
+ * - p->prev_ns->n_fsnotify_marks is non-NULL
+ *
+ * This will preclude queuing the mount if this is a cleanup
+ * after a failed copy_tree() or destruction of an anonymous
+ * namespace, etc.
+ */
+ mnt_notify_add(p);
}
}
@@ -1899,7 +1935,7 @@ static int do_umount(struct mount *mnt, int flags)
* all race cases, but it's a slowpath.
*/
lock_mount_hash();
- if (mnt_get_count(mnt) != 2) {
+ if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
unlock_mount_hash();
return -EBUSY;
}
@@ -1945,22 +1981,27 @@ static int do_umount(struct mount *mnt, int flags)
namespace_lock();
lock_mount_hash();
- /* Recheck MNT_LOCKED with the locks held */
+ /* Repeat the earlier racy checks, now that we are holding the locks */
retval = -EINVAL;
+ if (!check_mnt(mnt))
+ goto out;
+
if (mnt->mnt.mnt_flags & MNT_LOCKED)
goto out;
+ if (!mnt_has_parent(mnt)) /* not the absolute root */
+ goto out;
+
event++;
if (flags & MNT_DETACH) {
- if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
- umount_tree(mnt, UMOUNT_PROPAGATE);
+ umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
+ smp_mb(); // paired with __legitimize_mnt()
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
- if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
- umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+ umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
@@ -1978,29 +2019,28 @@ out:
* detach_mounts allows lazily unmounting those mounts instead of
* leaking them.
*
- * The caller may hold dentry->d_inode->i_mutex.
+ * The caller may hold dentry->d_inode->i_rwsem.
*/
void __detach_mounts(struct dentry *dentry)
{
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
struct mount *mnt;
namespace_lock();
lock_mount_hash();
- mp = lookup_mountpoint(dentry);
- if (!mp)
+ if (!lookup_mountpoint(dentry, &mp))
goto out_unlock;
event++;
- while (!hlist_empty(&mp->m_list)) {
- mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
+ while (mp.node.next) {
+ mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
umount_mnt(mnt);
hlist_add_head(&mnt->mnt_umount, &unmounted);
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
- put_mountpoint(mp);
+ unpin_mountpoint(&mp);
out_unlock:
unlock_mount_hash();
namespace_unlock();
@@ -2026,6 +2066,7 @@ static void warn_mandlock(void)
static int can_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
+ struct super_block *sb = path->dentry->d_sb;
if (!may_mount())
return -EPERM;
@@ -2035,7 +2076,7 @@ static int can_umount(const struct path *path, int flags)
return -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
return -EINVAL;
- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -2145,16 +2186,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr
}
}
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
+{
+ if (!is_mnt_ns_file(dentry))
+ return NULL;
+
+ return to_mnt_ns(get_proc_ns(dentry->d_inode));
+}
+
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
* mount namespace loop?
*/
- struct mnt_namespace *mnt_ns;
- if (!is_mnt_ns_file(dentry))
+ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
+
+ if (!mnt_ns)
return false;
- mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
@@ -2175,7 +2224,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
return dst_mnt;
src_parent = src_root;
- dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
@@ -2210,8 +2258,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
if (IS_ERR(dst_mnt))
goto out;
lock_mount_hash();
- list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
- attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
+ if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
+ dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
+ if (unlikely(flag & CL_EXPIRE)) {
+ /* stick the duplicate mount on the same expiry
+ * list as the original if that was on one */
+ if (!list_empty(&src_mnt->mnt_expire))
+ list_add(&dst_mnt->mnt_expire,
+ &src_mnt->mnt_expire);
+ }
+ attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
unlock_mount_hash();
}
}
@@ -2226,54 +2282,97 @@ out:
return dst_mnt;
}
-/* Caller should check returned pointer for errors */
+static inline bool extend_array(struct path **res, struct path **to_free,
+ unsigned n, unsigned *count, unsigned new_count)
+{
+ struct path *p;
-struct vfsmount *collect_mounts(const struct path *path)
+ if (likely(n < *count))
+ return true;
+ p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
+ if (p && *count)
+ memcpy(p, *res, *count * sizeof(struct path));
+ *count = new_count;
+ kfree(*to_free);
+ *to_free = *res = p;
+ return p;
+}
+
+struct path *collect_paths(const struct path *path,
+ struct path *prealloc, unsigned count)
{
- struct mount *tree;
- namespace_lock();
- if (!check_mnt(real_mount(path->mnt)))
- tree = ERR_PTR(-EINVAL);
- else
- tree = copy_tree(real_mount(path->mnt), path->dentry,
- CL_COPY_ALL | CL_PRIVATE);
- namespace_unlock();
- if (IS_ERR(tree))
- return ERR_CAST(tree);
- return &tree->mnt;
+ struct mount *root = real_mount(path->mnt);
+ struct mount *child;
+ struct path *res = prealloc, *to_free = NULL;
+ unsigned n = 0;
+
+ guard(rwsem_read)(&namespace_sem);
+
+ if (!check_mnt(root))
+ return ERR_PTR(-EINVAL);
+ if (!extend_array(&res, &to_free, 0, &count, 32))
+ return ERR_PTR(-ENOMEM);
+ res[n++] = *path;
+ list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
+ if (!is_subdir(child->mnt_mountpoint, path->dentry))
+ continue;
+ for (struct mount *m = child; m; m = next_mnt(m, child)) {
+ if (!extend_array(&res, &to_free, n, &count, 2 * count))
+ return ERR_PTR(-ENOMEM);
+ res[n].mnt = &m->mnt;
+ res[n].dentry = m->mnt.mnt_root;
+ n++;
+ }
+ }
+ if (!extend_array(&res, &to_free, n, &count, count + 1))
+ return ERR_PTR(-ENOMEM);
+ memset(res + n, 0, (count - n) * sizeof(struct path));
+ for (struct path *p = res; p->mnt; p++)
+ path_get(p);
+ return res;
+}
+
+void drop_collected_paths(struct path *paths, struct path *prealloc)
+{
+ for (struct path *p = paths; p->mnt; p++)
+ path_put(p);
+ if (paths != prealloc)
+ kfree(paths);
}
-static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
void dissolve_on_fput(struct vfsmount *mnt)
{
- struct mnt_namespace *ns;
- namespace_lock();
- lock_mount_hash();
- ns = real_mount(mnt)->mnt_ns;
- if (ns) {
- if (is_anon_ns(ns))
- umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
- else
- ns = NULL;
+ struct mount *m = real_mount(mnt);
+
+ /*
+ * m used to be the root of anon namespace; if it still is one,
+ * we need to dissolve the mount tree and free that namespace.
+ * Let's try to avoid taking namespace_sem if we can determine
+ * that there's nothing to do without it - rcu_read_lock() is
+ * enough to make anon_ns_root() memory-safe and once m has
+ * left its namespace, it's no longer our concern, since it will
+ * never become a root of anon ns again.
+ */
+
+ scoped_guard(rcu) {
+ if (!anon_ns_root(m))
+ return;
}
- unlock_mount_hash();
- namespace_unlock();
- if (ns)
- free_mnt_ns(ns);
-}
-void drop_collected_mounts(struct vfsmount *mnt)
-{
- namespace_lock();
- lock_mount_hash();
- umount_tree(real_mount(mnt), 0);
- unlock_mount_hash();
- namespace_unlock();
+ scoped_guard(namespace_lock, &namespace_sem) {
+ if (!anon_ns_root(m))
+ return;
+
+ emptied_ns = m->mnt_ns;
+ lock_mount_hash();
+ umount_tree(m, UMOUNT_CONNECTED);
+ unlock_mount_hash();
+ }
}
-bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
{
struct mount *child;
@@ -2287,6 +2386,38 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+ bool res;
+
+ read_seqlock_excl(&mount_lock);
+ res = __has_locked_children(mnt, dentry);
+ read_sequnlock_excl(&mount_lock);
+ return res;
+}
+
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree. Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+ struct mount *p;
+ bool ret = false;
+
+ lock_mount_hash();
+ for (p = subtree; p; p = next_mnt(p, subtree))
+ if (mnt_ns_loop(p->mnt.mnt_root))
+ goto out;
+
+ ret = true;
+out:
+ unlock_mount_hash();
+ return ret;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -2295,6 +2426,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* will not be attached anywhere in the namespace and will be private (i.e.
* changes to the originating mount won't be propagated into this).
*
+ * This assumes caller has called or done the equivalent of may_mount().
+ *
* Release with mntput().
*/
struct vfsmount *clone_private_mount(const struct path *path)
@@ -2302,48 +2435,42 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- down_read(&namespace_sem);
+ guard(rwsem_read)(&namespace_sem);
+
if (IS_MNT_UNBINDABLE(old_mnt))
- goto invalid;
+ return ERR_PTR(-EINVAL);
- if (!check_mnt(old_mnt))
- goto invalid;
+ /*
+ * Make sure the source mount is acceptable.
+ * Anything mounted in our mount namespace is allowed.
+ * Otherwise, it must be the root of an anonymous mount
+ * namespace, and we need to make sure no namespace
+ * loops get created.
+ */
+ if (!check_mnt(old_mnt)) {
+ if (!anon_ns_root(old_mnt))
+ return ERR_PTR(-EINVAL);
- if (has_locked_children(old_mnt, path->dentry))
- goto invalid;
+ if (!check_for_nsfs_mounts(old_mnt))
+ return ERR_PTR(-EINVAL);
+ }
- new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
- up_read(&namespace_sem);
+ if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+ if (__has_locked_children(old_mnt, path->dentry))
+ return ERR_PTR(-EINVAL);
+
+ new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
if (IS_ERR(new_mnt))
- return ERR_CAST(new_mnt);
+ return ERR_PTR(-EINVAL);
/* Longterm mount to be removed by kern_unmount*() */
new_mnt->mnt_ns = MNT_NS_INTERNAL;
-
return &new_mnt->mnt;
-
-invalid:
- up_read(&namespace_sem);
- return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
-int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
- struct vfsmount *root)
-{
- struct mount *mnt;
- int res = f(root, arg);
- if (res)
- return res;
- list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
- res = f(&mnt->mnt, arg);
- if (res)
- return res;
- }
- return 0;
-}
-
static void lock_mnt_tree(struct mount *mnt)
{
struct mount *p;
@@ -2365,7 +2492,7 @@ static void lock_mnt_tree(struct mount *mnt)
if (flags & MNT_NOEXEC)
flags |= MNT_LOCK_NOEXEC;
/* Don't allow unprivileged users to reveal what is under a mount */
- if (list_empty(&p->mnt_expire))
+ if (list_empty(&p->mnt_expire) && p != mnt)
flags |= MNT_LOCKED;
p->mnt.mnt_flags = flags;
}
@@ -2386,7 +2513,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
struct mount *p;
for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
- if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
+ if (!p->mnt_group_id) {
int err = mnt_alloc_group_id(p);
if (err) {
cleanup_group_ids(mnt, p);
@@ -2422,16 +2549,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
}
enum mnt_tree_flags_t {
- MNT_TREE_MOVE = BIT(0),
- MNT_TREE_BENEATH = BIT(1),
+ MNT_TREE_BENEATH = BIT(0),
+ MNT_TREE_PROPAGATION = BIT(1),
};
/**
* attach_recursive_mnt - attach a source mount tree
* @source_mnt: mount tree to be attached
- * @top_mnt: mount that @source_mnt will be mounted on or mounted beneath
+ * @dest_mnt: mount that @source_mnt will be mounted on
* @dest_mp: the mountpoint @source_mnt will be mounted at
- * @flags: modify how @source_mnt is supposed to be attached
*
* NOTE: in the table below explains the semantics when a source mount
* of a given type is attached to a destination mount of a given type.
@@ -2494,26 +2620,31 @@ enum mnt_tree_flags_t {
* Otherwise a negative error code is returned.
*/
static int attach_recursive_mnt(struct mount *source_mnt,
- struct mount *top_mnt,
- struct mountpoint *dest_mp,
- enum mnt_tree_flags_t flags)
+ struct mount *dest_mnt,
+ struct mountpoint *dest_mp)
{
struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
HLIST_HEAD(tree_list);
- struct mnt_namespace *ns = top_mnt->mnt_ns;
- struct mountpoint *smp;
- struct mount *child, *dest_mnt, *p;
+ struct mnt_namespace *ns = dest_mnt->mnt_ns;
+ struct pinned_mountpoint root = {};
+ struct mountpoint *shorter = NULL;
+ struct mount *child, *p;
+ struct mount *top;
struct hlist_node *n;
int err = 0;
- bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
+ bool moving = mnt_has_parent(source_mnt);
/*
* Preallocate a mountpoint in case the new mounts need to be
* mounted beneath mounts on the same mountpoint.
*/
- smp = get_mountpoint(source_mnt->mnt.mnt_root);
- if (IS_ERR(smp))
- return PTR_ERR(smp);
+ for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
+ if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
+ shorter = top->mnt_mp;
+ }
+ err = get_mountpoint(top->mnt.mnt_root, &root);
+ if (err)
+ return err;
/* Is there space to add these mounts to the mount namespace? */
if (!moving) {
@@ -2522,11 +2653,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
goto out;
}
- if (beneath)
- dest_mnt = top_mnt->mnt_parent;
- else
- dest_mnt = top_mnt;
-
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
@@ -2543,41 +2669,50 @@ static int attach_recursive_mnt(struct mount *source_mnt,
}
if (moving) {
- if (beneath)
- dest_mp = smp;
- unhash_mnt(source_mnt);
- attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
- touch_mnt_namespace(source_mnt->mnt_ns);
+ umount_mnt(source_mnt);
+ mnt_notify_add(source_mnt);
+ /* if the mount is moved, it should no longer be expired
+ * automatically */
+ list_del_init(&source_mnt->mnt_expire);
} else {
if (source_mnt->mnt_ns) {
- LIST_HEAD(head);
-
/* move from anon - the caller will destroy */
+ emptied_ns = source_mnt->mnt_ns;
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
- move_from_ns(p, &head);
- list_del_init(&head);
+ move_from_ns(p);
}
- if (beneath)
- mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
- else
- mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
- commit_tree(source_mnt);
}
+ mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+ /*
+ * Now the original copy is in the same state as the secondaries -
+ * its root attached to mountpoint, but not hashed and all mounts
+ * in it are either in our namespace or in no namespace at all.
+ * Add the original to the list of copies and deal with the
+ * rest of work for all of them uniformly.
+ */
+ hlist_add_head(&source_mnt->mnt_hash, &tree_list);
+
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
struct mount *q;
hlist_del_init(&child->mnt_hash);
- q = __lookup_mnt(&child->mnt_parent->mnt,
- child->mnt_mountpoint);
- if (q)
- mnt_change_mountpoint(child, smp, q);
/* Notice when we are propagating across user namespaces */
if (child->mnt_parent->mnt_ns->user_ns != user_ns)
lock_mnt_tree(child);
- child->mnt.mnt_flags &= ~MNT_LOCKED;
+ q = __lookup_mnt(&child->mnt_parent->mnt,
+ child->mnt_mountpoint);
commit_tree(child);
+ if (q) {
+ struct mountpoint *mp = root.mp;
+ struct mount *r = child;
+ while (unlikely(r->overmount))
+ r = r->overmount;
+ if (unlikely(shorter) && child != source_mnt)
+ mp = shorter;
+ mnt_change_mountpoint(r, mp, q);
+ }
}
- put_mountpoint(smp);
+ unpin_mountpoint(&root);
unlock_mount_hash();
return 0;
@@ -2594,7 +2729,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
ns->pending_mounts = 0;
read_seqlock_excl(&mount_lock);
- put_mountpoint(smp);
+ unpin_mountpoint(&root);
read_sequnlock_excl(&mount_lock);
return err;
@@ -2634,79 +2769,82 @@ static int attach_recursive_mnt(struct mount *source_mnt,
* Return: Either the target mountpoint on the top mount or the top
* mount's mountpoint.
*/
-static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
+static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
{
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
- struct mountpoint *mp = ERR_PTR(-ENOENT);
+ struct path under = {};
+ int err = -ENOENT;
for (;;) {
- struct mount *m;
+ struct mount *m = real_mount(mnt);
if (beneath) {
- m = real_mount(mnt);
+ path_put(&under);
read_seqlock_excl(&mount_lock);
- dentry = dget(m->mnt_mountpoint);
+ under.mnt = mntget(&m->mnt_parent->mnt);
+ under.dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
+ dentry = under.dentry;
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- goto out;
- }
-
namespace_lock();
- if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+ if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+ break; // not to be mounted on
+
+ if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+ &m->mnt_parent->mnt != under.mnt)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- goto out;
+ continue; // got moved
}
mnt = lookup_mnt(path);
- if (likely(!mnt))
+ if (unlikely(mnt)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ continue; // got overmounted
+ }
+ err = get_mountpoint(dentry, pinned);
+ if (err)
break;
-
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- dput(dentry);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- }
-
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
+ if (beneath) {
+ /*
+ * @under duplicates the references that will stay
+ * at least until namespace_unlock(), so the path_put()
+ * below is safe (and OK to do under namespace_lock -
+ * we are not dropping the final references here).
+ */
+ path_put(&under);
+ }
+ return 0;
}
-
-out:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
if (beneath)
- dput(dentry);
-
- return mp;
+ path_put(&under);
+ return err;
}
-static inline struct mountpoint *lock_mount(struct path *path)
+static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
{
- return do_lock_mount(path, false);
+ return do_lock_mount(path, m, false);
}
-static void unlock_mount(struct mountpoint *where)
+static void unlock_mount(struct pinned_mountpoint *m)
{
- struct dentry *dentry = where->m_dentry;
-
+ inode_unlock(m->mp->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
- put_mountpoint(where);
+ unpin_mountpoint(m);
read_sequnlock_excl(&mount_lock);
-
namespace_unlock();
- inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -2718,7 +2856,20 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
d_is_dir(mnt->mnt.mnt_root))
return -ENOTDIR;
- return attach_recursive_mnt(mnt, p, mp, 0);
+ return attach_recursive_mnt(mnt, p, mp);
+}
+
+static int may_change_propagation(const struct mount *m)
+{
+ struct mnt_namespace *ns = m->mnt_ns;
+
+ // it must be mounted in some namespace
+ if (IS_ERR_OR_NULL(ns)) // is_mounted()
+ return -EINVAL;
+ // and the caller must be admin in userns of that namespace
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+ return 0;
}
/*
@@ -2757,49 +2908,106 @@ static int do_change_type(struct path *path, int ms_flags)
return -EINVAL;
namespace_lock();
+ err = may_change_propagation(mnt);
+ if (err)
+ goto out_unlock;
+
if (type == MS_SHARED) {
err = invent_group_ids(mnt, recurse);
if (err)
goto out_unlock;
}
- lock_mount_hash();
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- unlock_mount_hash();
out_unlock:
namespace_unlock();
return err;
}
+/* may_copy_tree() - check if a mount tree can be copied
+ * @path: path to the mount tree to be copied
+ *
+ * This helper checks if the caller may copy the mount tree starting
+ * from @path->mnt. The caller may copy the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller tries to copy an nfs mount referring to a mount
+ * namespace, i.e., the caller is trying to copy a mount namespace
+ * entry from nsfs.
+ * (3) The caller tries to copy a pidfs mount referring to a pidfd.
+ * (4) The caller is trying to copy a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * This is not strictly necessary. The current semantics of the new
+ * mount api enforce that the caller must be located in the same
+ * mount namespace as the mount tree it interacts with. Using the
+ * origin sequence number preserves these semantics even for
+ * anonymous mount namespaces. However, one could envision extending
+ * the api to directly operate across mount namespace if needed.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace.
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to copy the mount tree.
+ *
+ * Returns true if the mount tree can be copied, false otherwise.
+ */
+static inline bool may_copy_tree(struct path *path)
+{
+ struct mount *mnt = real_mount(path->mnt);
+ const struct dentry_operations *d_op;
+
+ if (check_mnt(mnt))
+ return true;
+
+ d_op = path->dentry->d_op;
+ if (d_op == &ns_dentry_operations)
+ return true;
+
+ if (d_op == &pidfs_dentry_operations)
+ return true;
+
+ if (!is_mounted(path->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
+
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
- struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+ struct mount *old = real_mount(old_path->mnt);
if (IS_MNT_UNBINDABLE(old))
- return mnt;
-
- if (!check_mnt(old)) {
- const struct dentry_operations *d_op = old_path->dentry->d_op;
+ return ERR_PTR(-EINVAL);
- if (d_op != &ns_dentry_operations &&
- d_op != &pidfs_dentry_operations)
- return mnt;
- }
+ if (!may_copy_tree(old_path))
+ return ERR_PTR(-EINVAL);
- if (!recurse && has_locked_children(old, old_path->dentry))
- return mnt;
+ if (!recurse && __has_locked_children(old, old_path->dentry))
+ return ERR_PTR(-EINVAL);
if (recurse)
- mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+ return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
else
- mnt = clone_mnt(old, old_path->dentry, 0);
-
- if (!IS_ERR(mnt))
- mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
- return mnt;
+ return clone_mnt(old, old_path->dentry, 0);
}
/*
@@ -2810,7 +3018,7 @@ static int do_loopback(struct path *path, const char *old_name,
{
struct path old_path;
struct mount *mnt = NULL, *parent;
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
int err;
if (!old_name || !*old_name)
return -EINVAL;
@@ -2822,11 +3030,9 @@ static int do_loopback(struct path *path, const char *old_name,
if (mnt_ns_loop(old_path.dentry))
goto out;
- mp = lock_mount(path);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
+ err = lock_mount(path, &mp);
+ if (err)
goto out;
- }
parent = real_mount(path->mnt);
if (!check_mnt(parent))
@@ -2838,14 +3044,14 @@ static int do_loopback(struct path *path, const char *old_name,
goto out2;
}
- err = graft_tree(mnt, parent, mp);
+ err = graft_tree(mnt, parent, mp.mp);
if (err) {
lock_mount_hash();
umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
out2:
- unlock_mount(mp);
+ unlock_mount(&mp);
out:
path_put(&old_path);
return err;
@@ -2853,15 +3059,30 @@ out:
static struct file *open_detached_copy(struct path *path, bool recursive)
{
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
- struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+ struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
+ struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
struct file *file;
+ ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
return ERR_CAST(ns);
namespace_lock();
+
+ /*
+ * Record the sequence number of the source mount namespace.
+ * This needs to hold namespace_sem to ensure that the mount
+ * doesn't get attached.
+ */
+ if (is_mounted(path->mnt)) {
+ src_mnt_ns = real_mount(path->mnt)->mnt_ns;
+ if (is_anon_ns(src_mnt_ns))
+ ns->seq_origin = src_mnt_ns->seq_origin;
+ else
+ ns->seq_origin = src_mnt_ns->seq;
+ }
+
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
namespace_unlock();
@@ -2889,24 +3110,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
return file;
}
-SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
- struct file *file;
- struct path path;
+ int ret;
+ struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
- int error;
- int fd;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
@@ -2916,27 +3135,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
- return -EPERM;
+ return ERR_PTR(-EPERM);
+
+ ret = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ if (detached)
+ return open_detached_copy(&path, flags & AT_RECURSIVE);
+
+ return dentry_open(&path, O_PATH, current_cred());
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+ int fd;
+ struct file *file __free(fput) = NULL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (unlikely(error)) {
- file = ERR_PTR(error);
- } else {
- if (detached)
- file = open_detached_copy(&path, flags & AT_RECURSIVE);
- else
- file = dentry_open(&path, O_PATH, current_cred());
- path_put(&path);
- }
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
- fd_install(fd, file);
+ fd_install(fd, no_free_ptr(file));
return fd;
}
@@ -3123,28 +3347,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-/*
- * Check that there aren't references to earlier/same mount namespaces in the
- * specified subtree. Such references can act as pins for mount namespaces
- * that aren't checked by the mount-cycle checking code, thereby allowing
- * cycles to be made.
- */
-static bool check_for_nsfs_mounts(struct mount *subtree)
-{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
- if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
-}
-
static int do_set_group(struct path *from_path, struct path *to_path)
{
struct mount *from, *to;
@@ -3155,18 +3357,11 @@ static int do_set_group(struct path *from_path, struct path *to_path)
namespace_lock();
- err = -EINVAL;
- /* To and From must be mounted */
- if (!is_mounted(&from->mnt))
- goto out;
- if (!is_mounted(&to->mnt))
- goto out;
-
- err = -EPERM;
- /* We should be allowed to modify mount namespaces of both mounts */
- if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ err = may_change_propagation(from);
+ if (err)
goto out;
- if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+ err = may_change_propagation(to);
+ if (err)
goto out;
err = -EINVAL;
@@ -3185,7 +3380,7 @@ static int do_set_group(struct path *from_path, struct path *to_path)
goto out;
/* From mount should not have locked children in place of To's root */
- if (has_locked_children(from, to->mnt.mnt_root))
+ if (__has_locked_children(from, to->mnt.mnt_root))
goto out;
/* Setting sharing groups is only allowed on private mounts */
@@ -3197,18 +3392,14 @@ static int do_set_group(struct path *from_path, struct path *to_path)
goto out;
if (IS_MNT_SLAVE(from)) {
- struct mount *m = from->mnt_master;
-
- list_add(&to->mnt_slave, &m->mnt_slave_list);
- to->mnt_master = m;
+ hlist_add_behind(&to->mnt_slave, &from->mnt_slave);
+ to->mnt_master = from->mnt_master;
}
if (IS_MNT_SHARED(from)) {
to->mnt_group_id = from->mnt_group_id;
list_add(&to->mnt_share, &from->mnt_share);
- lock_mount_hash();
set_mnt_shared(to);
- unlock_mount_hash();
}
err = 0;
@@ -3224,18 +3415,36 @@ out:
* Check if path is overmounted, i.e., if there's a mount on top of
* @path->mnt with @path->dentry as mountpoint.
*
- * Context: This function expects namespace_lock() to be held.
+ * Context: namespace_sem must be held at least shared.
+ * MUST NOT be called under lock_mount_hash() (there one should just
+ * call __lookup_mnt() and check if it returns NULL).
* Return: If path is overmounted true is returned, false if not.
*/
static inline bool path_overmounted(const struct path *path)
{
+ unsigned seq = read_seqbegin(&mount_lock);
+ bool no_child;
+
rcu_read_lock();
- if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
- rcu_read_unlock();
- return true;
- }
+ no_child = !__lookup_mnt(path->mnt, path->dentry);
rcu_read_unlock();
- return false;
+ if (need_seqretry(&mount_lock, seq)) {
+ read_seqlock_excl(&mount_lock);
+ no_child = !__lookup_mnt(path->mnt, path->dentry);
+ read_sequnlock_excl(&mount_lock);
+ }
+ return unlikely(!no_child);
+}
+
+/*
+ * Check if there is a possibly empty chain of descent from p1 to p2.
+ * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl).
+ */
+static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2)
+{
+ while (p2 != p1 && mnt_has_parent(p2))
+ p2 = p2->mnt_parent;
+ return p2 == p1;
}
/**
@@ -3289,9 +3498,8 @@ static int can_move_mount_beneath(const struct path *from,
if (parent_mnt_to == current->nsproxy->mnt_ns->root)
return -EINVAL;
- for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
- if (p == mnt_to)
- return -EINVAL;
+ if (mount_is_ancestor(mnt_to, mnt_from))
+ return -EINVAL;
/*
* If the parent mount propagates to the child mount this would
@@ -3314,52 +3522,114 @@ static int can_move_mount_beneath(const struct path *from,
* @mnt_from itself. This defeats the whole purpose of mounting
* @mnt_from beneath @mnt_to.
*/
- if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
+ if (check_mnt(mnt_from) &&
+ propagation_would_overmount(parent_mnt_to, mnt_from, mp))
return -EINVAL;
return 0;
}
-static int do_move_mount(struct path *old_path, struct path *new_path,
- bool beneath)
+/* may_use_mount() - check if a mount tree can be used
+ * @mnt: vfsmount to be used
+ *
+ * This helper checks if the caller may use the mount tree starting
+ * from @path->mnt. The caller may use the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller is trying to use a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace.
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to use the mount tree.
+ *
+ * Returns true if the mount tree can be used, false otherwise.
+ */
+static inline bool may_use_mount(struct mount *mnt)
+{
+ if (check_mnt(mnt))
+ return true;
+
+ /*
+ * Make sure that noone unmounted the target path or somehow
+ * managed to get their hands on something purely kernel
+ * internal.
+ */
+ if (!is_mounted(&mnt->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
+static int do_move_mount(struct path *old_path,
+ struct path *new_path, enum mnt_tree_flags_t flags)
{
struct mnt_namespace *ns;
struct mount *p;
struct mount *old;
struct mount *parent;
- struct mountpoint *mp, *old_mp;
+ struct pinned_mountpoint mp;
int err;
- bool attached;
- enum mnt_tree_flags_t flags = 0;
+ bool beneath = flags & MNT_TREE_BENEATH;
- mp = do_lock_mount(new_path, beneath);
- if (IS_ERR(mp))
- return PTR_ERR(mp);
+ err = do_lock_mount(new_path, &mp, beneath);
+ if (err)
+ return err;
old = real_mount(old_path->mnt);
p = real_mount(new_path->mnt);
parent = old->mnt_parent;
- attached = mnt_has_parent(old);
- if (attached)
- flags |= MNT_TREE_MOVE;
- old_mp = old->mnt_mp;
ns = old->mnt_ns;
err = -EINVAL;
- /* The mountpoint must be in our namespace. */
- if (!check_mnt(p))
- goto out;
-
- /* The thing moved must be mounted... */
- if (!is_mounted(&old->mnt))
- goto out;
-
- /* ... and either ours or the root of anon namespace */
- if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
- goto out;
- if (old->mnt.mnt_flags & MNT_LOCKED)
- goto out;
+ if (check_mnt(old)) {
+ /* if the source is in our namespace... */
+ /* ... it should be detachable from parent */
+ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old))
+ goto out;
+ /* ... and the target should be in our namespace */
+ if (!check_mnt(p))
+ goto out;
+ /* parent of the source should not be shared */
+ if (IS_MNT_SHARED(parent))
+ goto out;
+ } else {
+ /*
+ * otherwise the source must be the root of some anon namespace.
+ */
+ if (!anon_ns_root(old))
+ goto out;
+ /*
+ * Bail out early if the target is within the same namespace -
+ * subsequent checks would've rejected that, but they lose
+ * some corner cases if we check it early.
+ */
+ if (ns == p->mnt_ns)
+ goto out;
+ /*
+ * Target should be either in our namespace or in an acceptable
+ * anon namespace, sensu check_anonymous_mnt().
+ */
+ if (!may_use_mount(p))
+ goto out;
+ }
if (!path_mounted(old_path))
goto out;
@@ -3367,20 +3637,14 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (d_is_dir(new_path->dentry) !=
d_is_dir(old_path->dentry))
goto out;
- /*
- * Don't move a mount residing in a shared parent.
- */
- if (attached && IS_MNT_SHARED(parent))
- goto out;
if (beneath) {
- err = can_move_mount_beneath(old_path, new_path, mp);
+ err = can_move_mount_beneath(old_path, new_path, mp.mp);
if (err)
goto out;
err = -EINVAL;
p = p->mnt_parent;
- flags |= MNT_TREE_BENEATH;
}
/*
@@ -3392,27 +3656,12 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
err = -ELOOP;
if (!check_for_nsfs_mounts(old))
goto out;
- for (; mnt_has_parent(p); p = p->mnt_parent)
- if (p == old)
- goto out;
-
- err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
- if (err)
+ if (mount_is_ancestor(old, p))
goto out;
- /* if the mount is moved, it should no longer be expire
- * automatically */
- list_del_init(&old->mnt_expire);
- if (attached)
- put_mountpoint(old_mp);
+ err = attach_recursive_mnt(old, p, mp.mp);
out:
- unlock_mount(mp);
- if (!err) {
- if (attached)
- mntput_no_expire(parent);
- else
- free_mnt_ns(ns);
- }
+ unlock_mount(&mp);
return err;
}
@@ -3428,7 +3677,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, false);
+ err = do_move_mount(&old_path, path, 0);
path_put(&old_path);
return err;
}
@@ -3473,7 +3722,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
unsigned int mnt_flags)
{
struct vfsmount *mnt;
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
struct super_block *sb = fc->root->d_sb;
int error;
@@ -3494,13 +3743,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
mnt_warn_timestamp_expiry(mountpoint, mnt);
- mp = lock_mount(mountpoint);
- if (IS_ERR(mp)) {
- mntput(mnt);
- return PTR_ERR(mp);
+ error = lock_mount(mountpoint, &mp);
+ if (!error) {
+ error = do_add_mount(real_mount(mnt), mp.mp,
+ mountpoint, mnt_flags);
+ unlock_mount(&mp);
}
- error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
- unlock_mount(mp);
if (error < 0)
mntput(mnt);
return error;
@@ -3568,7 +3816,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
int finish_automount(struct vfsmount *m, const struct path *path)
{
struct dentry *dentry = path->dentry;
- struct mountpoint *mp;
+ struct pinned_mountpoint mp = {};
struct mount *mnt;
int err;
@@ -3578,10 +3826,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
return PTR_ERR(m);
mnt = real_mount(m);
- /* The new mount record should have at least 2 refs to prevent it being
- * expired before we get a chance to add it
- */
- BUG_ON(mnt_get_count(mnt) < 2);
if (m->mnt_sb == path->mnt->mnt_sb &&
m->mnt_root == dentry) {
@@ -3604,30 +3848,21 @@ int finish_automount(struct vfsmount *m, const struct path *path)
err = 0;
goto discard_locked;
}
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- err = PTR_ERR(mp);
+ err = get_mountpoint(dentry, &mp);
+ if (err)
goto discard_locked;
- }
- err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
- unlock_mount(mp);
+ err = do_add_mount(mnt, mp.mp, path,
+ path->mnt->mnt_flags | MNT_SHRINKABLE);
+ unlock_mount(&mp);
if (unlikely(err))
goto discard;
- mntput(m);
return 0;
discard_locked:
namespace_unlock();
inode_unlock(dentry->d_inode);
discard:
- /* remove m from any expiration list it may be on */
- if (!list_empty(&mnt->mnt_expire)) {
- namespace_lock();
- list_del_init(&mnt->mnt_expire);
- namespace_unlock();
- }
- mntput(m);
mntput(m);
return err;
}
@@ -3639,11 +3874,9 @@ discard:
*/
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
- namespace_lock();
-
+ read_seqlock_excl(&mount_lock);
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
-
- namespace_unlock();
+ read_sequnlock_excl(&mount_lock);
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -3665,11 +3898,14 @@ void mark_mounts_for_expiry(struct list_head *mounts)
/* extract from the expiration list every vfsmount that matches the
* following criteria:
+ * - already mounted
* - only referenced by its parent vfsmount
* - still marked for expiry (marked on the last call here; marks are
* cleared by mntput())
*/
list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
+ if (!is_mounted(&mnt->mnt))
+ continue;
if (!xchg(&mnt->mnt_expiry_mark, 1) ||
propagate_mount_busy(mnt, 1))
continue;
@@ -3994,7 +4230,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
- copy_flags |= CL_SHARED_TO_SLAVE;
+ copy_flags |= CL_SLAVE;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
@@ -4269,6 +4505,21 @@ err_unlock:
return ret;
}
+static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+ enum mnt_tree_flags_t mflags)
+{
+ int ret;
+
+ ret = security_move_mount(from_path, to_path);
+ if (ret)
+ return ret;
+
+ if (mflags & MNT_TREE_PROPAGATION)
+ return do_set_group(from_path, to_path);
+
+ return do_move_mount(from_path, to_path, mflags);
+}
+
/*
* Move a mount from one place to another. In combination with
* fsopen()/fsmount() this is used to install a new mount and in combination
@@ -4282,8 +4533,12 @@ SYSCALL_DEFINE5(move_mount,
int, to_dfd, const char __user *, to_pathname,
unsigned int, flags)
{
- struct path from_path, to_path;
- unsigned int lflags;
+ struct path to_path __free(path_put) = {};
+ struct path from_path __free(path_put) = {};
+ struct filename *to_name __free(putname) = NULL;
+ struct filename *from_name __free(putname) = NULL;
+ unsigned int lflags, uflags;
+ enum mnt_tree_flags_t mflags = 0;
int ret = 0;
if (!may_mount())
@@ -4296,43 +4551,61 @@ SYSCALL_DEFINE5(move_mount,
(MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
return -EINVAL;
- /* If someone gives a pathname, they aren't permitted to move
- * from an fd that requires unmount as we can't get at the flag
- * to clear it afterwards.
- */
- lflags = 0;
- if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
- if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+ if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
+ if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
- ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
- if (ret < 0)
- return ret;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH)
+ uflags = AT_EMPTY_PATH;
- lflags = 0;
- if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
- if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+ to_name = getname_maybe_null(to_pathname, uflags);
+ if (IS_ERR(to_name))
+ return PTR_ERR(to_name);
- ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
- if (ret < 0)
- goto out_from;
+ if (!to_name && to_dfd >= 0) {
+ CLASS(fd_raw, f_to)(to_dfd);
+ if (fd_empty(f_to))
+ return -EBADF;
- ret = security_move_mount(&from_path, &to_path);
- if (ret < 0)
- goto out_to;
+ to_path = fd_file(f_to)->f_path;
+ path_get(&to_path);
+ } else {
+ lflags = 0;
+ if (flags & MOVE_MOUNT_T_SYMLINKS)
+ lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
+ lflags |= LOOKUP_AUTOMOUNT;
+ ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
+ if (ret)
+ return ret;
+ }
- if (flags & MOVE_MOUNT_SET_GROUP)
- ret = do_set_group(&from_path, &to_path);
- else
- ret = do_move_mount(&from_path, &to_path,
- (flags & MOVE_MOUNT_BENEATH));
+ uflags = 0;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH)
+ uflags = AT_EMPTY_PATH;
-out_to:
- path_put(&to_path);
-out_from:
- path_put(&from_path);
- return ret;
+ from_name = getname_maybe_null(from_pathname, uflags);
+ if (IS_ERR(from_name))
+ return PTR_ERR(from_name);
+
+ if (!from_name && from_dfd >= 0) {
+ CLASS(fd_raw, f_from)(from_dfd);
+ if (fd_empty(f_from))
+ return -EBADF;
+
+ return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
+ }
+
+ lflags = 0;
+ if (flags & MOVE_MOUNT_F_SYMLINKS)
+ lflags |= LOOKUP_FOLLOW;
+ if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
+ lflags |= LOOKUP_AUTOMOUNT;
+ ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
+ if (ret)
+ return ret;
+
+ return vfs_move_mount(&from_path, &to_path, mflags);
}
/*
@@ -4390,7 +4663,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
{
struct path new, old, root;
struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
- struct mountpoint *old_mp, *root_mp;
+ struct pinned_mountpoint old_mp = {};
int error;
if (!may_mount())
@@ -4411,9 +4684,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out2;
get_fs_root(current->fs, &root);
- old_mp = lock_mount(&old);
- error = PTR_ERR(old_mp);
- if (IS_ERR(old_mp))
+ error = lock_mount(&old, &old_mp);
+ if (error)
goto out3;
error = -EINVAL;
@@ -4440,11 +4712,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (!path_mounted(&root))
goto out4; /* not a mountpoint */
if (!mnt_has_parent(root_mnt))
- goto out4; /* not attached */
+ goto out4; /* absolute root */
if (!path_mounted(&new))
goto out4; /* not a mountpoint */
if (!mnt_has_parent(new_mnt))
- goto out4; /* not attached */
+ goto out4; /* absolute root */
/* make sure we can reach put_old from new_root */
if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
@@ -4453,27 +4725,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
goto out4;
lock_mount_hash();
umount_mnt(new_mnt);
- root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
new_mnt->mnt.mnt_flags |= MNT_LOCKED;
root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
}
- /* mount old root on put_old */
- attach_mnt(root_mnt, old_mnt, old_mp, false);
/* mount new_root on / */
- attach_mnt(new_mnt, root_parent, root_mp, false);
- mnt_add_count(root_parent, -1);
+ attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp);
+ umount_mnt(root_mnt);
+ /* mount old root on put_old */
+ attach_mnt(root_mnt, old_mnt, old_mp.mp);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
- put_mountpoint(root_mp);
unlock_mount_hash();
+ mnt_notify_add(root_mnt);
+ mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
error = 0;
out4:
- unlock_mount(old_mp);
- if (!error)
- mntput_no_expire(ex_parent);
+ unlock_mount(&old_mp);
out3:
path_put(&root);
out2:
@@ -4512,11 +4782,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return -EINVAL;
/*
- * Once a mount has been idmapped we don't allow it to change its
- * mapping. It makes things simpler and callers can just create
- * another bind-mount they can idmap if they want to.
+ * We only allow an mount to change it's idmapping if it has
+ * never been accessible to userspace.
*/
- if (is_idmapped_mnt(m))
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
@@ -4576,7 +4845,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
break;
}
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
return 0;
}
@@ -4606,18 +4875,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
+ struct mnt_idmap *old_idmap;
+
if (!kattr->mnt_idmap)
return;
- /*
- * Pairs with smp_load_acquire() in mnt_idmap().
- *
- * Since we only allow a mount to change the idmapping once and
- * verified this in can_idmap_mount() we know that the mount has
- * @nop_mnt_idmap attached to it. So there's no need to drop any
- * references.
- */
+ old_idmap = mnt_idmap(&mnt->mnt);
+
+ /* Pairs with smp_load_acquire() in mnt_idmap(). */
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
+ mnt_idmap_put(old_idmap);
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4637,7 +4904,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
break;
}
touch_mnt_namespace(mnt->mnt_ns);
@@ -4667,7 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
*/
namespace_lock();
if (kattr->propagation == MS_SHARED) {
- err = invent_group_ids(mnt, kattr->recurse);
+ err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
if (err) {
namespace_unlock();
return err;
@@ -4678,22 +4945,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
err = -EINVAL;
lock_mount_hash();
- /* Ensure that this isn't anything purely vfs internal. */
- if (!is_mounted(&mnt->mnt))
- goto out;
-
- /*
- * If this is an attached mount make sure it's located in the callers
- * mount namespace. If it's not don't let the caller interact with it.
- *
- * If this mount doesn't have a parent it's most often simply a
- * detached mount with an anonymous mount namespace. IOW, something
- * that's simply not attached yet. But there are apparently also users
- * that do change mount properties on the rootfs itself. That obviously
- * neither has a parent nor is it a detached mount so we cannot
- * unconditionally check for detached mounts.
- */
- if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
+ if (!anon_ns_root(mnt) && !check_mnt(mnt))
goto out;
/*
@@ -4718,7 +4970,7 @@ out:
}
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
struct ns_common *ns;
struct user_namespace *mnt_userns;
@@ -4726,13 +4978,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
- /*
- * We currently do not support clearing an idmapped mount. If this ever
- * is a use-case we can revisit this but for now let's keep it simple
- * and not allow it.
- */
- if (attr->attr_clr & MOUNT_ATTR_IDMAP)
- return -EINVAL;
+ if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
+ /*
+ * We can only remove an idmapping if it's never been
+ * exposed to userspace.
+ */
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
+ return -EINVAL;
+
+ /*
+ * Removal of idmappings is equivalent to setting
+ * nop_mnt_idmap.
+ */
+ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
+ kattr->mnt_idmap = &nop_mnt_idmap;
+ return 0;
+ }
+ }
if (attr->userns_fd > INT_MAX)
return -EINVAL;
@@ -4769,22 +5031,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
- unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-
- if (flags & AT_NO_AUTOMOUNT)
- lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_SYMLINK_NOFOLLOW)
- lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
- *kattr = (struct mount_kattr) {
- .lookup_flags = lookup_flags,
- .recurse = !!(flags & AT_RECURSIVE),
- };
-
if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
return -EINVAL;
if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
@@ -4832,35 +5080,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
return -EINVAL;
}
- return build_mount_idmapped(attr, usize, kattr, flags);
+ return build_mount_idmapped(attr, usize, kattr);
}
static void finish_mount_kattr(struct mount_kattr *kattr)
{
- put_user_ns(kattr->mnt_userns);
- kattr->mnt_userns = NULL;
+ if (kattr->mnt_userns) {
+ put_user_ns(kattr->mnt_userns);
+ kattr->mnt_userns = NULL;
+ }
if (kattr->mnt_idmap)
mnt_idmap_put(kattr->mnt_idmap);
}
-SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
- unsigned int, flags, struct mount_attr __user *, uattr,
- size_t, usize)
+static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
- int err;
- struct path target;
+ int ret;
struct mount_attr attr;
- struct mount_kattr kattr;
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
- if (flags & ~(AT_EMPTY_PATH |
- AT_RECURSIVE |
- AT_SYMLINK_NOFOLLOW |
- AT_NO_AUTOMOUNT))
- return -EINVAL;
-
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
@@ -4869,18 +5110,54 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (!may_mount())
return -EPERM;
- err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
- if (err)
- return err;
+ ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+ if (ret)
+ return ret;
/* Don't bother walking through the mounts if this is a nop. */
if (attr.attr_set == 0 &&
attr.attr_clr == 0 &&
attr.propagation == 0)
- return 0;
+ return 0; /* Tell caller to not bother. */
- err = build_mount_kattr(&attr, usize, &kattr, flags);
- if (err)
+ ret = build_mount_kattr(&attr, usize, kattr);
+ if (ret < 0)
+ return ret;
+
+ return 1;
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+ unsigned int, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ int err;
+ struct path target;
+ struct mount_kattr kattr;
+ unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+ if (flags & ~(AT_EMPTY_PATH |
+ AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW |
+ AT_NO_AUTOMOUNT))
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ kattr = (struct mount_kattr) {
+ .lookup_flags = lookup_flags,
+ };
+
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ err = wants_mount_setattr(uattr, usize, &kattr);
+ if (err <= 0)
return err;
err = user_path_at(dfd, path, kattr.lookup_flags, &target);
@@ -4892,6 +5169,46 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
+ unsigned, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ struct file __free(fput) *file = NULL;
+ int fd;
+
+ if (!uattr && usize)
+ return -EINVAL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (uattr) {
+ int ret;
+ struct mount_kattr kattr = {};
+
+ if (flags & OPEN_TREE_CLONE)
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ ret = wants_mount_setattr(uattr, usize, &kattr);
+ if (ret > 0) {
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ finish_mount_kattr(&kattr);
+ }
+ if (ret)
+ return ret;
+ }
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ fd_install(fd, no_free_ptr(file));
+ return fd;
+}
+
int show_path(struct seq_file *m, struct dentry *root)
{
if (root->d_sb->s_op->show_path)
@@ -4915,10 +5232,13 @@ struct kstatmount {
struct statmount __user *buf;
size_t bufsize;
struct vfsmount *mnt;
+ struct mnt_idmap *idmap;
u64 mask;
struct path root;
- struct statmount sm;
struct seq_file seq;
+
+ /* Must be last --ends in a flexible-array member. */
+ struct statmount sm;
};
static u64 mnt_to_attr_flags(struct vfsmount *mnt)
@@ -4990,7 +5310,7 @@ static void statmount_mnt_basic(struct kstatmount *s)
s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
s->sm.mnt_propagation = mnt_to_propagation_flags(m);
- s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+ s->sm.mnt_peer_group = m->mnt_group_id;
s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
}
@@ -5071,7 +5391,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
seq->buf[seq->count] = '\0';
seq->count = start;
seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
- } else if (r->mnt_devname) {
+ } else {
seq_puts(seq, r->mnt_devname);
}
return 0;
@@ -5087,30 +5407,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
{
struct vfsmount *mnt = s->mnt;
struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
int err;
- if (sb->s_op->show_options) {
- size_t start = seq->count;
-
- err = security_sb_show_options(seq, sb);
- if (err)
- return err;
+ err = security_sb_show_options(seq, sb);
+ if (err)
+ return err;
+ if (sb->s_op->show_options) {
err = sb->s_op->show_options(seq, mnt->mnt_root);
if (err)
return err;
+ }
- if (unlikely(seq_has_overflowed(seq)))
- return -EAGAIN;
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
- if (seq->count == start)
- return 0;
+ if (seq->count == start)
+ return 0;
- /* skip leading comma */
- memmove(seq->buf + start, seq->buf + start + 1,
- seq->count - start - 1);
- seq->count--;
- }
+ /* skip leading comma */
+ memmove(seq->buf + start, seq->buf + start + 1,
+ seq->count - start - 1);
+ seq->count--;
return 0;
}
@@ -5185,47 +5504,101 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, true);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_uidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_UIDMAP;
+ return 0;
+}
+
+static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, false);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_gidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_GIDMAP;
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret = 0;
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
- u32 start = seq->count;
+ u32 start, *offp;
+
+ /* Reserve an empty string at the beginning for any unset offsets */
+ if (!seq->count)
+ seq_putc(seq, 0);
+
+ start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = start;
+ offp = &sm->fs_type;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = start;
+ offp = &sm->mnt_root;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = start;
+ offp = &sm->mnt_point;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = start;
+ offp = &sm->mnt_opts;
ret = statmount_mnt_opts(s, seq);
break;
case STATMOUNT_OPT_ARRAY:
- sm->opt_array = start;
+ offp = &sm->opt_array;
ret = statmount_opt_array(s, seq);
break;
case STATMOUNT_OPT_SEC_ARRAY:
- sm->opt_sec_array = start;
+ offp = &sm->opt_sec_array;
ret = statmount_opt_sec_array(s, seq);
break;
case STATMOUNT_FS_SUBTYPE:
- sm->fs_subtype = start;
+ offp = &sm->fs_subtype;
statmount_fs_subtype(s, seq);
break;
case STATMOUNT_SB_SOURCE:
- sm->sb_source = start;
+ offp = &sm->sb_source;
ret = statmount_sb_source(s, seq);
break;
+ case STATMOUNT_MNT_UIDMAP:
+ sm->mnt_uidmap = start;
+ ret = statmount_mnt_uidmap(s, seq);
+ break;
+ case STATMOUNT_MNT_GIDMAP:
+ sm->mnt_gidmap = start;
+ ret = statmount_mnt_gidmap(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
@@ -5251,6 +5624,7 @@ static int statmount_string(struct kstatmount *s, u64 flag)
seq->buf[seq->count++] = '\0';
sm->mask |= flag;
+ *offp = start;
return 0;
}
@@ -5300,7 +5674,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
* We have to find the first mount in our ns and use that, however it
* may not exist, so handle that properly.
*/
- if (RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_empty(ns))
return -ENOENT;
first = child = ns->root;
@@ -5317,6 +5691,23 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
+ STATMOUNT_MNT_BASIC | \
+ STATMOUNT_PROPAGATE_FROM | \
+ STATMOUNT_MNT_ROOT | \
+ STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | \
+ STATMOUNT_MNT_NS_ID | \
+ STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | \
+ STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | \
+ STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_SUPPORTED_MASK | \
+ STATMOUNT_MNT_UIDMAP | \
+ STATMOUNT_MNT_GIDMAP)
+
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5325,7 +5716,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
int err;
/* Has the namespace already been emptied? */
- if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_id && mnt_ns_empty(ns))
return -ENOENT;
s->mnt = lookup_mnt_in_ns(mnt_id, ns);
@@ -5350,12 +5741,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
return err;
s->root = root;
- if (s->mask & STATMOUNT_SB_BASIC)
- statmount_sb_basic(s);
+ /*
+ * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
+ * can change concurrently as we only hold the read-side of the
+ * namespace semaphore and mount properties may change with only
+ * the mount lock held.
+ *
+ * We could sample the mount lock sequence counter to detect
+ * those changes and retry. But it's not worth it. Worst that
+ * happens is that the mnt->mnt_idmap pointer is already changed
+ * while mnt->mnt_flags isn't or vica versa. So what.
+ *
+ * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved
+ * via READ_ONCE()/WRITE_ONCE() and guard against theoretical
+ * torn read/write. That's all we care about right now.
+ */
+ s->idmap = mnt_idmap(s->mnt);
if (s->mask & STATMOUNT_MNT_BASIC)
statmount_mnt_basic(s);
+ if (s->mask & STATMOUNT_SB_BASIC)
+ statmount_sb_basic(s);
+
if (s->mask & STATMOUNT_PROPAGATE_FROM)
statmount_propagate_from(s);
@@ -5383,12 +5791,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_SB_SOURCE)
err = statmount_string(s, STATMOUNT_SB_SOURCE);
+ if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
+
+ if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
+ if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
+ s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
+ s->sm.supported_mask = STATMOUNT_SUPPORTED;
+ }
+
if (err)
return err;
+ /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
+ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
+
return 0;
}
@@ -5406,7 +5828,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
- STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
@@ -5652,6 +6075,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
+ /*
+ * We only need to guard against mount topology changes as
+ * listmount() doesn't care about any mount properties.
+ */
scoped_guard(rwsem_read, &namespace_sem)
ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
@@ -5675,9 +6102,11 @@ static void __init init_mount_tree(void)
if (IS_ERR(mnt))
panic("Can't create rootfs");
- ns = alloc_mnt_ns(&init_user_ns, false);
+ ns = alloc_mnt_ns(&init_user_ns, true);
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
+ ns->seq = atomic64_inc_return(&mnt_ns_seq);
+ ns->ns.inum = PROC_MNT_INIT_INO;
m = real_mount(mnt);
ns->root = m;
ns->nr_mounts = 1;
@@ -5687,7 +6116,6 @@ static void __init init_mount_tree(void)
root.mnt = mnt;
root.dentry = mnt->mnt_root;
- mnt->mnt_flags |= MNT_LOCKED;
set_fs_pwd(current->fs, &root);
set_fs_root(current->fs, &root);
@@ -5734,8 +6162,12 @@ void put_mnt_ns(struct mnt_namespace *ns)
{
if (!refcount_dec_and_test(&ns->ns.count))
return;
- drop_collected_mounts(&ns->root->mnt);
- free_mnt_ns(ns);
+ namespace_lock();
+ emptied_ns = ns;
+ lock_mount_hash();
+ umount_tree(ns->root, 0);
+ unlock_mount_hash();
+ namespace_unlock();
}
struct vfsmount *kern_mount(struct file_system_type *type)
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index f761d44b3436..18b3dc74c70e 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -78,7 +78,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
* [!] NOTE: This must be run in the same thread as ->issue_read() was called
* in as we access the readahead_control struct.
*/
-static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
+static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq,
+ struct readahead_control *ractl)
{
struct netfs_io_request *rreq = subreq->rreq;
size_t rsize = subreq->len;
@@ -86,7 +87,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER)
rsize = umin(rsize, rreq->io_streams[0].sreq_max_len);
- if (rreq->ractl) {
+ if (ractl) {
/* If we don't have sufficient folios in the rolling buffer,
* extract a folioq's worth from the readahead region at a time
* into the buffer. Note that this acquires a ref on each page
@@ -99,7 +100,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
while (rreq->submitted < subreq->start + rsize) {
ssize_t added;
- added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl,
+ added = rolling_buffer_load_from_ra(&rreq->buffer, ractl,
&put_batch);
if (added < 0)
return added;
@@ -155,8 +156,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
netfs_cache_read_terminated, subreq);
}
-static void netfs_issue_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+static void netfs_queue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq,
+ bool last_subreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
@@ -177,8 +179,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq,
}
}
+ if (last_subreq) {
+ smp_wmb(); /* Write lists before ALL_QUEUED. */
+ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+ }
+
spin_unlock(&rreq->lock);
+}
+static void netfs_issue_read(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
switch (subreq->source) {
case NETFS_DOWNLOAD_FROM_SERVER:
rreq->netfs_ops->issue_read(subreq);
@@ -201,7 +212,8 @@ static void netfs_issue_read(struct netfs_io_request *rreq,
* slicing up the region to be read according to available cache blocks and
* network rsize.
*/
-static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
+static void netfs_read_to_pagecache(struct netfs_io_request *rreq,
+ struct readahead_control *ractl)
{
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned long long start = rreq->start;
@@ -252,9 +264,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
if (ret < 0) {
subreq->error = ret;
/* Not queued - release both refs. */
- netfs_put_subrequest(subreq, false,
+ netfs_put_subrequest(subreq,
netfs_sreq_trace_put_cancel);
- netfs_put_subrequest(subreq, false,
+ netfs_put_subrequest(subreq,
netfs_sreq_trace_put_cancel);
break;
}
@@ -281,23 +293,20 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
break;
issue:
- slice = netfs_prepare_read_iterator(subreq);
+ slice = netfs_prepare_read_iterator(subreq, ractl);
if (slice < 0) {
ret = slice;
subreq->error = ret;
trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
/* Not queued - release both refs. */
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
break;
}
size -= slice;
start += slice;
- if (size <= 0) {
- smp_wmb(); /* Write lists before ALL_QUEUED. */
- set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- }
+ netfs_queue_read(rreq, subreq, size <= 0);
netfs_issue_read(rreq, subreq);
cond_resched();
} while (size > 0);
@@ -305,7 +314,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
if (unlikely(size > 0)) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- netfs_wake_read_collector(rreq);
+ netfs_wake_collector(rreq);
}
/* Defer error return as we may need to wait for outstanding I/O. */
@@ -352,18 +361,15 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
- rreq->ractl = ractl;
rreq->submitted = rreq->start;
if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0)
goto cleanup_free;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, ractl);
- netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
- return;
+ return netfs_put_request(rreq, netfs_rreq_trace_put_return);
cleanup_free:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
- return;
+ return netfs_put_request(rreq, netfs_rreq_trace_put_failed);
}
EXPORT_SYMBOL(netfs_readahead);
@@ -382,7 +388,6 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo
if (added < 0)
return added;
rreq->submitted = rreq->start + added;
- rreq->ractl = (struct readahead_control *)1UL;
return 0;
}
@@ -452,7 +457,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len);
rreq->submitted = rreq->start + flen;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
if (sink)
folio_put(sink);
@@ -463,11 +468,11 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
folio_mark_uptodate(folio);
}
folio_unlock(folio);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
alloc_error:
folio_unlock(folio);
return ret;
@@ -521,13 +526,13 @@ int netfs_read_folio(struct file *file, struct folio *folio)
if (ret < 0)
goto discard;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
discard:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
alloc_error:
folio_unlock(folio);
return ret;
@@ -678,11 +683,11 @@ retry:
if (ret < 0)
goto error_put;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
if (ret < 0)
goto error;
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_private_2_killable(folio);
@@ -694,7 +699,7 @@ have_folio_no_wait:
return 0;
error_put:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ netfs_put_request(rreq, netfs_rreq_trace_put_failed);
error:
if (folio) {
folio_unlock(folio);
@@ -743,13 +748,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
if (ret < 0)
goto error_put;
- netfs_read_to_pagecache(rreq);
+ netfs_read_to_pagecache(rreq, NULL);
ret = netfs_wait_for_read(rreq);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret < 0 ? ret : 0;
error_put:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+ netfs_put_request(rreq, netfs_rreq_trace_put_discard);
error:
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b4826360a411..f27ea5099a68 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -53,30 +53,40 @@ static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
* data written into the pagecache until we can find out from the server what
* the values actually are.
*/
-static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
- loff_t i_size, loff_t pos, size_t copied)
+void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t pos, size_t copied)
{
+ loff_t i_size, end = pos + copied;
blkcnt_t add;
size_t gap;
+ if (end <= i_size_read(inode))
+ return;
+
if (ctx->ops->update_i_size) {
- ctx->ops->update_i_size(inode, pos);
+ ctx->ops->update_i_size(inode, end);
return;
}
- i_size_write(inode, pos);
+ spin_lock(&inode->i_lock);
+
+ i_size = i_size_read(inode);
+ if (end > i_size) {
+ i_size_write(inode, end);
#if IS_ENABLED(CONFIG_FSCACHE)
- fscache_update_cookie(ctx->cache, NULL, &pos);
+ fscache_update_cookie(ctx->cache, NULL, &end);
#endif
- gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
- if (copied > gap) {
- add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
+ gap = SECTOR_SIZE - (i_size & (SECTOR_SIZE - 1));
+ if (copied > gap) {
+ add = DIV_ROUND_UP(copied - gap, SECTOR_SIZE);
- inode->i_blocks = min_t(blkcnt_t,
- DIV_ROUND_UP(pos, SECTOR_SIZE),
- inode->i_blocks + add);
+ inode->i_blocks = min_t(blkcnt_t,
+ DIV_ROUND_UP(end, SECTOR_SIZE),
+ inode->i_blocks + add);
+ }
}
+ spin_unlock(&inode->i_lock);
}
/**
@@ -111,12 +121,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
struct folio *folio = NULL, *writethrough = NULL;
unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
ssize_t written = 0, ret, ret2;
- loff_t i_size, pos = iocb->ki_pos;
+ loff_t pos = iocb->ki_pos;
size_t max_chunk = mapping_max_folio_size(mapping);
bool maybe_trouble = false;
- if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
- iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
) {
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -345,10 +354,8 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
flush_dcache_folio(folio);
/* Update the inode size if we moved the EOF marker */
+ netfs_update_i_size(ctx, inode, pos, copied);
pos += copied;
- i_size = i_size_read(inode);
- if (pos > i_size)
- netfs_update_i_size(ctx, inode, i_size, pos, copied);
written += copied;
if (likely(!wreq)) {
@@ -386,7 +393,7 @@ out:
wbc_detach_inode(&wbc);
if (ret2 == -EIOCBQUEUED)
return ret2;
- if (ret == 0)
+ if (ret == 0 && ret2 < 0)
ret = ret2;
}
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 0bf3c2f5a710..a05e13472baf 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -85,7 +85,7 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
if (rreq->netfs_ops->prepare_read) {
ret = rreq->netfs_ops->prepare_read(subreq);
if (ret < 0) {
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
break;
}
}
@@ -103,19 +103,16 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
rreq->netfs_ops->issue_read(subreq);
if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
- netfs_wait_for_pause(rreq);
+ netfs_wait_for_paused_read(rreq);
if (test_bit(NETFS_RREQ_FAILED, &rreq->flags))
break;
- if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
- test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
- break;
cond_resched();
} while (size > 0);
if (unlikely(size > 0)) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
- netfs_wake_read_collector(rreq);
+ netfs_wake_collector(rreq);
}
return ret;
@@ -125,9 +122,9 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq)
* Perform a read to an application buffer, bypassing the pagecache and the
* local disk cache.
*/
-static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
+static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
{
- int ret;
+ ssize_t ret;
_enter("R=%x %llx-%llx",
rreq->debug_id, rreq->start, rreq->start + rreq->len - 1);
@@ -144,7 +141,7 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
ret = netfs_dispatch_unbuffered_reads(rreq);
if (!rreq->submitted) {
- netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ netfs_put_request(rreq, netfs_rreq_trace_put_no_submit);
inode_dio_end(rreq->inode);
ret = 0;
goto out;
@@ -155,7 +152,7 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync)
else
ret = -EIOCBQUEUED;
out:
- _leave(" = %d", ret);
+ _leave(" = %zd", ret);
return ret;
}
@@ -188,7 +185,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
iocb->ki_pos, orig_count,
- NETFS_DIO_READ);
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_READ : NETFS_UNBUFFERED_READ);
if (IS_ERR(rreq))
return PTR_ERR(rreq);
@@ -236,7 +234,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i
}
out:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
if (ret > 0)
orig_count -= ret;
return ret;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index 42ce53cc216e..a16660ab7f83 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -9,20 +9,6 @@
#include <linux/uio.h>
#include "internal.h"
-static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
-{
- struct inode *inode = wreq->inode;
- unsigned long long end = wreq->start + wreq->transferred;
-
- if (!wreq->error &&
- i_size_read(inode) < end) {
- if (wreq->netfs_ops->update_i_size)
- wreq->netfs_ops->update_i_size(inode, end);
- else
- i_size_write(inode, end);
- }
-}
-
/*
* Perform an unbuffered write where we may have to do an RMW operation on an
* encrypted file. This can also be used for direct I/O writes.
@@ -87,6 +73,8 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
}
__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
+ if (async)
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
/* Copy the data into the bounce buffer and encrypt it. */
// TODO
@@ -96,7 +84,6 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
if (async)
wreq->iocb = iocb;
wreq->len = iov_iter_count(&wreq->buffer.iter);
- wreq->cleanup = netfs_cleanup_dio_write;
ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
if (ret < 0) {
_debug("begin = %zd", ret);
@@ -104,20 +91,15 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
}
if (!async) {
- trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- if (ret == 0) {
- ret = wreq->transferred;
+ ret = netfs_wait_for_write(wreq);
+ if (ret > 0)
iocb->ki_pos += ret;
- }
} else {
ret = -EIOCBQUEUED;
}
out:
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
}
EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked);
diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c
index 9397ed39b0b4..8f70f8da064b 100644
--- a/fs/netfs/fscache_cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
EXPORT_SYMBOL(fscache_withdraw_cache);
#ifdef CONFIG_PROC_FS
-static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW";
+static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW";
/*
* Generate a list of caches in /proc/fs/fscache/caches
diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c
index d4d4b3a8b106..3d56fc73435f 100644
--- a/fs/netfs/fscache_cookie.c
+++ b/fs/netfs/fscache_cookie.c
@@ -29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru);
static DEFINE_SPINLOCK(fscache_cookie_lru_lock);
DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out);
static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker);
-static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD";
+static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD";
static unsigned int fscache_lru_cookie_timeout = 10 * HZ;
void fscache_print_cookie(struct fscache_cookie *cookie, char prefix)
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index b1722a82c03d..e4308457633c 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -192,8 +192,7 @@ EXPORT_SYMBOL(__fscache_clear_page_bits);
/*
* Deal with the completion of writing the data to the cache.
*/
-static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
+static void fscache_wreq_done(void *priv, ssize_t transferred_or_error)
{
struct fscache_write_request *wreq = priv;
@@ -202,8 +201,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error,
wreq->set_bits);
if (wreq->term_func)
- wreq->term_func(wreq->term_func_priv, transferred_or_error,
- was_async);
+ wreq->term_func(wreq->term_func_priv, transferred_or_error);
fscache_end_operation(&wreq->cache_resources);
kfree(wreq);
}
@@ -255,14 +253,14 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
return;
abandon_end:
- return fscache_wreq_done(wreq, ret, false);
+ return fscache_wreq_done(wreq, ret);
abandon_free:
kfree(wreq);
abandon:
if (using_pgpriv2)
fscache_clear_page_bits(mapping, start, len, cond);
if (term_func)
- term_func(term_func_priv, ret, false);
+ term_func(term_func_priv, ret);
}
EXPORT_SYMBOL(__fscache_write_to_cache);
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index eb76f98c894b..d4f16fefd965 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -23,11 +23,17 @@
/*
* buffered_read.c
*/
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len);
/*
+ * buffered_write.c
+ */
+void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
+ loff_t pos, size_t copied);
+
+/*
* main.c
*/
extern unsigned int netfs_debug;
@@ -62,6 +68,14 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq,
enum netfs_folioq_trace trace);
void netfs_reset_iter(struct netfs_io_subrequest *subreq);
+void netfs_wake_collector(struct netfs_io_request *rreq);
+void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq);
+void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
+ struct netfs_io_stream *stream);
+ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);
+ssize_t netfs_wait_for_write(struct netfs_io_request *rreq);
+void netfs_wait_for_paused_read(struct netfs_io_request *rreq);
+void netfs_wait_for_paused_write(struct netfs_io_request *rreq);
/*
* objects.c
@@ -71,9 +85,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
loff_t start, size_t len,
enum netfs_io_origin origin);
void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
-void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async);
-void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
- enum netfs_rreq_ref_trace what);
+void netfs_clear_subrequests(struct netfs_io_request *rreq);
+void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what);
struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq);
static inline void netfs_see_request(struct netfs_io_request *rreq,
@@ -92,11 +105,9 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq,
/*
* read_collect.c
*/
+bool netfs_read_collection(struct netfs_io_request *rreq);
void netfs_read_collection_worker(struct work_struct *work);
-void netfs_wake_read_collector(struct netfs_io_request *rreq);
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async);
-ssize_t netfs_wait_for_read(struct netfs_io_request *rreq);
-void netfs_wait_for_pause(struct netfs_io_request *rreq);
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error);
/*
* read_pgpriv2.c
@@ -135,6 +146,8 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_rh_retry_read_req;
+extern atomic_t netfs_n_rh_retry_read_subreq;
extern atomic_t netfs_n_wh_buffered_write;
extern atomic_t netfs_n_wh_writethrough;
extern atomic_t netfs_n_wh_dio_write;
@@ -147,6 +160,8 @@ extern atomic_t netfs_n_wh_upload_failed;
extern atomic_t netfs_n_wh_write;
extern atomic_t netfs_n_wh_write_done;
extern atomic_t netfs_n_wh_write_failed;
+extern atomic_t netfs_n_wh_retry_write_req;
+extern atomic_t netfs_n_wh_retry_write_subreq;
extern atomic_t netfs_n_wb_lock_skip;
extern atomic_t netfs_n_wb_lock_wait;
extern atomic_t netfs_n_folioq;
@@ -172,8 +187,8 @@ static inline void netfs_stat_d(atomic_t *stat)
* write_collect.c
*/
int netfs_folio_written_back(struct folio *folio);
+bool netfs_write_collection(struct netfs_io_request *wreq);
void netfs_write_collection_worker(struct work_struct *work);
-void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async);
/*
* write_issue.c
@@ -194,8 +209,8 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len
int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
struct folio *folio, size_t copied, bool to_page_end,
struct folio **writethrough_cache);
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
- struct folio *writethrough_cache);
+ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache);
int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
/*
@@ -251,6 +266,39 @@ static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
}
/*
+ * Clear and wake up a NETFS_RREQ_* flag bit on a request.
+ */
+static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq,
+ unsigned int rreq_flag,
+ enum netfs_rreq_trace trace)
+{
+ if (test_bit(rreq_flag, &rreq->flags)) {
+ clear_bit_unlock(rreq_flag, &rreq->flags);
+ smp_mb__after_atomic(); /* Set flag before task state */
+ trace_netfs_rreq(rreq, trace);
+ wake_up(&rreq->waitq);
+ }
+}
+
+/*
+ * Test the NETFS_RREQ_IN_PROGRESS flag, inserting an appropriate barrier.
+ */
+static inline bool netfs_check_rreq_in_progress(const struct netfs_io_request *rreq)
+{
+ /* Order read of flags before read of anything else, such as error. */
+ return test_bit_acquire(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+}
+
+/*
+ * Test the NETFS_SREQ_IN_PROGRESS flag, inserting an appropriate barrier.
+ */
+static inline bool netfs_check_subreq_in_progress(const struct netfs_io_subrequest *subreq)
+{
+ /* Order read of flags before read of anything else, such as error. */
+ return test_bit_acquire(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+}
+
+/*
* fscache-cache.c
*/
#ifdef CONFIG_PROC_FS
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 4e3e62040831..73da6c9f5777 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -39,6 +39,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
[NETFS_READ_GAPS] = "RG",
[NETFS_READ_SINGLE] = "R1",
[NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_UNBUFFERED_READ] = "UR",
[NETFS_DIO_READ] = "DR",
[NETFS_WRITEBACK] = "WB",
[NETFS_WRITEBACK_SINGLE] = "W1",
@@ -57,15 +58,15 @@ static int netfs_requests_seq_show(struct seq_file *m, void *v)
if (v == &netfs_io_requests) {
seq_puts(m,
- "REQUEST OR REF FL ERR OPS COVERAGE\n"
- "======== == === == ==== === =========\n"
+ "REQUEST OR REF FLAG ERR OPS COVERAGE\n"
+ "======== == === ==== ==== === =========\n"
);
return 0;
}
rreq = list_entry(v, struct netfs_io_request, proc_link);
seq_printf(m,
- "%08x %s %3d %2lx %4ld %3d @%04llx %llx/%llx",
+ "%08x %s %3d %4lx %4ld %3d @%04llx %llx/%llx",
rreq->debug_id,
netfs_origins[rreq->origin],
refcount_read(&rreq->ref),
@@ -127,11 +128,13 @@ static int __init netfs_init(void)
if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0)
goto error_subreqpool;
+#ifdef CONFIG_PROC_FS
if (!proc_mkdir("fs/netfs", NULL))
goto error_proc;
if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
&netfs_requests_seq_ops))
goto error_procfile;
+#endif
#ifdef CONFIG_FSCACHE_STATS
if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
netfs_stats_show))
@@ -144,9 +147,11 @@ static int __init netfs_init(void)
return 0;
error_fscache:
+#ifdef CONFIG_PROC_FS
error_procfile:
remove_proc_subtree("fs/netfs", NULL);
error_proc:
+#endif
mempool_exit(&netfs_subrequest_pool);
error_subreqpool:
kmem_cache_destroy(netfs_subrequest_slab);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
index 7099aa07737a..20748bcfbf59 100644
--- a/fs/netfs/misc.c
+++ b/fs/netfs/misc.c
@@ -313,3 +313,234 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp)
return true;
}
EXPORT_SYMBOL(netfs_release_folio);
+
+/*
+ * Wake the collection work item.
+ */
+void netfs_wake_collector(struct netfs_io_request *rreq)
+{
+ if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
+ !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
+ queue_work(system_unbound_wq, &rreq->work);
+ } else {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
+ wake_up(&rreq->waitq);
+ }
+}
+
+/*
+ * Mark a subrequest as no longer being in progress and, if need be, wake the
+ * collector.
+ */
+void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *rreq = subreq->rreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[subreq->stream_nr];
+
+ clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
+
+ /* If we are at the head of the queue, wake up the collector. */
+ if (list_is_first(&subreq->rreq_link, &stream->subrequests) ||
+ test_bit(NETFS_RREQ_RETRYING, &rreq->flags))
+ netfs_wake_collector(rreq);
+}
+
+/*
+ * Wait for all outstanding I/O in a stream to quiesce.
+ */
+void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq,
+ struct netfs_io_stream *stream)
+{
+ struct netfs_io_subrequest *subreq;
+ DEFINE_WAIT(myself);
+
+ list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
+ if (!netfs_check_subreq_in_progress(subreq))
+ continue;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_quiesce);
+ for (;;) {
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!netfs_check_subreq_in_progress(subreq))
+ break;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
+ schedule();
+ }
+ }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_waited_quiesce);
+ finish_wait(&rreq->waitq, &myself);
+}
+
+/*
+ * Perform collection in app thread if not offloaded to workqueue.
+ */
+static int netfs_collect_in_app(struct netfs_io_request *rreq,
+ bool (*collector)(struct netfs_io_request *rreq))
+{
+ bool need_collect = false, inactive = true, done = true;
+
+ if (!netfs_check_rreq_in_progress(rreq)) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_recollect);
+ return 1; /* Done */
+ }
+
+ for (int i = 0; i < NR_IO_STREAMS; i++) {
+ struct netfs_io_subrequest *subreq;
+ struct netfs_io_stream *stream = &rreq->io_streams[i];
+
+ if (!stream->active)
+ continue;
+ inactive = false;
+ trace_netfs_collect_stream(rreq, stream);
+ subreq = list_first_entry_or_null(&stream->subrequests,
+ struct netfs_io_subrequest,
+ rreq_link);
+ if (subreq &&
+ (!netfs_check_subreq_in_progress(subreq) ||
+ test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
+ need_collect = true;
+ break;
+ }
+ if (subreq || !test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
+ done = false;
+ }
+
+ if (!need_collect && !inactive && !done)
+ return 0; /* Sleep */
+
+ __set_current_state(TASK_RUNNING);
+ if (collector(rreq)) {
+ /* Drop the ref from the NETFS_RREQ_IN_PROGRESS flag. */
+ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
+ return 1; /* Done */
+ }
+
+ if (inactive) {
+ WARN(true, "Failed to collect inactive req R=%08x\n",
+ rreq->debug_id);
+ cond_resched();
+ }
+ return 2; /* Again */
+}
+
+/*
+ * Wait for a request to complete, successfully or otherwise.
+ */
+static ssize_t netfs_wait_for_in_progress(struct netfs_io_request *rreq,
+ bool (*collector)(struct netfs_io_request *rreq))
+{
+ DEFINE_WAIT(myself);
+ ssize_t ret;
+
+ for (;;) {
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ switch (netfs_collect_in_app(rreq, collector)) {
+ case 0:
+ break;
+ case 1:
+ goto all_collected;
+ case 2:
+ if (!netfs_check_rreq_in_progress(rreq))
+ break;
+ cond_resched();
+ continue;
+ }
+ }
+
+ if (!netfs_check_rreq_in_progress(rreq))
+ break;
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ schedule();
+ }
+
+all_collected:
+ trace_netfs_rreq(rreq, netfs_rreq_trace_waited_ip);
+ finish_wait(&rreq->waitq, &myself);
+
+ ret = rreq->error;
+ if (ret == 0) {
+ ret = rreq->transferred;
+ switch (rreq->origin) {
+ case NETFS_DIO_READ:
+ case NETFS_DIO_WRITE:
+ case NETFS_READ_SINGLE:
+ case NETFS_UNBUFFERED_READ:
+ case NETFS_UNBUFFERED_WRITE:
+ break;
+ default:
+ if (rreq->submitted < rreq->len) {
+ trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+ ret = -EIO;
+ }
+ break;
+ }
+ }
+
+ return ret;
+}
+
+ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_in_progress(rreq, netfs_read_collection);
+}
+
+ssize_t netfs_wait_for_write(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_in_progress(rreq, netfs_write_collection);
+}
+
+/*
+ * Wait for a paused operation to unpause or complete in some manner.
+ */
+static void netfs_wait_for_pause(struct netfs_io_request *rreq,
+ bool (*collector)(struct netfs_io_request *rreq))
+{
+ DEFINE_WAIT(myself);
+
+ for (;;) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
+ prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
+
+ if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
+ switch (netfs_collect_in_app(rreq, collector)) {
+ case 0:
+ break;
+ case 1:
+ goto all_collected;
+ case 2:
+ if (!netfs_check_rreq_in_progress(rreq) ||
+ !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ break;
+ cond_resched();
+ continue;
+ }
+ }
+
+ if (!netfs_check_rreq_in_progress(rreq) ||
+ !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
+ break;
+
+ schedule();
+ }
+
+all_collected:
+ trace_netfs_rreq(rreq, netfs_rreq_trace_waited_pause);
+ finish_wait(&rreq->waitq, &myself);
+}
+
+void netfs_wait_for_paused_read(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_pause(rreq, netfs_read_collection);
+}
+
+void netfs_wait_for_paused_write(struct netfs_io_request *rreq)
+{
+ return netfs_wait_for_pause(rreq, netfs_write_collection);
+}
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index dc6b41ef18b0..e8c99738b5bb 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -10,6 +10,8 @@
#include <linux/delay.h>
#include "internal.h"
+static void netfs_free_request(struct work_struct *work);
+
/*
* Allocate an I/O request and initialise it.
*/
@@ -34,6 +36,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
memset(rreq, 0, kmem_cache_size(cache));
+ INIT_WORK(&rreq->cleanup_work, netfs_free_request);
rreq->start = start;
rreq->len = len;
rreq->origin = origin;
@@ -49,13 +52,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
INIT_LIST_HEAD(&rreq->io_streams[0].subrequests);
INIT_LIST_HEAD(&rreq->io_streams[1].subrequests);
init_waitqueue_head(&rreq->waitq);
- refcount_set(&rreq->ref, 1);
+ refcount_set(&rreq->ref, 2);
if (origin == NETFS_READAHEAD ||
origin == NETFS_READPAGE ||
origin == NETFS_READ_GAPS ||
origin == NETFS_READ_SINGLE ||
origin == NETFS_READ_FOR_WRITE ||
+ origin == NETFS_UNBUFFERED_READ ||
origin == NETFS_DIO_READ) {
INIT_WORK(&rreq->work, netfs_read_collection_worker);
rreq->io_streams[0].avail = true;
@@ -64,8 +68,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
- if (file && file->f_flags & O_NONBLOCK)
- __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -75,7 +77,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
atomic_inc(&ctx->io_count);
- trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), netfs_rreq_trace_new);
netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
@@ -89,7 +91,7 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace
trace_netfs_rreq_ref(rreq->debug_id, r + 1, what);
}
-void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
+void netfs_clear_subrequests(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream;
@@ -101,8 +103,7 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async)
subreq = list_first_entry(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, was_async,
- netfs_sreq_trace_put_clear);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_clear);
}
}
}
@@ -118,13 +119,19 @@ static void netfs_free_request_rcu(struct rcu_head *rcu)
static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
- container_of(work, struct netfs_io_request, work);
+ container_of(work, struct netfs_io_request, cleanup_work);
struct netfs_inode *ictx = netfs_inode(rreq->inode);
unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+
+ /* Cancel/flush the result collection worker. That does not carry a
+ * ref of its own, so we must wait for it somewhere.
+ */
+ cancel_work_sync(&rreq->work);
+
netfs_proc_del_rreq(rreq);
- netfs_clear_subrequests(rreq, false);
+ netfs_clear_subrequests(rreq);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
@@ -145,8 +152,7 @@ static void netfs_free_request(struct work_struct *work)
call_rcu(&rreq->rcu, netfs_free_request_rcu);
}
-void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
- enum netfs_rreq_ref_trace what)
+void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what)
{
unsigned int debug_id;
bool dead;
@@ -156,15 +162,8 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
debug_id = rreq->debug_id;
dead = __refcount_dec_and_test(&rreq->ref, &r);
trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- WARN_ON(1);
- } else {
- netfs_free_request(&rreq->work);
- }
- }
+ if (dead)
+ WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work));
}
}
@@ -206,8 +205,7 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq,
what);
}
-static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
- bool was_async)
+static void netfs_free_subrequest(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
@@ -216,10 +214,10 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
rreq->netfs_ops->free_subrequest(subreq);
mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool);
netfs_stat_d(&netfs_n_rh_sreq);
- netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
+ netfs_put_request(rreq, netfs_rreq_trace_put_subreq);
}
-void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
+void netfs_put_subrequest(struct netfs_io_subrequest *subreq,
enum netfs_sreq_ref_trace what)
{
unsigned int debug_index = subreq->debug_index;
@@ -230,5 +228,5 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async,
dead = __refcount_dec_and_test(&subreq->ref, &r);
trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what);
if (dead)
- netfs_free_subrequest(subreq, was_async);
+ netfs_free_subrequest(subreq);
}
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index f65affa5a9e4..a95e7aadafd0 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -83,14 +83,12 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq,
}
just_unlock:
- if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio->index == rreq->no_unlock_folio &&
- test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
- _debug("no unlock");
- } else {
- trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
- folio_unlock(folio);
- }
+ if (folio->index == rreq->no_unlock_folio &&
+ test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) {
+ _debug("no unlock");
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_read_unlock);
+ folio_unlock(folio);
}
folioq_clear(folioq, slot);
@@ -220,7 +218,7 @@ reassess:
stream->collected_to = front->start;
}
- if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags))
+ if (netfs_check_subreq_in_progress(front))
notes |= HIT_PENDING;
smp_rmb(); /* Read counters after IN_PROGRESS flag. */
transferred = READ_ONCE(front->transferred);
@@ -280,9 +278,15 @@ reassess:
stream->need_retry = true;
notes |= NEED_RETRY | MADE_PROGRESS;
break;
+ } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) {
+ notes |= MADE_PROGRESS;
} else {
- if (!stream->failed)
- stream->transferred = stream->collected_to - rreq->start;
+ if (!stream->failed) {
+ stream->transferred += transferred;
+ stream->transferred_valid = true;
+ }
+ if (front->transferred < front->len)
+ set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags);
notes |= MADE_PROGRESS;
}
@@ -291,13 +295,15 @@ reassess:
spin_lock(&rreq->lock);
remove = front;
- trace_netfs_sreq(front, netfs_sreq_trace_discard);
+ trace_netfs_sreq(front,
+ notes & ABANDON_SREQ ?
+ netfs_sreq_trace_abandoned : netfs_sreq_trace_consumed);
list_del_init(&front->rreq_link);
front = list_first_entry_or_null(&stream->subrequests,
struct netfs_io_subrequest, rreq_link);
stream->front = front;
spin_unlock(&rreq->lock);
- netfs_put_subrequest(remove, false,
+ netfs_put_subrequest(remove,
notes & ABANDON_SREQ ?
netfs_sreq_trace_put_abandon :
netfs_sreq_trace_put_done);
@@ -311,14 +317,8 @@ reassess:
if (notes & NEED_RETRY)
goto need_retry;
- if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) {
- trace_netfs_rreq(rreq, netfs_rreq_trace_unpause);
- clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags);
- smp_mb__after_atomic(); /* Set PAUSE before task state */
- wake_up(&rreq->waitq);
- }
-
if (notes & MADE_PROGRESS) {
+ netfs_wake_rreq_flag(rreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
//cond_resched();
goto reassess;
}
@@ -342,24 +342,10 @@ need_retry:
*/
static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
{
- struct netfs_io_subrequest *subreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
unsigned int i;
- /* Collect unbuffered reads and direct reads, adding up the transfer
- * sizes until we find the first short or failed subrequest.
- */
- list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- rreq->transferred += subreq->transferred;
-
- if (subreq->transferred < subreq->len ||
- test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
- rreq->error = subreq->error;
- break;
- }
- }
-
- if (rreq->origin == NETFS_DIO_READ) {
+ if (rreq->origin == NETFS_UNBUFFERED_READ ||
+ rreq->origin == NETFS_DIO_READ) {
for (i = 0; i < rreq->direct_bv_count; i++) {
flush_dcache_page(rreq->direct_bv[i].bv_page);
// TODO: cifs marks pages in the destination buffer
@@ -371,13 +357,16 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
- if (rreq->iocb->ki_complete)
+ if (rreq->iocb->ki_complete) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
- if (rreq->origin == NETFS_DIO_READ)
+ if (rreq->origin == NETFS_UNBUFFERED_READ ||
+ rreq->origin == NETFS_DIO_READ)
inode_dio_end(rreq->inode);
}
@@ -396,9 +385,11 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
if (rreq->iocb) {
rreq->iocb->ki_pos += rreq->transferred;
- if (rreq->iocb->ki_complete)
+ if (rreq->iocb->ki_complete) {
+ trace_netfs_rreq(rreq, netfs_rreq_trace_ki_complete);
rreq->iocb->ki_complete(
rreq->iocb, rreq->error ? rreq->error : rreq->transferred);
+ }
}
if (rreq->netfs_ops->done)
rreq->netfs_ops->done(rreq);
@@ -410,7 +401,7 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq)
* Note that we're in normal kernel thread context at this point, possibly
* running on a workqueue.
*/
-static void netfs_read_collection(struct netfs_io_request *rreq)
+bool netfs_read_collection(struct netfs_io_request *rreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
@@ -420,11 +411,11 @@ static void netfs_read_collection(struct netfs_io_request *rreq)
* queue is empty.
*/
if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags))
- return;
+ return false;
smp_rmb(); /* Read ALL_QUEUED before subreq lists. */
if (!list_empty(&stream->subrequests))
- return;
+ return false;
/* Okay, declare that all I/O is complete. */
rreq->transferred = stream->transferred;
@@ -433,6 +424,7 @@ static void netfs_read_collection(struct netfs_io_request *rreq)
//netfs_rreq_is_still_valid(rreq);
switch (rreq->origin) {
+ case NETFS_UNBUFFERED_READ:
case NETFS_DIO_READ:
case NETFS_READ_GAPS:
netfs_rreq_assess_dio(rreq);
@@ -445,14 +437,15 @@ static void netfs_read_collection(struct netfs_io_request *rreq)
}
task_io_account_read(rreq->transferred);
- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
- clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ netfs_wake_rreq_flag(rreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+ /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
trace_netfs_rreq(rreq, netfs_rreq_trace_done);
- netfs_clear_subrequests(rreq, false);
+ netfs_clear_subrequests(rreq);
netfs_unlock_abandoned_read_pages(rreq);
if (unlikely(rreq->copy_to_cache))
netfs_pgpriv2_end_copy_to_cache(rreq);
+ return true;
}
void netfs_read_collection_worker(struct work_struct *work)
@@ -460,25 +453,12 @@ void netfs_read_collection_worker(struct work_struct *work)
struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
netfs_see_request(rreq, netfs_rreq_trace_see_work);
- if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- netfs_read_collection(rreq);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_work);
-}
-
-/*
- * Wake the collection work item.
- */
-void netfs_wake_read_collector(struct netfs_io_request *rreq)
-{
- if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
- if (!work_pending(&rreq->work)) {
- netfs_get_request(rreq, netfs_rreq_trace_get_work);
- if (!queue_work(system_unbound_wq, &rreq->work))
- netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq);
- }
- } else {
- trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue);
- wake_up(&rreq->waitq);
+ if (netfs_check_rreq_in_progress(rreq)) {
+ if (netfs_read_collection(rreq))
+ /* Drop the ref from the IN_PROGRESS flag. */
+ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
+ else
+ netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
}
}
@@ -510,7 +490,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq)
list_is_first(&subreq->rreq_link, &stream->subrequests)
) {
__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
- netfs_wake_read_collector(rreq);
+ netfs_wake_collector(rreq);
}
}
EXPORT_SYMBOL(netfs_read_subreq_progress);
@@ -534,7 +514,6 @@ EXPORT_SYMBOL(netfs_read_subreq_progress);
void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -581,22 +560,15 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
}
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
- smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
-
- /* If we are at the head of the queue, wake up the collector. */
- if (list_is_first(&subreq->rreq_link, &stream->subrequests))
- netfs_wake_read_collector(rreq);
-
- netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);
+ netfs_subreq_clear_in_progress(subreq);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_read_subreq_terminated);
/*
* Handle termination of a read from the cache.
*/
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async)
+void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error)
{
struct netfs_io_subrequest *subreq = priv;
@@ -611,92 +583,3 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool
}
netfs_read_subreq_terminated(subreq);
}
-
-/*
- * Wait for the read operation to complete, successfully or otherwise.
- */
-ssize_t netfs_wait_for_read(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
- DEFINE_WAIT(myself);
- ssize_t ret;
-
- for (;;) {
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
- prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
-
- subreq = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
- if (subreq &&
- (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
- test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
- __set_current_state(TASK_RUNNING);
- netfs_read_collection(rreq);
- continue;
- }
-
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
-
- schedule();
- trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
- }
-
- finish_wait(&rreq->waitq, &myself);
-
- ret = rreq->error;
- if (ret == 0) {
- ret = rreq->transferred;
- switch (rreq->origin) {
- case NETFS_DIO_READ:
- case NETFS_READ_SINGLE:
- ret = rreq->transferred;
- break;
- default:
- if (rreq->submitted < rreq->len) {
- trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
- ret = -EIO;
- }
- break;
- }
- }
-
- return ret;
-}
-
-/*
- * Wait for a paused read operation to unpause or complete in some manner.
- */
-void netfs_wait_for_pause(struct netfs_io_request *rreq)
-{
- struct netfs_io_subrequest *subreq;
- struct netfs_io_stream *stream = &rreq->io_streams[0];
- DEFINE_WAIT(myself);
-
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause);
-
- for (;;) {
- trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
- prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
-
- subreq = list_first_entry_or_null(&stream->subrequests,
- struct netfs_io_subrequest, rreq_link);
- if (subreq &&
- (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) ||
- test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) {
- __set_current_state(TASK_RUNNING);
- netfs_read_collection(rreq);
- continue;
- }
-
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) ||
- !test_bit(NETFS_RREQ_PAUSE, &rreq->flags))
- break;
-
- schedule();
- trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
- }
-
- finish_wait(&rreq->waitq, &myself);
-}
diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c
index cf7727060215..8097bc069c1d 100644
--- a/fs/netfs/read_pgpriv2.c
+++ b/fs/netfs/read_pgpriv2.c
@@ -110,13 +110,15 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
if (!creq->io_streams[1].avail)
goto cancel_put;
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &creq->flags);
+ trace_netfs_copy2cache(rreq, creq);
trace_netfs_write(creq, netfs_write_trace_copy_to_cache);
netfs_stat(&netfs_n_wh_copy_to_cache);
rreq->copy_to_cache = creq;
return creq;
cancel_put:
- netfs_put_request(creq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(creq, netfs_rreq_trace_put_return);
cancel:
rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
@@ -154,8 +156,11 @@ void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)
netfs_issue_write(creq, &creq->io_streams[1]);
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_end_copy_to_cache);
+ if (list_empty_careful(&creq->io_streams[1].subrequests))
+ netfs_wake_collector(creq);
- netfs_put_request(creq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(creq, netfs_rreq_trace_put_return);
creq->copy_to_cache = NULL;
}
diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c
index 2290af0d51ac..b99e84a8170a 100644
--- a/fs/netfs/read_retry.c
+++ b/fs/netfs/read_retry.c
@@ -14,7 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq,
{
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ netfs_stat(&netfs_n_rh_retry_read_subreq);
subreq->rreq->netfs_ops->issue_read(subreq);
}
@@ -48,6 +48,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
subreq->retry_count++;
netfs_reset_iter(subreq);
+ netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq);
}
}
@@ -75,7 +76,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
struct iov_iter source;
unsigned long long start, len;
size_t part;
- bool boundary = false;
+ bool boundary = false, subreq_superfluous = false;
/* Go through the subreqs and find the next span of contiguous
* buffer that we then rejig (cifs, for example, needs the
@@ -116,8 +117,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
- if (!len)
+ if (!len) {
+ subreq_superfluous = true;
break;
+ }
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start - subreq->transferred;
subreq->len = len + subreq->transferred;
@@ -154,21 +157,23 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq);
- if (subreq == to)
+ if (subreq == to) {
+ subreq_superfluous = false;
break;
+ }
}
/* If we managed to use fewer subreqs, we can discard the
* excess; if we used the same number, then we're done.
*/
if (!len) {
- if (subreq == to)
+ if (!subreq_superfluous)
continue;
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
- trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous);
list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
@@ -187,14 +192,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
subreq->len = len;
- subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
subreq->stream_nr = stream->stream_nr;
subreq->retry_count = 1;
trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
@@ -254,16 +257,16 @@ abandon:
*/
void netfs_retry_reads(struct netfs_io_request *rreq)
{
- struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
+ netfs_stat(&netfs_n_rh_retry_read_req);
+
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
- list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- }
+ set_bit(NETFS_RREQ_RETRYING, &rreq->flags);
+ netfs_wait_for_in_progress_stream(rreq, stream);
+ clear_bit(NETFS_RREQ_RETRYING, &rreq->flags);
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
netfs_retry_read_subrequests(rreq);
diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c
index fea0ecdecc53..fa622a6cd56d 100644
--- a/fs/netfs/read_single.c
+++ b/fs/netfs/read_single.c
@@ -142,7 +142,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq)
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
return ret;
cancel:
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel);
return ret;
}
@@ -185,11 +185,11 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite
netfs_single_dispatch_read(rreq);
ret = netfs_wait_for_read(rreq);
- netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
+ netfs_put_request(rreq, netfs_rreq_trace_put_return);
return ret;
cleanup_free:
- netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
+ netfs_put_request(rreq, netfs_rreq_trace_put_failed);
return ret;
}
EXPORT_SYMBOL(netfs_read_single);
diff --git a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c
index 75d97af14b4a..207b6a326651 100644
--- a/fs/netfs/rolling_buffer.c
+++ b/fs/netfs/rolling_buffer.c
@@ -146,10 +146,6 @@ ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll,
/* Store the counter after setting the slot. */
smp_store_release(&roll->next_head_slot, to);
-
- for (; ix < folioq_nr_slots(fq); ix++)
- folioq_clear(fq, ix);
-
return size;
}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index f1af344266cc..ab6b916addc4 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_rh_retry_read_req;
+atomic_t netfs_n_rh_retry_read_subreq;
atomic_t netfs_n_wh_buffered_write;
atomic_t netfs_n_wh_writethrough;
atomic_t netfs_n_wh_dio_write;
@@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed;
atomic_t netfs_n_wh_write;
atomic_t netfs_n_wh_write_done;
atomic_t netfs_n_wh_write_failed;
+atomic_t netfs_n_wh_retry_write_req;
+atomic_t netfs_n_wh_retry_write_subreq;
atomic_t netfs_n_wb_lock_skip;
atomic_t netfs_n_wb_lock_wait;
atomic_t netfs_n_folioq;
@@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v)
atomic_read(&netfs_n_wh_write),
atomic_read(&netfs_n_wh_write_done),
atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n",
+ atomic_read(&netfs_n_rh_retry_read_req),
+ atomic_read(&netfs_n_rh_retry_read_subreq),
+ atomic_read(&netfs_n_wh_retry_write_req),
+ atomic_read(&netfs_n_wh_retry_write_subreq));
seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 294f67795f79..cbf3d9194c7b 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -240,7 +240,7 @@ reassess_streams:
}
/* Stall if the front is still undergoing I/O. */
- if (test_bit(NETFS_SREQ_IN_PROGRESS, &front->flags)) {
+ if (netfs_check_subreq_in_progress(front)) {
notes |= HIT_PENDING;
break;
}
@@ -254,6 +254,7 @@ reassess_streams:
if (front->start + front->transferred > stream->collected_to) {
stream->collected_to = front->start + front->transferred;
stream->transferred = stream->collected_to - wreq->start;
+ stream->transferred_valid = true;
notes |= MADE_PROGRESS;
}
if (test_bit(NETFS_SREQ_FAILED, &front->flags)) {
@@ -280,7 +281,7 @@ reassess_streams:
struct netfs_io_subrequest, rreq_link);
stream->front = front;
spin_unlock(&wreq->lock);
- netfs_put_subrequest(remove, false,
+ netfs_put_subrequest(remove,
notes & SAW_FAILURE ?
netfs_sreq_trace_put_cancel :
netfs_sreq_trace_put_done);
@@ -321,18 +322,14 @@ reassess_streams:
if (notes & NEED_RETRY)
goto need_retry;
- if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
- trace_netfs_rreq(wreq, netfs_rreq_trace_unpause);
- clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags);
- smp_mb__after_atomic(); /* Set PAUSE before task state */
- wake_up(&wreq->waitq);
- }
- if (notes & NEED_REASSESS) {
+ if (notes & MADE_PROGRESS) {
+ netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause);
//cond_resched();
goto reassess_streams;
}
- if (notes & MADE_PROGRESS) {
+
+ if (notes & NEED_REASSESS) {
//cond_resched();
goto reassess_streams;
}
@@ -356,30 +353,22 @@ need_retry:
/*
* Perform the collection of subrequests, folios and encryption buffers.
*/
-void netfs_write_collection_worker(struct work_struct *work)
+bool netfs_write_collection(struct netfs_io_request *wreq)
{
- struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
struct netfs_inode *ictx = netfs_inode(wreq->inode);
size_t transferred;
+ bool transferred_valid = false;
int s;
_enter("R=%x", wreq->debug_id);
- netfs_see_request(wreq, netfs_rreq_trace_see_work);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) {
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
- return;
- }
-
netfs_collect_write_results(wreq);
/* We're done when the app thread has finished posting subreqs and all
* the queues in all the streams are empty.
*/
- if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) {
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
- return;
- }
+ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags))
+ return false;
smp_rmb(); /* Read ALL_QUEUED before lists. */
transferred = LONG_MAX;
@@ -387,28 +376,33 @@ void netfs_write_collection_worker(struct work_struct *work)
struct netfs_io_stream *stream = &wreq->io_streams[s];
if (!stream->active)
continue;
- if (!list_empty(&stream->subrequests)) {
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work);
- return;
- }
- if (stream->transferred < transferred)
+ if (!list_empty(&stream->subrequests))
+ return false;
+ if (stream->transferred_valid &&
+ stream->transferred < transferred) {
transferred = stream->transferred;
+ transferred_valid = true;
+ }
}
/* Okay, declare that all I/O is complete. */
- wreq->transferred = transferred;
+ if (transferred_valid)
+ wreq->transferred = transferred;
trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
if (wreq->io_streams[1].active &&
- wreq->io_streams[1].failed) {
+ wreq->io_streams[1].failed &&
+ ictx->ops->invalidate_cache) {
/* Cache write failure doesn't prevent writeback completion
* unless we're in disconnected mode.
*/
ictx->ops->invalidate_cache(wreq);
}
- if (wreq->cleanup)
- wreq->cleanup(wreq);
+ if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
+ wreq->origin == NETFS_DIO_WRITE) &&
+ !wreq->error)
+ netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
if (wreq->origin == NETFS_DIO_WRITE &&
wreq->mapping->nrpages) {
@@ -427,31 +421,35 @@ void netfs_write_collection_worker(struct work_struct *work)
inode_dio_end(wreq->inode);
_debug("finished");
- trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
- clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+ /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
if (wreq->iocb) {
size_t written = min(wreq->transferred, wreq->len);
wreq->iocb->ki_pos += written;
- if (wreq->iocb->ki_complete)
+ if (wreq->iocb->ki_complete) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
wreq->iocb->ki_complete(
wreq->iocb, wreq->error ? wreq->error : written);
+ }
wreq->iocb = VFS_PTR_POISON;
}
- netfs_clear_subrequests(wreq, false);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete);
+ netfs_clear_subrequests(wreq);
+ return true;
}
-/*
- * Wake the collection work item.
- */
-void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
+void netfs_write_collection_worker(struct work_struct *work)
{
- if (!work_pending(&wreq->work)) {
- netfs_get_request(wreq, netfs_rreq_trace_get_work);
- if (!queue_work(system_unbound_wq, &wreq->work))
- netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq);
+ struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work);
+
+ netfs_see_request(rreq, netfs_rreq_trace_see_work);
+ if (netfs_check_rreq_in_progress(rreq)) {
+ if (netfs_write_collection(rreq))
+ /* Drop the ref from the IN_PROGRESS flag. */
+ netfs_put_request(rreq, netfs_rreq_trace_put_work_ip);
+ else
+ netfs_see_request(rreq, netfs_rreq_trace_see_work_complete);
}
}
@@ -459,7 +457,6 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
* netfs_write_subrequest_terminated - Note the termination of a write operation.
* @_op: The I/O request that has terminated.
* @transferred_or_error: The amount of data transferred or an error code.
- * @was_async: The termination was asynchronous
*
* This tells the library that a contributory write I/O operation has
* terminated, one way or another, and that it should collect the results.
@@ -469,21 +466,16 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async)
* negative error code. The library will look after reissuing I/O operations
* as appropriate and writing downloaded data to the cache.
*
- * If @was_async is true, the caller might be running in softirq or interrupt
- * context and we can't sleep.
- *
* When this is called, ownership of the subrequest is transferred back to the
* library, along with a ref.
*
* Note that %_op is a void* so that the function can be passed to
* kiocb::term_func without the need for a casting wrapper.
*/
-void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
- bool was_async)
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error)
{
struct netfs_io_subrequest *subreq = _op;
struct netfs_io_request *wreq = subreq->rreq;
- struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr];
_enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
@@ -494,8 +486,6 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
case NETFS_WRITE_TO_CACHE:
netfs_stat(&netfs_n_wh_write_done);
break;
- case NETFS_INVALID_WRITE:
- break;
default:
BUG();
}
@@ -535,15 +525,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
}
trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
-
- clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
-
- /* If we are at the head of the queue, wake up the collector,
- * transferring a ref to it if we were the ones to do so.
- */
- if (list_is_first(&subreq->rreq_link, &stream->subrequests))
- netfs_wake_write_collector(wreq, was_async);
-
- netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ netfs_subreq_clear_in_progress(subreq);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated);
}
EXPORT_SYMBOL(netfs_write_subrequest_terminated);
diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c
index 69727411683e..0584cba1a043 100644
--- a/fs/netfs/write_issue.c
+++ b/fs/netfs/write_issue.c
@@ -118,12 +118,12 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
wreq->io_streams[0].prepare_write = ictx->ops->prepare_write;
wreq->io_streams[0].issue_write = ictx->ops->issue_write;
wreq->io_streams[0].collected_to = start;
- wreq->io_streams[0].transferred = LONG_MAX;
+ wreq->io_streams[0].transferred = 0;
wreq->io_streams[1].stream_nr = 1;
wreq->io_streams[1].source = NETFS_WRITE_TO_CACHE;
wreq->io_streams[1].collected_to = start;
- wreq->io_streams[1].transferred = LONG_MAX;
+ wreq->io_streams[1].transferred = 0;
if (fscache_resources_valid(&wreq->cache_resources)) {
wreq->io_streams[1].avail = true;
wreq->io_streams[1].active = true;
@@ -134,7 +134,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
return wreq;
nomem:
wreq->error = -ENOMEM;
- netfs_put_request(wreq, false, netfs_rreq_trace_put_failed);
+ netfs_put_request(wreq, netfs_rreq_trace_put_failed);
return ERR_PTR(-ENOMEM);
}
@@ -233,7 +233,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream,
_enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len);
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
- return netfs_write_subrequest_terminated(subreq, subreq->error, false);
+ return netfs_write_subrequest_terminated(subreq, subreq->error);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
stream->issue_write(subreq);
@@ -253,6 +253,7 @@ void netfs_reissue_write(struct netfs_io_stream *stream,
subreq->retry_count++;
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+ netfs_stat(&netfs_n_wh_retry_write_subreq);
netfs_do_issue_write(stream, subreq);
}
@@ -541,7 +542,7 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq)
}
if (needs_poke)
- netfs_wake_write_collector(wreq, false);
+ netfs_wake_collector(wreq);
}
/*
@@ -575,6 +576,7 @@ int netfs_writepages(struct address_space *mapping,
goto couldnt_start;
}
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
trace_netfs_write(wreq, netfs_write_trace_writeback);
netfs_stat(&netfs_n_wh_writepages);
@@ -598,8 +600,9 @@ int netfs_writepages(struct address_space *mapping,
netfs_end_issue_write(wreq);
mutex_unlock(&ictx->wb_lock);
+ netfs_wake_collector(wreq);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
_leave(" = %d", error);
return error;
@@ -672,11 +675,11 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
/*
* End a write operation used when writing through the pagecache.
*/
-int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
- struct folio *writethrough_cache)
+ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
+ struct folio *writethrough_cache)
{
struct netfs_inode *ictx = netfs_inode(wreq->inode);
- int ret;
+ ssize_t ret;
_enter("R=%x", wreq->debug_id);
@@ -687,13 +690,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr
mutex_unlock(&ictx->wb_lock);
- if (wreq->iocb) {
+ if (wreq->iocb)
ret = -EIOCBQUEUED;
- } else {
- wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
- ret = wreq->error;
- }
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ else
+ ret = netfs_wait_for_write(wreq);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
return ret;
}
@@ -721,10 +722,8 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t
start += part;
len -= part;
rolling_buffer_advance(&wreq->buffer, part);
- if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) {
- trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause);
- wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags));
- }
+ if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags))
+ netfs_wait_for_paused_write(wreq);
if (test_bit(NETFS_RREQ_FAILED, &wreq->flags))
break;
}
@@ -884,7 +883,8 @@ int netfs_writeback_single(struct address_space *mapping,
goto couldnt_start;
}
- trace_netfs_write(wreq, netfs_write_trace_writeback);
+ __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
+ trace_netfs_write(wreq, netfs_write_trace_writeback_single);
netfs_stat(&netfs_n_wh_writepages);
if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
@@ -913,8 +913,9 @@ stop:
set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags);
mutex_unlock(&ictx->wb_lock);
+ netfs_wake_collector(wreq);
- netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ netfs_put_request(wreq, netfs_rreq_trace_put_return);
_leave(" = %d", ret);
return ret;
diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c
index c841a851dd73..fc9c3e0d34d8 100644
--- a/fs/netfs/write_retry.c
+++ b/fs/netfs/write_retry.c
@@ -39,9 +39,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
if (test_bit(NETFS_SREQ_FAILED, &subreq->flags))
break;
if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
- struct iov_iter source = subreq->io_iter;
+ struct iov_iter source;
- iov_iter_revert(&source, subreq->len - source.count);
+ netfs_reset_iter(subreq);
+ source = subreq->io_iter;
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_write(stream, subreq, &source);
}
@@ -131,7 +132,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
list_del(&subreq->rreq_link);
- netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
+ netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
if (subreq == to)
break;
}
@@ -145,14 +146,13 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
subreq = netfs_alloc_subrequest(wreq);
subreq->source = to->source;
subreq->start = start;
- subreq->debug_index = atomic_inc_return(&wreq->subreq_counter);
subreq->stream_nr = to->stream_nr;
subreq->retry_count = 1;
trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
- netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_split);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
@@ -199,23 +199,21 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq,
*/
void netfs_retry_writes(struct netfs_io_request *wreq)
{
- struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream;
int s;
+ netfs_stat(&netfs_n_wh_retry_write_req);
+
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
+ set_bit(NETFS_RREQ_RETRYING, &wreq->flags);
for (s = 0; s < NR_IO_STREAMS; s++) {
stream = &wreq->io_streams[s];
- if (!stream->active)
- continue;
-
- list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
- wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
- TASK_UNINTERRUPTIBLE);
- }
+ if (stream->active)
+ netfs_wait_for_in_progress_stream(wreq, stream);
}
+ clear_bit(NETFS_RREQ_RETRYING, &wreq->flags);
// TODO: Enc: Fetch changed partial pages
// TODO: Enc: Reencrypt content if needed.
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index d3f76101ad4b..07932ce9246c 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -2,6 +2,7 @@
config NFS_FS
tristate "NFS client support"
depends on INET && FILE_LOCKING && MULTIUSER
+ select CRC32
select LOCKD
select SUNRPC
select NFS_COMMON
@@ -196,7 +197,6 @@ config NFS_USE_KERNEL_DNS
config NFS_DEBUG
bool
depends on NFS_FS && SUNRPC_DEBUG
- select CRC32
default y
config NFS_DISABLE_UDP_SUPPORT
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 47189476b553..5d6edafbed20 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -149,8 +149,8 @@ do_add_page_to_bio(struct bio *bio, int npg, enum req_op op, sector_t isect,
/* limit length to what the device mapping allows */
end = disk_addr + *len;
- if (end >= map->start + map->len)
- *len = map->start + map->len - disk_addr;
+ if (end >= map->disk_offset + map->len)
+ *len = map->disk_offset + map->len - disk_addr;
retry:
if (!bio) {
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index cab8809f0e0f..44306ac22353 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -257,10 +257,11 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
struct pnfs_block_dev *child;
u64 chunk;
u32 chunk_idx;
+ u64 disk_chunk;
u64 disk_offset;
chunk = div_u64(offset, dev->chunk_size);
- div_u64_rem(chunk, dev->nr_children, &chunk_idx);
+ disk_chunk = div_u64_rem(chunk, dev->nr_children, &chunk_idx);
if (chunk_idx >= dev->nr_children) {
dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
@@ -273,7 +274,7 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
offset = chunk * dev->chunk_size;
/* disk offset of the stripe */
- disk_offset = div_u64(offset, dev->nr_children);
+ disk_offset = disk_chunk * dev->chunk_size;
child = &dev->children[chunk_idx];
child->map(child, disk_offset, map);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 8f7cff7a4293..315949a7e92d 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -6,6 +6,7 @@
#include <linux/vmalloc.h>
#include "blocklayout.h"
+#include "../nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -520,10 +521,71 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}
-static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+/**
+ * ext_tree_try_encode_commit - try to encode all extents into the buffer
+ * @bl: pointer to the layout
+ * @p: pointer to the output buffer
+ * @buffer_size: size of the output buffer
+ * @count: output pointer to the number of encoded extents
+ * @lastbyte: output pointer to the last written byte
+ *
+ * Return values:
+ * %0: Success, all required extents encoded, outputs are valid
+ * %-ENOSPC: Buffer too small, nothing encoded, outputs are invalid
+ */
+static int
+ext_tree_try_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
size_t buffer_size, size_t *count, __u64 *lastbyte)
{
struct pnfs_block_extent *be;
+
+ spin_lock(&bl->bl_ext_lock);
+ for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_WRITTEN)
+ continue;
+
+ (*count)++;
+ if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
+ spin_unlock(&bl->bl_ext_lock);
+ return -ENOSPC;
+ }
+ }
+ for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
+ if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
+ be->be_tag != EXTENT_WRITTEN)
+ continue;
+
+ if (bl->bl_scsi_layout)
+ p = encode_scsi_range(be, p);
+ else
+ p = encode_block_extent(be, p);
+ be->be_tag = EXTENT_COMMITTING;
+ }
+ *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX;
+ bl->bl_lwb = 0;
+ spin_unlock(&bl->bl_ext_lock);
+
+ return 0;
+}
+
+/**
+ * ext_tree_encode_commit - encode as much as possible extents into the buffer
+ * @bl: pointer to the layout
+ * @p: pointer to the output buffer
+ * @buffer_size: size of the output buffer
+ * @count: output pointer to the number of encoded extents
+ * @lastbyte: output pointer to the last written byte
+ *
+ * Return values:
+ * %0: Success, all required extents encoded, outputs are valid
+ * %-ENOSPC: Buffer too small, some extents are encoded, outputs are valid
+ */
+static int
+ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
+ size_t buffer_size, size_t *count, __u64 *lastbyte)
+{
+ struct pnfs_block_extent *be, *be_prev;
int ret = 0;
spin_lock(&bl->bl_ext_lock);
@@ -534,9 +596,9 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
(*count)++;
if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
- /* keep counting.. */
+ (*count)--;
ret = -ENOSPC;
- continue;
+ break;
}
if (bl->bl_scsi_layout)
@@ -544,14 +606,30 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
else
p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
+ be_prev = be;
+ }
+ if (!ret) {
+ *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX;
+ bl->bl_lwb = 0;
+ } else {
+ *lastbyte = be_prev->be_f_offset + be_prev->be_length;
+ *lastbyte <<= SECTOR_SHIFT;
+ *lastbyte -= 1;
}
- *lastbyte = bl->bl_lwb - 1;
- bl->bl_lwb = 0;
spin_unlock(&bl->bl_ext_lock);
return ret;
}
+/**
+ * ext_tree_prepare_commit - encode extents that need to be committed
+ * @arg: layout commit data
+ *
+ * Return values:
+ * %0: Success, all required extents are encoded
+ * %-ENOSPC: Some extents are encoded, but not all, due to RPC size limit
+ * %-ENOMEM: Out of memory, extents not encoded
+ */
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
@@ -560,20 +638,18 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
__be32 *start_p;
int ret;
- dprintk("%s enter\n", __func__);
-
arg->layoutupdate_page = alloc_page(GFP_NOFS);
if (!arg->layoutupdate_page)
return -ENOMEM;
start_p = page_address(arg->layoutupdate_page);
arg->layoutupdate_pages = &arg->layoutupdate_page;
-retry:
- ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
+ ret = ext_tree_try_encode_commit(bl, start_p + 1, buffer_size,
+ &count, &arg->lastbytewritten);
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = ext_tree_layoutupdate_size(bl, count);
+ buffer_size = NFS_SERVER(arg->inode)->wsize;
count = 0;
arg->layoutupdate_pages =
@@ -588,7 +664,8 @@ retry:
return -ENOMEM;
}
- goto retry;
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size,
+ &count, &arg->lastbytewritten);
}
*start_p = cpu_to_be32(count);
@@ -607,8 +684,9 @@ retry:
}
}
- dprintk("%s found %zu ranges\n", __func__, count);
- return 0;
+ trace_bl_ext_tree_prepare_commit(ret, count,
+ arg->lastbytewritten, !!ret);
+ return ret;
}
void
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index d8d50a88de04..d526f5ba7887 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -141,24 +141,18 @@ static const struct rpc_pipe_ops bl_upcall_ops = {
.destroy_msg = bl_pipe_destroy_msg,
};
-static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
+static int nfs4blocklayout_register_sb(struct super_block *sb,
struct rpc_pipe *pipe)
{
- struct dentry *dir, *dentry;
+ struct dentry *dir;
+ int err;
dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
+ return -ENOENT;
+ err = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
dput(dir);
- return dentry;
-}
-
-static void nfs4blocklayout_unregister_sb(struct super_block *sb,
- struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
+ return err;
}
static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
@@ -167,7 +161,6 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
struct super_block *sb = ptr;
struct net *net = sb->s_fs_info;
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
int ret = 0;
if (!try_module_get(THIS_MODULE))
@@ -180,16 +173,10 @@ static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
switch (event) {
case RPC_PIPEFS_MOUNT:
- dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- nn->bl_device_pipe->dentry = dentry;
+ ret = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
break;
case RPC_PIPEFS_UMOUNT:
- if (nn->bl_device_pipe->dentry)
- nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
+ rpc_unlink(nn->bl_device_pipe);
break;
default:
ret = -ENOTSUPP;
@@ -203,18 +190,17 @@ static struct notifier_block nfs4blocklayout_block = {
.notifier_call = rpc_pipefs_event,
};
-static struct dentry *nfs4blocklayout_register_net(struct net *net,
- struct rpc_pipe *pipe)
+static int nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe)
{
struct super_block *pipefs_sb;
- struct dentry *dentry;
+ int ret;
pipefs_sb = rpc_get_sb_net(net);
if (!pipefs_sb)
- return NULL;
- dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
+ return 0;
+ ret = nfs4blocklayout_register_sb(pipefs_sb, pipe);
rpc_put_sb_net(net);
- return dentry;
+ return ret;
}
static void nfs4blocklayout_unregister_net(struct net *net,
@@ -224,7 +210,7 @@ static void nfs4blocklayout_unregister_net(struct net *net,
pipefs_sb = rpc_get_sb_net(net);
if (pipefs_sb) {
- nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
+ rpc_unlink(pipe);
rpc_put_sb_net(net);
}
}
@@ -232,20 +218,17 @@ static void nfs4blocklayout_unregister_net(struct net *net,
static int nfs4blocklayout_net_init(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
- struct dentry *dentry;
+ int err;
mutex_init(&nn->bl_mutex);
init_waitqueue_head(&nn->bl_wq);
nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
if (IS_ERR(nn->bl_device_pipe))
return PTR_ERR(nn->bl_device_pipe);
- dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
- if (IS_ERR(dentry)) {
+ err = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
+ if (unlikely(err))
rpc_destroy_pipe_data(nn->bl_device_pipe);
- return PTR_ERR(dentry);
- }
- nn->bl_device_pipe->dentry = dentry;
- return 0;
+ return err;
}
static void nfs4blocklayout_net_exit(struct net *net)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3b0918ade53c..8fb4a950dd55 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_proto = cl_init->proto;
clp->cl_nconnect = cl_init->nconnect;
clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1;
- clp->cl_net = get_net(cl_init->net);
+ clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
seqlock_init(&clp->cl_boot_lock);
@@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp)
if (!IS_ERR(clp->cl_rpcclient))
rpc_shutdown_client(clp->cl_rpcclient);
- put_net(clp->cl_net);
+ put_net_track(clp->cl_net, &clp->cl_ns_tracker);
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
@@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
spin_unlock(&nn->nfs_client_lock);
new = rpc_ops->init_client(new, cl_init);
if (!IS_ERR(new))
- nfs_local_probe(new);
+ nfs_local_probe_async(new);
return new;
}
@@ -546,6 +546,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
args.flags |= RPC_CLNT_CREATE_NOPING;
if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags))
args.flags |= RPC_CLNT_CREATE_REUSEPORT;
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL;
if (!IS_ERR(clp->cl_rpcclient))
return 0;
@@ -680,6 +682,44 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
}
EXPORT_SYMBOL_GPL(nfs_init_client);
+static void nfs4_server_set_init_caps(struct nfs_server *server)
+{
+#if IS_ENABLED(CONFIG_NFS_V4)
+ /* Set the basic capabilities */
+ server->caps = server->nfs_client->cl_mvops->init_caps;
+ if (server->flags & NFS_MOUNT_NORDIRPLUS)
+ server->caps &= ~NFS_CAP_READDIRPLUS;
+ if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
+ server->caps &= ~NFS_CAP_READ_PLUS;
+
+ /*
+ * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
+ * authentication.
+ */
+ if (nfs4_disable_idmapping &&
+ server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
+ server->caps |= NFS_CAP_UIDGID_NOMAP;
+#endif
+}
+
+void nfs_server_set_init_caps(struct nfs_server *server)
+{
+ switch (server->nfs_client->rpc_ops->version) {
+ case 2:
+ server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+ break;
+ case 3:
+ server->caps = NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
+ if (!(server->flags & NFS_MOUNT_NORDIRPLUS))
+ server->caps |= NFS_CAP_READDIRPLUS;
+ break;
+ default:
+ nfs4_server_set_init_caps(server);
+ break;
+ }
+}
+EXPORT_SYMBOL_GPL(nfs_server_set_init_caps);
+
/*
* Create a version 2 or 3 client
*/
@@ -709,6 +749,9 @@ static int nfs_init_server(struct nfs_server *server,
if (ctx->flags & NFS_MOUNT_NORESVPORT)
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL)
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
+
/* Allocate or find a client reference we can use */
clp = nfs_get_client(&cl_init);
if (IS_ERR(clp))
@@ -721,7 +764,6 @@ static int nfs_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = ctx->flags;
server->options = ctx->options;
- server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS;
switch (clp->rpc_ops->version) {
case 2:
@@ -757,6 +799,8 @@ static int nfs_init_server(struct nfs_server *server,
if (error < 0)
goto error;
+ nfs_server_set_init_caps(server);
+
/* Preserve the values of mount_server-related mount options */
if (ctx->mount_server.addrlen) {
memcpy(&server->mountd_address, &ctx->mount_server.address,
@@ -809,7 +853,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->wsize = max_rpc_payload;
if (server->wsize > NFS_MAX_FILE_IO_SIZE)
server->wsize = NFS_MAX_FILE_IO_SIZE;
- server->wpages = (server->wsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
server->wtmult = nfs_block_bits(fsinfo->wtmult, NULL);
@@ -826,7 +869,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->maxfilesize = fsinfo->maxfilesize;
- server->time_delta = fsinfo->time_delta;
server->change_attr_type = fsinfo->change_attr_type;
server->clone_blksize = fsinfo->clone_blksize;
@@ -931,7 +973,6 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour
target->acregmax = source->acregmax;
target->acdirmin = source->acdirmin;
target->acdirmax = source->acdirmax;
- target->caps = source->caps;
target->options = source->options;
target->auth_info = source->auth_info;
target->port = source->port;
@@ -1002,6 +1043,7 @@ struct nfs_server *nfs_alloc_server(void)
INIT_LIST_HEAD(&server->ss_src_copies);
atomic_set(&server->active, 0);
+ atomic_long_set(&server->nr_active_delegations, 0);
server->io_stats = nfs_alloc_iostats();
if (!server->io_stats) {
@@ -1100,6 +1142,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
server->namelen = NFS2_MAXNAMLEN;
}
+ /* Linux 'subtree_check' borkenness mandates this setting */
+ server->fh_expire_type = NFS_FH_VOL_RENAME;
if (!(fattr->valid & NFS_ATTR_FATTR)) {
error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
@@ -1163,6 +1207,8 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
if (error < 0)
goto out_free_server;
+ nfs_server_set_init_caps(server);
+
/* probe the filesystem info for this server filesystem */
error = nfs_probe_server(server, fh);
if (error < 0)
@@ -1195,6 +1241,10 @@ void nfs_clients_init(struct net *net)
#if IS_ENABLED(CONFIG_NFS_V4)
idr_init(&nn->cb_ident_idr);
#endif
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ INIT_LIST_HEAD(&nn->nfs4_data_server_cache);
+ spin_lock_init(&nn->nfs4_data_server_lock);
+#endif
spin_lock_init(&nn->nfs_client_lock);
nn->boot_time = ktime_get_real();
memset(&nn->rpcstats, 0, sizeof(nn->rpcstats));
@@ -1211,6 +1261,9 @@ void nfs_clients_exit(struct net *net)
nfs_cleanup_cb_ident_idr(net);
WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache));
+#endif
}
#ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 035ba52742a5..9d3a5f29f17f 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -27,8 +27,15 @@
#define NFS_DEFAULT_DELEGATION_WATERMARK (5000U)
-static atomic_long_t nfs_active_delegations;
static unsigned nfs_delegation_watermark = NFS_DEFAULT_DELEGATION_WATERMARK;
+module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644);
+
+static struct hlist_head *nfs_delegation_hash(struct nfs_server *server,
+ const struct nfs_fh *fhandle)
+{
+ return server->delegation_hash_table +
+ (nfs_fhandle_hash(fhandle) & server->delegation_hash_mask);
+}
static void __nfs_free_delegation(struct nfs_delegation *delegation)
{
@@ -37,11 +44,12 @@ static void __nfs_free_delegation(struct nfs_delegation *delegation)
kfree_rcu(delegation, rcu);
}
-static void nfs_mark_delegation_revoked(struct nfs_delegation *delegation)
+static void nfs_mark_delegation_revoked(struct nfs_server *server,
+ struct nfs_delegation *delegation)
{
if (!test_and_set_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
- atomic_long_dec(&nfs_active_delegations);
+ atomic_long_dec(&server->nr_active_delegations);
if (!test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
nfs_clear_verifier_delegated(delegation->inode);
}
@@ -59,9 +67,10 @@ static void nfs_put_delegation(struct nfs_delegation *delegation)
__nfs_free_delegation(delegation);
}
-static void nfs_free_delegation(struct nfs_delegation *delegation)
+static void nfs_free_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
{
- nfs_mark_delegation_revoked(delegation);
+ nfs_mark_delegation_revoked(server, delegation);
nfs_put_delegation(delegation);
}
@@ -79,6 +88,7 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
struct nfs_delegation *delegation)
{
set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
}
@@ -236,34 +246,34 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation != NULL) {
- spin_lock(&delegation->lock);
- nfs4_stateid_copy(&delegation->stateid, stateid);
- delegation->type = type;
- delegation->pagemod_limit = pagemod_limit;
- oldcred = delegation->cred;
- delegation->cred = get_cred(cred);
- switch (deleg_type) {
- case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
- case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
- set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags);
- break;
- default:
- clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags);
- }
- clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
- if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
- &delegation->flags))
- atomic_long_inc(&nfs_active_delegations);
- spin_unlock(&delegation->lock);
- rcu_read_unlock();
- put_cred(oldcred);
- trace_nfs4_reclaim_delegation(inode, type);
- } else {
+ if (!delegation) {
rcu_read_unlock();
nfs_inode_set_delegation(inode, cred, type, stateid,
pagemod_limit, deleg_type);
+ return;
}
+
+ spin_lock(&delegation->lock);
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ oldcred = delegation->cred;
+ delegation->cred = get_cred(cred);
+ switch (deleg_type) {
+ case NFS4_OPEN_DELEGATE_READ_ATTRS_DELEG:
+ case NFS4_OPEN_DELEGATE_WRITE_ATTRS_DELEG:
+ set_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags);
+ break;
+ default:
+ clear_bit(NFS_DELEGATION_DELEGTIME, &delegation->flags);
+ }
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ atomic_long_inc(&NFS_SERVER(inode)->nr_active_delegations);
+ spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ put_cred(oldcred);
+ trace_nfs4_reclaim_delegation(inode, type);
}
static int nfs_do_return_delegation(struct inode *inode,
@@ -306,7 +316,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
if (delegation == NULL)
goto out;
spin_lock(&delegation->lock);
- if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ if (delegation->inode &&
+ !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
/* Refcount matched in nfs_end_delegation_return() */
ret = nfs_get_delegation(delegation);
@@ -330,14 +341,16 @@ nfs_start_delegation_return(struct nfs_inode *nfsi)
}
static void nfs_abort_delegation_return(struct nfs_delegation *delegation,
- struct nfs_client *clp, int err)
+ struct nfs_server *server, int err)
{
-
spin_lock(&delegation->lock);
clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
if (err == -EAGAIN) {
set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
- set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state);
+ set_bit(NFS4SERV_DELEGRETURN_DELAYED,
+ &server->delegation_flags);
+ set_bit(NFS4CLNT_DELEGRETURN_DELAYED,
+ &server->nfs_client->cl_state);
}
spin_unlock(&delegation->lock);
}
@@ -351,6 +364,8 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
rcu_dereference_protected(nfsi->delegation,
lockdep_is_held(&clp->cl_lock));
+ trace_nfs4_detach_delegation(&nfsi->vfs_inode, delegation->type);
+
if (deleg_cur == NULL || delegation != deleg_cur)
return NULL;
@@ -359,6 +374,7 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
spin_unlock(&delegation->lock);
return NULL;
}
+ hlist_del_init_rcu(&delegation->hash);
list_del_rcu(&delegation->super_list);
delegation->inode = NULL;
rcu_assign_pointer(nfsi->delegation, NULL);
@@ -406,7 +422,8 @@ nfs_update_delegation_cred(struct nfs_delegation *delegation,
}
static void
-nfs_update_inplace_delegation(struct nfs_delegation *delegation,
+nfs_update_inplace_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation,
const struct nfs_delegation *update)
{
if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
@@ -419,7 +436,7 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
nfs_update_delegation_cred(delegation, update->cred);
/* smp_mb__before_atomic() is implicit due to xchg() */
clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
- atomic_long_inc(&nfs_active_delegations);
+ atomic_long_inc(&server->nr_active_delegations);
}
}
}
@@ -474,7 +491,7 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred,
if (nfs4_stateid_match_other(&old_delegation->stateid,
&delegation->stateid)) {
spin_lock(&old_delegation->lock);
- nfs_update_inplace_delegation(old_delegation,
+ nfs_update_inplace_delegation(server, old_delegation,
delegation);
spin_unlock(&old_delegation->lock);
goto out;
@@ -520,10 +537,12 @@ add_new:
spin_unlock(&inode->i_lock);
list_add_tail_rcu(&delegation->super_list, &server->delegations);
+ hlist_add_head_rcu(&delegation->hash,
+ nfs_delegation_hash(server, &NFS_I(inode)->fh));
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
- atomic_long_inc(&nfs_active_delegations);
+ atomic_long_inc(&server->nr_active_delegations);
trace_nfs4_set_delegation(inode, type);
@@ -537,7 +556,7 @@ out:
__nfs_free_delegation(delegation);
if (freeme != NULL) {
nfs_do_return_delegation(inode, freeme, 0);
- nfs_free_delegation(freeme);
+ nfs_free_delegation(server, freeme);
}
return status;
}
@@ -547,7 +566,7 @@ out:
*/
static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
{
- struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_server *server = NFS_SERVER(inode);
unsigned int mode = O_WRONLY | O_RDWR;
int err = 0;
@@ -569,11 +588,11 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
/*
* Guard against state recovery
*/
- err = nfs4_wait_clnt_recover(clp);
+ err = nfs4_wait_clnt_recover(server->nfs_client);
}
if (err) {
- nfs_abort_delegation_return(delegation, clp, err);
+ nfs_abort_delegation_return(delegation, server, err);
goto out;
}
@@ -588,19 +607,10 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
{
bool ret = false;
+ trace_nfs_delegation_need_return(delegation);
+
if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
ret = true;
- else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) {
- struct inode *inode;
-
- spin_lock(&delegation->lock);
- inode = delegation->inode;
- if (inode && list_empty(&NFS_I(inode)->open_files))
- ret = true;
- spin_unlock(&delegation->lock);
- }
- if (ret)
- clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
@@ -619,6 +629,9 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server,
struct nfs_delegation *place_holder_deleg = NULL;
int err = 0;
+ if (!test_and_clear_bit(NFS4SERV_DELEGRETURN,
+ &server->delegation_flags))
+ return 0;
restart:
/*
* To avoid quadratic looping we hold a reference
@@ -670,6 +683,7 @@ restart:
cond_resched();
if (!err)
goto restart;
+ set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
goto out;
}
@@ -684,6 +698,9 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
struct nfs_delegation *d;
bool ret = false;
+ if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED,
+ &server->delegation_flags))
+ goto out;
list_for_each_entry_rcu (d, &server->delegations, super_list) {
if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags))
continue;
@@ -691,6 +708,7 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags);
ret = true;
}
+out:
return ret;
}
@@ -750,7 +768,7 @@ void nfs_inode_evict_delegation(struct inode *inode)
set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
set_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags);
nfs_do_return_delegation(inode, delegation, 1);
- nfs_free_delegation(delegation);
+ nfs_free_delegation(NFS_SERVER(inode), delegation);
}
}
@@ -781,6 +799,43 @@ int nfs4_inode_return_delegation(struct inode *inode)
}
/**
+ * nfs4_inode_set_return_delegation_on_close - asynchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine is called to request that the delegation be returned as soon
+ * as the file is closed. If the file is already closed, the delegation is
+ * immediately returned.
+ */
+void nfs4_inode_set_return_delegation_on_close(struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+ struct nfs_delegation *ret = NULL;
+
+ if (!inode)
+ return;
+ rcu_read_lock();
+ delegation = nfs4_get_valid_delegation(inode);
+ if (!delegation)
+ goto out;
+ spin_lock(&delegation->lock);
+ if (!delegation->inode)
+ goto out_unlock;
+ if (list_empty(&NFS_I(inode)->open_files) &&
+ !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+ /* Refcount matched in nfs_end_delegation_return() */
+ ret = nfs_get_delegation(delegation);
+ } else
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+out_unlock:
+ spin_unlock(&delegation->lock);
+ if (ret)
+ nfs_clear_verifier_delegated(inode);
+out:
+ rcu_read_unlock();
+ nfs_end_delegation_return(inode, ret, 0);
+}
+
+/**
* nfs4_inode_return_delegation_on_close - asynchronously return a delegation
* @inode: inode to process
*
@@ -799,7 +854,8 @@ void nfs4_inode_return_delegation_on_close(struct inode *inode)
if (!delegation)
goto out;
if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) ||
- atomic_long_read(&nfs_active_delegations) >= nfs_delegation_watermark) {
+ atomic_long_read(&NFS_SERVER(inode)->nr_active_delegations) >=
+ nfs_delegation_watermark) {
spin_lock(&delegation->lock);
if (delegation->inode &&
list_empty(&NFS_I(inode)->open_files) &&
@@ -841,11 +897,25 @@ int nfs4_inode_make_writeable(struct inode *inode)
return nfs4_inode_return_delegation(inode);
}
-static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
- struct nfs_delegation *delegation)
+static void
+nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+ struct nfs_delegation *delegation)
{
- set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
- set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ struct inode *inode;
+
+ if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags))
+ return;
+ spin_lock(&delegation->lock);
+ inode = delegation->inode;
+ if (!inode)
+ goto out;
+ if (list_empty(&NFS_I(inode)->open_files))
+ nfs_mark_return_delegation(server, delegation);
+ else
+ set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+out:
+ spin_unlock(&delegation->lock);
}
static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
@@ -961,7 +1031,7 @@ static void nfs_revoke_delegation(struct inode *inode,
}
spin_unlock(&delegation->lock);
}
- nfs_mark_delegation_revoked(delegation);
+ nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
ret = true;
out:
rcu_read_unlock();
@@ -969,13 +1039,6 @@ out:
nfs_inode_find_state_and_recover(inode, stateid);
}
-void nfs_remove_bad_delegation(struct inode *inode,
- const nfs4_stateid *stateid)
-{
- nfs_revoke_delegation(inode, stateid);
-}
-EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
-
void nfs_delegation_mark_returned(struct inode *inode,
const nfs4_stateid *stateid)
{
@@ -1000,7 +1063,7 @@ void nfs_delegation_mark_returned(struct inode *inode,
delegation->stateid.seqid = stateid->seqid;
}
- nfs_mark_delegation_revoked(delegation);
+ nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
spin_unlock(&delegation->lock);
if (nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode)))
@@ -1018,6 +1081,24 @@ out_rcu_unlock:
}
/**
+ * nfs_remove_bad_delegation - handle delegations that are unusable
+ * @inode: inode to process
+ * @stateid: the delegation's stateid
+ *
+ * If the server ACK-ed our FREE_STATEID then clean
+ * up the delegation, else mark and keep the revoked state.
+ */
+void nfs_remove_bad_delegation(struct inode *inode,
+ const nfs4_stateid *stateid)
+{
+ if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE)
+ nfs_delegation_mark_returned(inode, stateid);
+ else
+ nfs_revoke_delegation(inode, stateid);
+}
+EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
+
+/**
* nfs_expire_unused_delegation_types
* @clp: client to process
* @flags: delegation types to expire
@@ -1095,11 +1176,12 @@ static struct inode *
nfs_delegation_find_inode_server(struct nfs_server *server,
const struct nfs_fh *fhandle)
{
+ struct hlist_head *head = nfs_delegation_hash(server, fhandle);
struct nfs_delegation *delegation;
struct super_block *freeme = NULL;
struct inode *res = NULL;
- list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ hlist_for_each_entry_rcu(delegation, head, hash) {
spin_lock(&delegation->lock);
if (delegation->inode != NULL &&
!test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
@@ -1202,7 +1284,7 @@ restart:
if (delegation != NULL) {
if (nfs_detach_delegation(NFS_I(inode), delegation,
server) != NULL)
- nfs_free_delegation(delegation);
+ nfs_free_delegation(server, delegation);
/* Match nfs_start_delegation_return_locked */
nfs_put_delegation(delegation);
}
@@ -1239,6 +1321,7 @@ static void nfs_mark_test_expired_delegation(struct nfs_server *server,
return;
clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags);
set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
}
@@ -1317,6 +1400,9 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
nfs4_stateid stateid;
unsigned long gen = ++server->delegation_gen;
+ if (!test_and_clear_bit(NFS4SERV_DELEGATION_EXPIRED,
+ &server->delegation_flags))
+ return 0;
restart:
rcu_read_lock();
list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
@@ -1346,6 +1432,9 @@ restart:
goto restart;
}
nfs_inode_mark_test_expired_delegation(server,inode);
+ set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags);
+ set_bit(NFS4CLNT_DELEGATION_EXPIRED,
+ &server->nfs_client->cl_state);
iput(inode);
return -EAGAIN;
}
@@ -1500,4 +1589,17 @@ out:
return ret;
}
-module_param_named(delegation_watermark, nfs_delegation_watermark, uint, 0644);
+int nfs4_delegation_hash_alloc(struct nfs_server *server)
+{
+ int delegation_buckets, i;
+
+ delegation_buckets = roundup_pow_of_two(nfs_delegation_watermark / 16);
+ server->delegation_hash_mask = delegation_buckets - 1;
+ server->delegation_hash_table = kmalloc_array(delegation_buckets,
+ sizeof(*server->delegation_hash_table), GFP_KERNEL);
+ if (!server->delegation_hash_table)
+ return -ENOMEM;
+ for (i = 0; i < delegation_buckets; i++)
+ INIT_HLIST_HEAD(&server->delegation_hash_table[i]);
+ return 0;
+}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 71524d34ed20..08ec2e9c68a4 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -14,6 +14,7 @@
* NFSv4 delegation
*/
struct nfs_delegation {
+ struct hlist_node hash;
struct list_head super_list;
const struct cred *cred;
struct inode *inode;
@@ -49,6 +50,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
unsigned long pagemod_limit, u32 deleg_type);
int nfs4_inode_return_delegation(struct inode *inode);
void nfs4_inode_return_delegation_on_close(struct inode *inode);
+void nfs4_inode_set_return_delegation_on_close(struct inode *inode);
int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
void nfs_inode_evict_delegation(struct inode *inode);
@@ -122,4 +124,6 @@ static inline int nfs_have_delegated_mtime(struct inode *inode)
NFS_DELEGATION_FLAG_TIME);
}
+int nfs4_delegation_hash_alloc(struct nfs_server *server);
+
#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2b04038b0e40..d81217923936 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -666,6 +666,8 @@ static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
{
if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
return false;
+ if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS)
+ return true;
if (ctx->pos == 0 ||
cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
return true;
@@ -1532,7 +1534,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
{
if (NFS_PROTO(dir)->version == 2)
return 0;
- return flags & LOOKUP_EXCL;
+ return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) ==
+ (LOOKUP_CREATE | LOOKUP_EXCL);
}
/*
@@ -1825,9 +1828,7 @@ static void block_revalidate(struct dentry *dentry)
static void unblock_revalidate(struct dentry *dentry)
{
- /* store_release ensures wait_var_event() sees the update */
- smp_store_release(&dentry->d_fsdata, NULL);
- wake_up_var(&dentry->d_fsdata);
+ store_release_wake_up(&dentry->d_fsdata, NULL);
}
/*
@@ -2421,11 +2422,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod);
/*
* See comments for nfs_proc_create regarding failed operations.
*/
-int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct iattr attr;
- int error;
+ struct dentry *ret;
dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
@@ -2434,14 +2435,9 @@ int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
attr.ia_mode = mode | S_IFDIR;
trace_nfs_mkdir_enter(dir, dentry);
- error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
- trace_nfs_mkdir_exit(dir, dentry, error);
- if (error != 0)
- goto out_err;
- return 0;
-out_err:
- d_drop(dentry);
- return error;
+ ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+ trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret));
+ return ret;
}
EXPORT_SYMBOL_GPL(nfs_mkdir);
@@ -2678,6 +2674,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
unblock_revalidate(new_dentry);
}
+static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry,
+ struct dentry *new_dentry)
+{
+ struct nfs_server *server = NFS_SB(old_dentry->d_sb);
+
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ return false;
+ if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE)
+ return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN);
+ return true;
+}
+
/*
* RENAME
* FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2765,7 +2773,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
- if (S_ISREG(old_inode->i_mode))
+ if (S_ISREG(old_inode->i_mode) &&
+ nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry))
nfs_sync_inode(old_inode);
task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
must_unblock ? nfs_unblock_rename : NULL);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f45beea92d03..48d89716193a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -56,6 +56,7 @@
#include <linux/uaccess.h>
#include <linux/atomic.h>
+#include "delegation.h"
#include "internal.h"
#include "iostat.h"
#include "pnfs.h"
@@ -130,6 +131,20 @@ static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
dreq->count = req_start;
}
+static void nfs_direct_file_adjust_size_locked(struct inode *inode,
+ loff_t offset, size_t count)
+{
+ loff_t newsize = offset + (loff_t)count;
+ loff_t oldsize = i_size_read(inode);
+
+ if (newsize > oldsize) {
+ i_size_write(inode, newsize);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
+ trace_nfs_size_grow(inode, newsize);
+ nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+ }
+}
+
/**
* nfs_swap_rw - NFS address space operation for swap I/O
* @iocb: target I/O control block
@@ -272,6 +287,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
nfs_direct_count_bytes(dreq, hdr);
spin_unlock(&dreq->lock);
+ nfs_update_delegated_atime(dreq->inode);
+
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
struct page *page = req->wb_page;
@@ -740,7 +757,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
struct nfs_commit_info cinfo;
- struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+ struct inode *inode = dreq->inode;
int flags = NFS_ODIRECT_DONE;
trace_nfs_direct_write_completion(dreq);
@@ -762,7 +779,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
spin_unlock(&dreq->lock);
+ spin_lock(&inode->i_lock);
+ nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count);
+ nfs_update_delegated_mtime_locked(dreq->inode);
+ spin_unlock(&inode->i_lock);
+
while (!list_empty(&hdr->pages)) {
+ struct nfs_page *req;
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index be686b8e0c54..a10dd5f9d078 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -66,14 +66,21 @@ nfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
{
struct nfs_fattr *fattr = NULL;
struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw);
- size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size;
+ size_t fh_size = offsetof(struct nfs_fh, data);
const struct nfs_rpc_ops *rpc_ops;
struct dentry *dentry;
struct inode *inode;
- int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size);
+ int len = EMBED_FH_OFF;
u32 *p = fid->raw;
int ret;
+ /* Initial check of bounds */
+ if (fh_len < len + XDR_QUADLEN(fh_size) ||
+ fh_len > XDR_QUADLEN(NFS_MAXFHSIZE))
+ return NULL;
+ /* Calculate embedded filehandle size */
+ fh_size += server_fh->size;
+ len += XDR_QUADLEN(fh_size);
/* NULL translates to ESTALE */
if (fh_len < len || fh_type != len)
return NULL;
@@ -154,5 +161,6 @@ const struct export_operations nfs_export_ops = {
EXPORT_OP_CLOSE_BEFORE_UNLINK |
EXPORT_OP_REMOTE_FS |
EXPORT_OP_NOATOMIC_ATTR |
- EXPORT_OP_FLUSH_ON_CLOSE,
+ EXPORT_OP_FLUSH_ON_CLOSE |
+ EXPORT_OP_NOLOCKS,
};
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1bb646752e46..86e36c630f09 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -29,6 +29,7 @@
#include <linux/pagemap.h>
#include <linux/gfp.h>
#include <linux/swap.h>
+#include <linux/compaction.h>
#include <linux/uaccess.h>
#include <linux/filelock.h>
@@ -206,24 +207,25 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
int
-nfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+nfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
int status;
dprintk("NFS: mmap(%pD2)\n", file);
- /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+ /* Note: generic_file_mmap_prepare() returns ENOSYS on nommu systems
* so we call that before revalidating the mapping
*/
- status = generic_file_mmap(file, vma);
+ status = generic_file_mmap_prepare(desc);
if (!status) {
- vma->vm_ops = &nfs_file_vm_ops;
+ desc->vm_ops = &nfs_file_vm_ops;
status = nfs_revalidate_mapping(inode, file->f_mapping);
}
return status;
}
-EXPORT_SYMBOL_GPL(nfs_file_mmap);
+EXPORT_SYMBOL_GPL(nfs_file_mmap_prepare);
/*
* Flush any dirty pages for this process, and check for write errors.
@@ -341,12 +343,14 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio,
* If the writer ends up delaying the write, the writer needs to
* increment the page use counts until he is done with the page.
*/
-static int nfs_write_begin(struct file *file, struct address_space *mapping,
+static int nfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, struct folio **foliop,
void **fsdata)
{
fgf_t fgp = FGP_WRITEBEGIN;
struct folio *folio;
+ struct file *file = iocb->ki_filp;
int once_thru = 0;
int ret;
@@ -376,10 +380,12 @@ start:
return ret;
}
-static int nfs_write_end(struct file *file, struct address_space *mapping,
+static int nfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
+ struct file *file = iocb->ki_filp;
struct nfs_open_context *ctx = nfs_file_open_context(file);
unsigned offset = offset_in_folio(folio, pos);
int status;
@@ -457,7 +463,7 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
/* If the private flag is set, then the folio is not freeable */
if (folio_test_private(folio)) {
if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL ||
- current_is_kswapd())
+ current_is_kswapd() || current_is_kcompactd())
return false;
if (nfs_wb_folio(folio->mapping->host, folio) < 0)
return false;
@@ -898,7 +904,7 @@ const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
- .mmap = nfs_file_mmap,
+ .mmap_prepare = nfs_file_mmap_prepare,
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4fa304fa5bc4..29d9234d5c08 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -76,6 +76,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct page *scratch;
struct list_head dsaddrs;
struct nfs4_pnfs_ds_addr *da;
+ struct net *net = server->nfs_client->cl_net;
/* set up xdr stream */
scratch = alloc_page(gfp_flags);
@@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
goto out_err_free_deviceid;
}
- dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
if (!dsaddr->ds_list[i])
goto out_err_drain_dsaddrs;
trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 98b45b636be3..8dc921d83538 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -762,14 +762,14 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_ff_layout_mirror *mirror;
- struct nfs4_pnfs_ds *ds;
+ struct nfs4_pnfs_ds *ds = ERR_PTR(-EAGAIN);
u32 idx;
/* mirrors are initially sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
- if (!ds)
+ if (IS_ERR(ds))
continue;
if (check_device &&
@@ -777,10 +777,10 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg,
continue;
*best_idx = idx;
- return ds;
+ break;
}
- return NULL;
+ return ds;
}
static struct nfs4_pnfs_ds *
@@ -942,7 +942,7 @@ retry:
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true);
- if (!ds) {
+ if (IS_ERR(ds)) {
if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
pnfs_generic_pg_cleanup(pgio);
@@ -1105,6 +1105,7 @@ static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
}
static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ u32 op_status,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
@@ -1115,32 +1116,42 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
- switch (task->tk_status) {
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
+ switch (op_status) {
+ case NFS4_OK:
+ case NFS4ERR_NXIO:
+ break;
+ case NFSERR_PERM:
+ if (!task->tk_xprt)
+ break;
+ xprt_force_disconnect(task->tk_xprt);
+ goto out_retry;
+ case NFS4ERR_BADSESSION:
+ case NFS4ERR_BADSLOT:
+ case NFS4ERR_BAD_HIGH_SLOT:
+ case NFS4ERR_DEADSESSION:
+ case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case NFS4ERR_SEQ_FALSE_RETRY:
+ case NFS4ERR_SEQ_MISORDERED:
dprintk("%s ERROR %d, Reset session. Exchangeid "
"flags 0x%x\n", __func__, task->tk_status,
clp->cl_exchange_flags);
nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- break;
- case -NFS4ERR_DELAY:
- case -NFS4ERR_GRACE:
+ goto out_retry;
+ case NFS4ERR_DELAY:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ fallthrough;
+ case NFS4ERR_GRACE:
rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
- break;
- case -NFS4ERR_RETRY_UNCACHED_REP:
- break;
+ goto out_retry;
+ case NFS4ERR_RETRY_UNCACHED_REP:
+ goto out_retry;
/* Invalidate Layout errors */
- case -NFS4ERR_PNFS_NO_LAYOUT:
- case -ESTALE: /* mapped NFS4ERR_STALE */
- case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
- case -EISDIR: /* mapped NFS4ERR_ISDIR */
- case -NFS4ERR_FHEXPIRED:
- case -NFS4ERR_WRONG_TYPE:
+ case NFS4ERR_PNFS_NO_LAYOUT:
+ case NFS4ERR_STALE:
+ case NFS4ERR_BADHANDLE:
+ case NFS4ERR_ISDIR:
+ case NFS4ERR_FHEXPIRED:
+ case NFS4ERR_WRONG_TYPE:
dprintk("%s Invalid layout error %d\n", __func__,
task->tk_status);
/*
@@ -1153,11 +1164,20 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
pnfs_destroy_layout(NFS_I(inode));
rpc_wake_up(&tbl->slot_tbl_waitq);
goto reset;
+ default:
+ break;
+ }
+
+ switch (task->tk_status) {
/* RPC connection errors */
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+ return -NFS4ERR_FATAL_IOERROR;
+ fallthrough;
case -ECONNREFUSED:
case -EHOSTDOWN:
case -EHOSTUNREACH:
- case -ENETUNREACH:
case -EIO:
case -ETIMEDOUT:
case -EPIPE:
@@ -1168,26 +1188,56 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
rpc_wake_up(&tbl->slot_tbl_waitq);
- fallthrough;
+ break;
default:
- if (ff_layout_avoid_mds_available_ds(lseg))
- return -NFS4ERR_RESET_TO_PNFS;
-reset:
- dprintk("%s Retry through MDS. Error %d\n", __func__,
- task->tk_status);
- return -NFS4ERR_RESET_TO_MDS;
+ break;
}
+
+ if (ff_layout_avoid_mds_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+
+out_retry:
task->tk_status = 0;
return -EAGAIN;
}
/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ u32 op_status,
+ struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
u32 idx)
{
struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ switch (op_status) {
+ case NFS_OK:
+ case NFSERR_NXIO:
+ break;
+ case NFSERR_PERM:
+ if (!task->tk_xprt)
+ break;
+ xprt_force_disconnect(task->tk_xprt);
+ goto out_retry;
+ case NFSERR_ACCES:
+ case NFSERR_BADHANDLE:
+ case NFSERR_FBIG:
+ case NFSERR_IO:
+ case NFSERR_NOSPC:
+ case NFSERR_ROFS:
+ case NFSERR_STALE:
+ goto out_reset_to_pnfs;
+ case NFSERR_JUKEBOX:
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ goto out_retry;
+ default:
+ break;
+ }
+
switch (task->tk_status) {
/* File access problems. Don't mark the device as unavailable */
case -EACCES:
@@ -1200,12 +1250,18 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
case -EJUKEBOX:
nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
goto out_retry;
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+ return -NFS4ERR_FATAL_IOERROR;
+ fallthrough;
default:
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_delete_deviceid(devid->ld, devid->nfs_client,
&devid->deviceid);
}
+out_reset_to_pnfs:
/* FIXME: Need to prevent infinite looping here. */
return -NFS4ERR_RESET_TO_PNFS;
out_retry:
@@ -1216,6 +1272,7 @@ out_retry:
}
static int ff_layout_async_handle_error(struct rpc_task *task,
+ u32 op_status,
struct nfs4_state *state,
struct nfs_client *clp,
struct pnfs_layout_segment *lseg,
@@ -1234,10 +1291,11 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
switch (vers) {
case 3:
- return ff_layout_async_handle_error_v3(task, lseg, idx);
- case 4:
- return ff_layout_async_handle_error_v4(task, state, clp,
+ return ff_layout_async_handle_error_v3(task, op_status, clp,
lseg, idx);
+ case 4:
+ return ff_layout_async_handle_error_v4(task, op_status, state,
+ clp, lseg, idx);
default:
/* should never happen */
WARN_ON_ONCE(1);
@@ -1264,6 +1322,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -ECONNRESET:
case -EHOSTDOWN:
case -EHOSTUNREACH:
+ case -ENETDOWN:
case -ENETUNREACH:
case -EADDRINUSE:
case -ENOBUFS:
@@ -1289,6 +1348,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
+ case NFS4ERR_PERM:
break;
case NFS4ERR_NXIO:
ff_layout_mark_ds_unreachable(lseg, idx);
@@ -1318,10 +1378,11 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_READ,
task->tk_status);
- trace_ff_layout_read_error(hdr);
+ trace_ff_layout_read_error(hdr, task->tk_status);
}
- err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ err = ff_layout_async_handle_error(task, hdr->res.op_status,
+ hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1337,6 +1398,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
return task->tk_status;
case -EAGAIN:
goto out_eagain;
+ case -NFS4ERR_FATAL_IOERROR:
+ task->tk_status = -EIO;
+ return 0;
}
return 0;
@@ -1488,10 +1552,11 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
hdr->args.offset, hdr->args.count,
&hdr->res.op_status, OP_WRITE,
task->tk_status);
- trace_ff_layout_write_error(hdr);
+ trace_ff_layout_write_error(hdr, task->tk_status);
}
- err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ err = ff_layout_async_handle_error(task, hdr->res.op_status,
+ hdr->args.context->state,
hdr->ds_clp, hdr->lseg,
hdr->pgio_mirror_idx);
@@ -1507,6 +1572,9 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
return task->tk_status;
case -EAGAIN:
return -EAGAIN;
+ case -NFS4ERR_FATAL_IOERROR:
+ task->tk_status = -EIO;
+ return 0;
}
if (hdr->res.verf->committed == NFS_FILE_SYNC ||
@@ -1534,11 +1602,12 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
data->args.offset, data->args.count,
&data->res.op_status, OP_COMMIT,
task->tk_status);
- trace_ff_layout_commit_error(data);
+ trace_ff_layout_commit_error(data, task->tk_status);
}
- err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
- data->lseg, data->ds_commit_index);
+ err = ff_layout_async_handle_error(task, data->res.op_status,
+ NULL, data->ds_clp, data->lseg,
+ data->ds_commit_index);
trace_nfs4_pnfs_commit_ds(data, err);
switch (err) {
@@ -1551,6 +1620,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
case -EAGAIN:
rpc_restart_call_prepare(task);
return -EAGAIN;
+ case -NFS4ERR_FATAL_IOERROR:
+ task->tk_status = -EIO;
+ return 0;
}
ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
@@ -1795,6 +1867,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
u32 idx = hdr->pgio_mirror_idx;
int vers;
struct nfs_fh *fh;
+ bool ds_fatal_error = false;
dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n",
__func__, hdr->inode->i_ino,
@@ -1802,8 +1875,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false);
- if (!ds)
+ if (IS_ERR(ds)) {
+ ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
+ }
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
@@ -1851,7 +1926,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
return PNFS_ATTEMPTED;
out_failed:
- if (ff_layout_avoid_mds_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error)
return PNFS_TRY_AGAIN;
trace_pnfs_mds_fallback_read_pagelist(hdr->inode,
hdr->args.offset, hdr->args.count,
@@ -1873,11 +1948,14 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
int vers;
struct nfs_fh *fh;
u32 idx = hdr->pgio_mirror_idx;
+ bool ds_fatal_error = false;
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
- if (!ds)
+ if (IS_ERR(ds)) {
+ ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds));
goto out_failed;
+ }
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
hdr->inode);
@@ -1928,7 +2006,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
return PNFS_ATTEMPTED;
out_failed:
- if (ff_layout_avoid_mds_available_ds(lseg))
+ if (ff_layout_avoid_mds_available_ds(lseg) && !ds_fatal_error)
return PNFS_TRY_AGAIN;
trace_pnfs_mds_fallback_write_pagelist(hdr->inode,
hdr->args.offset, hdr->args.count,
@@ -1971,7 +2049,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
mirror = FF_LAYOUT_COMP(lseg, idx);
ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true);
- if (!ds)
+ if (IS_ERR(ds))
goto out_err;
ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp,
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e58bedfb1dcc..30365ec782bb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
struct nfs4_pnfs_ds_addr *da;
struct nfs4_ff_layout_ds *new_ds = NULL;
struct nfs4_ff_ds_version *ds_versions = NULL;
+ struct net *net = server->nfs_client->cl_net;
u32 mp_count;
u32 version_count;
__be32 *p;
@@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
for (i = 0; i < mp_count; i++) {
/* multipath ds */
- da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
new_ds->ds_versions = ds_versions;
new_ds->ds_versions_cnt = version_count;
- new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
if (!new_ds->ds)
goto out_err_drain_dsaddrs;
@@ -370,11 +370,11 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
struct nfs4_ff_layout_mirror *mirror,
bool fail_return)
{
- struct nfs4_pnfs_ds *ds = NULL;
+ struct nfs4_pnfs_ds *ds;
struct inode *ino = lseg->pls_layout->plh_inode;
struct nfs_server *s = NFS_SERVER(ino);
unsigned int max_payload;
- int status;
+ int status = -EAGAIN;
if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror))
goto noconnect;
@@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg,
* keep ds_clp even if DS is local, so that if local IO cannot
* proceed somehow, we can fall back to NFS whenever we want.
*/
- nfs_local_probe(ds->ds_clp);
+ nfs_local_probe_async(ds->ds_clp);
max_payload =
nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
NULL);
@@ -418,7 +418,7 @@ noconnect:
ff_layout_send_layouterror(lseg);
if (fail_return || !ff_layout_has_available_ds(lseg))
pnfs_error_mark_layout_for_return(ino, lseg);
- ds = NULL;
+ ds = ERR_PTR(status);
out:
return ds;
}
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index b069385eea17..9e94d18448ff 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -50,6 +50,7 @@ enum nfs_param {
Opt_clientaddr,
Opt_cto,
Opt_alignwrite,
+ Opt_fatal_neterrors,
Opt_fg,
Opt_fscache,
Opt_fscache_flag,
@@ -72,6 +73,8 @@ enum nfs_param {
Opt_posix,
Opt_proto,
Opt_rdirplus,
+ Opt_rdirplus_none,
+ Opt_rdirplus_force,
Opt_rdma,
Opt_resvport,
Opt_retrans,
@@ -93,6 +96,22 @@ enum nfs_param {
Opt_wsize,
Opt_write,
Opt_xprtsec,
+ Opt_cert_serial,
+ Opt_privkey_serial,
+};
+
+enum {
+ Opt_fatal_neterrors_default,
+ Opt_fatal_neterrors_enetunreach,
+ Opt_fatal_neterrors_none,
+};
+
+static const struct constant_table nfs_param_enums_fatal_neterrors[] = {
+ { "default", Opt_fatal_neterrors_default },
+ { "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach },
+ { "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach },
+ { "none", Opt_fatal_neterrors_none },
+ {}
};
enum {
@@ -151,6 +170,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_string("clientaddr", Opt_clientaddr),
fsparam_flag_no("cto", Opt_cto),
fsparam_flag_no("alignwrite", Opt_alignwrite),
+ fsparam_enum("fatal_neterrors", Opt_fatal_neterrors,
+ nfs_param_enums_fatal_neterrors),
fsparam_flag ("fg", Opt_fg),
fsparam_flag_no("fsc", Opt_fscache_flag),
fsparam_string("fsc", Opt_fscache),
@@ -174,7 +195,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_u32 ("port", Opt_port),
fsparam_flag_no("posix", Opt_posix),
fsparam_string("proto", Opt_proto),
- fsparam_flag_no("rdirplus", Opt_rdirplus),
+ fsparam_flag_no("rdirplus", Opt_rdirplus), // rdirplus|nordirplus
+ fsparam_string("rdirplus", Opt_rdirplus), // rdirplus=...
fsparam_flag ("rdma", Opt_rdma),
fsparam_flag_no("resvport", Opt_resvport),
fsparam_u32 ("retrans", Opt_retrans),
@@ -201,6 +223,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
fsparam_enum ("write", Opt_write, nfs_param_enums_write),
fsparam_u32 ("wsize", Opt_wsize),
fsparam_string("xprtsec", Opt_xprtsec),
+ fsparam_s32("cert_serial", Opt_cert_serial),
+ fsparam_s32("privkey_serial", Opt_privkey_serial),
{}
};
@@ -288,6 +312,12 @@ static const struct constant_table nfs_xprtsec_policies[] = {
{}
};
+static const struct constant_table nfs_rdirplus_tokens[] = {
+ { "none", Opt_rdirplus_none },
+ { "force", Opt_rdirplus_force },
+ {}
+};
+
/*
* Sanity-check a server address provided by the mount command.
*
@@ -525,6 +555,32 @@ static int nfs_parse_version_string(struct fs_context *fc,
return 0;
}
+#ifdef CONFIG_KEYS
+static int nfs_tls_key_verify(key_serial_t key_id)
+{
+ struct key *key = key_lookup(key_id);
+ int error = 0;
+
+ if (IS_ERR(key)) {
+ pr_err("key id %08x not found\n", key_id);
+ return PTR_ERR(key);
+ }
+ if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
+ test_bit(KEY_FLAG_INVALIDATED, &key->flags)) {
+ pr_err("key id %08x revoked\n", key_id);
+ error = -EKEYREVOKED;
+ }
+
+ key_put(key);
+ return error;
+}
+#else
+static inline int nfs_tls_key_verify(key_serial_t key_id)
+{
+ return -ENOENT;
+}
+#endif /* CONFIG_KEYS */
+
/*
* Parse a single mount parameter.
*/
@@ -636,10 +692,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
ctx->flags &= ~NFS_MOUNT_NOACL;
break;
case Opt_rdirplus:
- if (result.negated)
+ if (result.negated) {
+ ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS;
ctx->flags |= NFS_MOUNT_NORDIRPLUS;
- else
- ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+ } else if (!param->string) {
+ ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS);
+ } else {
+ switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) {
+ case Opt_rdirplus_none:
+ ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS;
+ ctx->flags |= NFS_MOUNT_NORDIRPLUS;
+ break;
+ case Opt_rdirplus_force:
+ ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+ ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ }
break;
case Opt_sharecache:
if (result.negated)
@@ -766,6 +837,18 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
if (ret < 0)
return ret;
break;
+ case Opt_cert_serial:
+ ret = nfs_tls_key_verify(result.int_32);
+ if (ret < 0)
+ return ret;
+ ctx->xprtsec.cert_serial = result.int_32;
+ break;
+ case Opt_privkey_serial:
+ ret = nfs_tls_key_verify(result.int_32);
+ if (ret < 0)
+ return ret;
+ ctx->xprtsec.privkey_serial = result.int_32;
+ break;
case Opt_proto:
if (!param->string)
@@ -872,6 +955,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
goto out_of_bounds;
ctx->nfs_server.max_connect = result.uint_32;
break;
+ case Opt_fatal_neterrors:
+ trace_nfs_mount_assign(param->key, param->string);
+ switch (result.uint_32) {
+ case Opt_fatal_neterrors_default:
+ if (fc->net_ns != &init_net)
+ ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+ else
+ ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL;
+ break;
+ case Opt_fatal_neterrors_enetunreach:
+ ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+ break;
+ case Opt_fatal_neterrors_none:
+ ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL;
+ break;
+ default:
+ goto out_invalid_value;
+ }
+ break;
case Opt_lookupcache:
trace_nfs_mount_assign(param->key, param->string);
switch (result.uint_32) {
@@ -1651,6 +1753,9 @@ static int nfs_init_fs_context(struct fs_context *fc)
ctx->xprtsec.cert_serial = TLS_NO_CERT;
ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY;
+ if (fc->net_ns != &init_net)
+ ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+
fc->s_iflags |= SB_I_STABLE_WRITES;
}
fc->fs_private = ctx;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index e278a1ad1ca3..8b0785178731 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -367,6 +367,7 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
sreq = netfs->sreq;
if (test_bit(NFS_IOHDR_EOF, &hdr->flags) &&
+ sreq->rreq->origin != NETFS_UNBUFFERED_READ &&
sreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1aa67fca69b2..338ef77ae423 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
{
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
@@ -195,6 +197,7 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
if (!(flags & NFS_INO_REVAL_FORCED))
flags &= ~(NFS_INO_INVALID_MODE |
NFS_INO_INVALID_OTHER |
+ NFS_INO_INVALID_BTIME |
NFS_INO_INVALID_XATTR);
flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE);
}
@@ -520,6 +523,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode_set_atime(inode, 0, 0);
inode_set_mtime(inode, 0, 0);
inode_set_ctime(inode, 0, 0);
+ memset(&nfsi->btime, 0, sizeof(nfsi->btime));
inode_set_iversion_raw(inode, 0);
inode->i_size = 0;
clear_nlink(inode);
@@ -543,6 +547,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode_set_ctime_to_ts(inode, fattr->ctime);
else if (fattr_supported & NFS_ATTR_FATTR_CTIME)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME);
+ if (fattr->valid & NFS_ATTR_FATTR_BTIME)
+ nfsi->btime = fattr->btime;
+ else if (fattr_supported & NFS_ATTR_FATTR_BTIME)
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_BTIME);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
inode_set_iversion_raw(inode, fattr->change_attr);
else
@@ -555,6 +563,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
set_nlink(inode, fattr->nlink);
else if (fattr_supported & NFS_ATTR_FATTR_NLINK)
nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK);
+ else
+ set_nlink(inode, 1);
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
else if (fattr_supported & NFS_ATTR_FATTR_OWNER)
@@ -631,6 +641,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr)
}
}
+static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr)
+{
+ unsigned int cache_flags = 0;
+
+ if (attr->ia_valid & ATTR_MTIME_SET) {
+ struct timespec64 ctime = inode_get_ctime(inode);
+ struct timespec64 mtime = inode_get_mtime(inode);
+ struct timespec64 now;
+ int updated = 0;
+
+ now = inode_set_ctime_current(inode);
+ if (!timespec64_equal(&now, &ctime))
+ updated |= S_CTIME;
+
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ if (!timespec64_equal(&now, &mtime))
+ updated |= S_MTIME;
+
+ inode_maybe_inc_iversion(inode, updated);
+ cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
+ }
+ if (attr->ia_valid & ATTR_ATIME_SET) {
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ cache_flags |= NFS_INO_INVALID_ATIME;
+ }
+ NFS_I(inode)->cache_validity &= ~cache_flags;
+}
+
static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid)
{
enum file_time_flags time_flags = 0;
@@ -699,14 +737,27 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) {
spin_lock(&inode->i_lock);
- nfs_update_timestamps(inode, attr->ia_valid);
+ if (attr->ia_valid & ATTR_MTIME_SET) {
+ nfs_set_timestamps_to_ts(inode, attr);
+ attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET|
+ ATTR_ATIME|ATTR_ATIME_SET);
+ } else {
+ nfs_update_timestamps(inode, attr->ia_valid);
+ attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME);
+ }
spin_unlock(&inode->i_lock);
- attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME);
} else if (nfs_have_delegated_atime(inode) &&
attr->ia_valid & ATTR_ATIME &&
!(attr->ia_valid & ATTR_MTIME)) {
- nfs_update_delegated_atime(inode);
- attr->ia_valid &= ~ATTR_ATIME;
+ if (attr->ia_valid & ATTR_ATIME_SET) {
+ spin_lock(&inode->i_lock);
+ nfs_set_timestamps_to_ts(inode, attr);
+ spin_unlock(&inode->i_lock);
+ attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET);
+ } else {
+ nfs_update_delegated_atime(inode);
+ attr->ia_valid &= ~ATTR_ATIME;
+ }
}
/* Optimization: if the end result is no change, don't RPC */
@@ -886,6 +937,7 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry)
static u32 nfs_get_valid_attrmask(struct inode *inode)
{
+ u64 fattr_valid = NFS_SERVER(inode)->fattr_valid;
unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity);
u32 reply_mask = STATX_INO | STATX_TYPE;
@@ -905,6 +957,9 @@ static u32 nfs_get_valid_attrmask(struct inode *inode)
reply_mask |= STATX_UID | STATX_GID;
if (!(cache_validity & NFS_INO_INVALID_BLOCKS))
reply_mask |= STATX_BLOCKS;
+ if (!(cache_validity & NFS_INO_INVALID_BTIME) &&
+ (fattr_valid & NFS_ATTR_FATTR_BTIME))
+ reply_mask |= STATX_BTIME;
if (!(cache_validity & NFS_INO_INVALID_CHANGE))
reply_mask |= STATX_CHANGE_COOKIE;
return reply_mask;
@@ -915,6 +970,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
{
struct inode *inode = d_inode(path->dentry);
struct nfs_server *server = NFS_SERVER(inode);
+ u64 fattr_valid = server->fattr_valid;
unsigned long cache_validity;
int err = 0;
bool force_sync = query_flags & AT_STATX_FORCE_SYNC;
@@ -925,9 +981,12 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
request_mask &= STATX_TYPE | STATX_MODE | STATX_NLINK | STATX_UID |
STATX_GID | STATX_ATIME | STATX_MTIME | STATX_CTIME |
- STATX_INO | STATX_SIZE | STATX_BLOCKS |
+ STATX_INO | STATX_SIZE | STATX_BLOCKS | STATX_BTIME |
STATX_CHANGE_COOKIE;
+ if (!(fattr_valid & NFS_ATTR_FATTR_BTIME))
+ request_mask &= ~STATX_BTIME;
+
if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) {
if (readdirplus_enabled)
nfs_readdirplus_parent_cache_hit(path->dentry);
@@ -959,7 +1018,7 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
/* Is the user requesting attributes that might need revalidation? */
if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME|
STATX_MTIME|STATX_UID|STATX_GID|
- STATX_SIZE|STATX_BLOCKS|
+ STATX_SIZE|STATX_BLOCKS|STATX_BTIME|
STATX_CHANGE_COOKIE)))
goto out_no_revalidate;
@@ -983,6 +1042,8 @@ int nfs_getattr(struct mnt_idmap *idmap, const struct path *path,
do_update |= cache_validity & NFS_INO_INVALID_OTHER;
if (request_mask & STATX_BLOCKS)
do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
+ if (request_mask & STATX_BTIME)
+ do_update |= cache_validity & NFS_INO_INVALID_BTIME;
if (do_update) {
if (readdirplus_enabled)
@@ -1004,6 +1065,7 @@ out_no_revalidate:
stat->attributes |= STATX_ATTR_CHANGE_MONOTONIC;
if (S_ISDIR(inode->i_mode))
stat->blksize = NFS_SERVER(inode)->dtsize;
+ stat->btime = NFS_I(inode)->btime;
out:
trace_nfs_getattr_exit(inode, err);
return err;
@@ -1898,7 +1960,7 @@ static int nfs_inode_finish_partial_attr_update(const struct nfs_fattr *fattr,
NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME |
NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE |
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_OTHER |
- NFS_INO_INVALID_NLINK;
+ NFS_INO_INVALID_NLINK | NFS_INO_INVALID_BTIME;
unsigned long cache_validity = NFS_I(inode)->cache_validity;
enum nfs4_change_attr_type ctype = NFS_SERVER(inode)->change_attr_type;
@@ -2164,7 +2226,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
bool attr_changed = false;
bool have_delegation;
- dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
+ dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%llx)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
nfs_display_fhandle_hash(NFS_FH(inode)),
atomic_read(&inode->i_count), fattr->valid);
@@ -2259,7 +2321,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
| NFS_INO_INVALID_BLOCKS
| NFS_INO_INVALID_NLINK
| NFS_INO_INVALID_MODE
- | NFS_INO_INVALID_OTHER;
+ | NFS_INO_INVALID_OTHER
+ | NFS_INO_INVALID_BTIME;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
attr_changed = true;
@@ -2293,6 +2356,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->cache_validity |=
save_cache_validity & NFS_INO_INVALID_CTIME;
+ if (fattr->valid & NFS_ATTR_FATTR_BTIME)
+ nfsi->btime = fattr->btime;
+ else if (fattr_supported & NFS_ATTR_FATTR_BTIME)
+ nfsi->cache_validity |=
+ save_cache_validity & NFS_INO_INVALID_BTIME;
+
/* Check if our cached file size is stale */
if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
new_isize = nfs_size_to_loff_t(fattr->size);
@@ -2544,15 +2613,26 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
static int nfs_net_init(struct net *net)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
+ int err;
nfs_clients_init(net);
if (!rpc_proc_register(net, &nn->rpcstats)) {
- nfs_clients_exit(net);
- return -ENOMEM;
+ err = -ENOMEM;
+ goto err_proc_rpc;
}
- return nfs_fs_proc_net_init(net);
+ err = nfs_fs_proc_net_init(net);
+ if (err)
+ goto err_proc_nfs;
+
+ return 0;
+
+err_proc_nfs:
+ rpc_proc_unregister(net, "nfs");
+err_proc_rpc:
+ nfs_clients_exit(net);
+ return err;
}
static void nfs_net_exit(struct net *net)
@@ -2569,6 +2649,35 @@ static struct pernet_operations nfs_net_ops = {
.size = sizeof(struct nfs_net),
};
+#ifdef CONFIG_KEYS
+static struct key *nfs_keyring;
+
+static int __init nfs_init_keyring(void)
+{
+ nfs_keyring = keyring_alloc(".nfs",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ (KEY_USR_ALL & ~KEY_USR_SETATTR),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
+ return PTR_ERR_OR_ZERO(nfs_keyring);
+}
+
+static void nfs_exit_keyring(void)
+{
+ key_put(nfs_keyring);
+}
+#else
+static inline int nfs_init_keyring(void)
+{
+ return 0;
+}
+
+static inline void nfs_exit_keyring(void)
+{
+}
+#endif /* CONFIG_KEYS */
+
/*
* Initialize NFS
*/
@@ -2576,6 +2685,10 @@ static int __init init_nfs_fs(void)
{
int err;
+ err = nfs_init_keyring();
+ if (err)
+ return err;
+
err = nfs_sysfs_init();
if (err < 0)
goto out10;
@@ -2636,6 +2749,7 @@ out7:
out9:
nfs_sysfs_exit();
out10:
+ nfs_exit_keyring();
return err;
}
@@ -2651,6 +2765,7 @@ static void __exit exit_nfs_fs(void)
nfs_fs_proc_exit();
nfsiod_stop();
nfs_sysfs_exit();
+ nfs_exit_keyring();
}
/* Not quite true; I just maintain it */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index fae2c7ae4acc..74d712b58423 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -207,7 +207,6 @@ struct nfs_mount_request {
};
extern int nfs_mount(struct nfs_mount_request *info, int timeo, int retrans);
-extern void nfs_umount(const struct nfs_mount_request *info);
/* client.c */
extern const struct rpc_program nfs_program;
@@ -232,7 +231,7 @@ extern struct nfs_client *
nfs4_find_client_sessionid(struct net *, const struct sockaddr *,
struct nfs4_sessionid *, u32);
extern struct nfs_server *nfs_create_server(struct fs_context *);
-extern void nfs4_server_set_init_caps(struct nfs_server *);
+extern void nfs_server_set_init_caps(struct nfs_server *);
extern struct nfs_server *nfs4_create_server(struct fs_context *);
extern struct nfs_server *nfs4_create_referral_server(struct fs_context *);
extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
@@ -400,8 +399,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
void nfs_d_prune_case_insensitive_aliases(struct inode *inode);
int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, bool);
-int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t);
+struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
int nfs_rmdir(struct inode *, struct dentry *);
int nfs_unlink(struct inode *, struct dentry *);
int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *,
@@ -432,7 +431,7 @@ loff_t nfs_file_llseek(struct file *, loff_t, int);
ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe,
size_t len, unsigned int flags);
-int nfs_file_mmap(struct file *, struct vm_area_struct *);
+int nfs_file_mmap_prepare(struct vm_area_desc *);
ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
@@ -455,7 +454,6 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
/* localio.c */
-extern void nfs_local_probe(struct nfs_client *);
extern void nfs_local_probe_async(struct nfs_client *);
extern void nfs_local_probe_async_work(struct work_struct *);
extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *,
@@ -672,9 +670,12 @@ nfs_write_match_verf(const struct nfs_writeverf *verf,
static inline gfp_t nfs_io_gfp_mask(void)
{
- if (current->flags & PF_WQ_WORKER)
- return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
- return GFP_KERNEL;
+ gfp_t ret = current_gfp_context(GFP_KERNEL);
+
+ /* For workers __GFP_NORETRY only with __GFP_IO or __GFP_FS */
+ if ((current->flags & PF_WQ_WORKER) && ret == GFP_KERNEL)
+ ret |= __GFP_NORETRY | __GFP_NOWARN;
+ return ret;
}
/*
@@ -899,18 +900,16 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
}
-#ifdef CONFIG_CRC32
static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
{
return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
NFS4_STATEID_OTHER_SIZE);
}
-#else
-static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+
+static inline bool nfs_current_task_exiting(void)
{
- return 0;
+ return (current->flags & PF_EXITING) != 0;
}
-#endif
static inline bool nfs_error_is_fatal(int err)
{
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 5c21caeae075..bd5fca285899 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp)
* - called after alloc_client and init_client (so cl_rpcclient exists)
* - this function is idempotent, it can be called for old or new clients
*/
-void nfs_local_probe(struct nfs_client *clp)
+static void nfs_local_probe(struct nfs_client *clp)
{
/* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */
if (!localio_enabled ||
@@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *clp)
nfs_localio_enable_client(clp);
nfs_uuid_end(&clp->cl_uuid);
}
-EXPORT_SYMBOL_GPL(nfs_local_probe);
void nfs_local_probe_async_work(struct work_struct *work)
{
struct nfs_client *clp =
container_of(work, struct nfs_client, cl_local_probe_work);
+ if (!refcount_inc_not_zero(&clp->cl_count))
+ return;
nfs_local_probe(clp);
+ nfs_put_client(clp);
}
void nfs_local_probe_async(struct nfs_client *clp)
@@ -207,14 +209,16 @@ void nfs_local_probe_async(struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs_local_probe_async);
-static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf)
+static inline void nfs_local_file_put(struct nfsd_file *localio)
{
- return nfs_to->nfsd_file_get(nf);
-}
+ /* nfs_to_nfsd_file_put_local() expects an __rcu pointer
+ * but we have a __kernel pointer. It is always safe
+ * to cast a __kernel pointer to an __rcu pointer
+ * because the cast only weakens what is known about the pointer.
+ */
+ struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio;
-static inline void nfs_local_file_put(struct nfsd_file *nf)
-{
- nfs_to->nfsd_file_put(nf);
+ nfs_to_nfsd_file_put_local(&nf);
}
/*
@@ -226,12 +230,13 @@ static inline void nfs_local_file_put(struct nfsd_file *nf)
static struct nfsd_file *
__nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, struct nfs_file_localio *nfl,
+ struct nfsd_file __rcu **pnf,
const fmode_t mode)
{
struct nfsd_file *localio;
localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient,
- cred, fh, nfl, mode);
+ cred, fh, nfl, pnf, mode);
if (IS_ERR(localio)) {
int status = PTR_ERR(localio);
trace_nfs_local_open_fh(fh, mode, status);
@@ -258,7 +263,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
struct nfs_fh *fh, struct nfs_file_localio *nfl,
const fmode_t mode)
{
- struct nfsd_file *nf, *new, __rcu **pnf;
+ struct nfsd_file *nf, __rcu **pnf;
if (!nfs_server_is_local(clp))
return NULL;
@@ -270,29 +275,9 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
else
pnf = &nfl->ro_file;
- new = NULL;
- rcu_read_lock();
- nf = rcu_dereference(*pnf);
- if (!nf) {
- rcu_read_unlock();
- new = __nfs_local_open_fh(clp, cred, fh, nfl, mode);
- if (IS_ERR(new))
- return NULL;
- /* try to swap in the pointer */
- spin_lock(&clp->cl_uuid.lock);
- nf = rcu_dereference_protected(*pnf, 1);
- if (!nf) {
- nf = new;
- new = NULL;
- rcu_assign_pointer(*pnf, nf);
- }
- spin_unlock(&clp->cl_uuid.lock);
- rcu_read_lock();
- }
- nf = nfs_local_file_get(nf);
- rcu_read_unlock();
- if (new)
- nfs_to_nfsd_file_put_local(new);
+ nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode);
+ if (IS_ERR(nf))
+ return NULL;
return nf;
}
EXPORT_SYMBOL_GPL(nfs_local_open_fh);
@@ -515,14 +500,13 @@ nfs_copy_boot_verifier(struct nfs_write_verifier *verifier, struct inode *inode)
{
struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
u32 *verf = (u32 *)verifier->data;
- int seq = 0;
+ unsigned int seq;
do {
- read_seqbegin_or_lock(&clp->cl_boot_lock, &seq);
+ seq = read_seqbegin(&clp->cl_boot_lock);
verf[0] = (u32)clp->cl_nfssvc_boot.tv_sec;
verf[1] = (u32)clp->cl_nfssvc_boot.tv_nsec;
- } while (need_seqretry(&clp->cl_boot_lock, seq));
- done_seqretry(&clp->cl_boot_lock, seq);
+ } while (read_seqretry(&clp->cl_boot_lock, seq));
}
static void
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 57c9dd700b58..db8dfb920394 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -223,74 +223,6 @@ out_mnt_err:
goto out;
}
-/**
- * nfs_umount - Notify a server that we have unmounted this export
- * @info: pointer to umount request arguments
- *
- * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always
- * use UDP.
- */
-void nfs_umount(const struct nfs_mount_request *info)
-{
- static const struct rpc_timeout nfs_umnt_timeout = {
- .to_initval = 1 * HZ,
- .to_maxval = 3 * HZ,
- .to_retries = 2,
- };
- struct rpc_create_args args = {
- .net = info->net,
- .protocol = IPPROTO_UDP,
- .address = (struct sockaddr *)info->sap,
- .addrsize = info->salen,
- .timeout = &nfs_umnt_timeout,
- .servername = info->hostname,
- .program = &mnt_program,
- .version = info->version,
- .authflavor = RPC_AUTH_UNIX,
- .flags = RPC_CLNT_CREATE_NOPING,
- .cred = current_cred(),
- };
- struct rpc_message msg = {
- .rpc_argp = info->dirpath,
- };
- struct rpc_clnt *clnt;
- int status;
-
- if (strlen(info->dirpath) > MNTPATHLEN)
- return;
-
- if (info->noresvport)
- args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
-
- clnt = rpc_create(&args);
- if (IS_ERR(clnt))
- goto out_clnt_err;
-
- dprintk("NFS: sending UMNT request for %s:%s\n",
- (info->hostname ? info->hostname : "server"), info->dirpath);
-
- if (info->version == NFS_MNT3_VERSION)
- msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT];
- else
- msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT];
-
- status = rpc_call_sync(clnt, &msg, 0);
- rpc_shutdown_client(clnt);
-
- if (unlikely(status < 0))
- goto out_call_err;
-
- return;
-
-out_clnt_err:
- dprintk("NFS: failed to create UMNT RPC client, status=%ld\n",
- PTR_ERR(clnt));
- return;
-
-out_call_err:
- dprintk("NFS: UMNT request failed, status=%d\n", status);
-}
-
/*
* XDR encode/decode functions for MOUNT
*/
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 973aed9cc5fe..7f1ec9c67ff2 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -195,7 +195,6 @@ struct vfsmount *nfs_d_automount(struct path *path)
if (IS_ERR(mnt))
goto out_fc;
- mntget(mnt); /* prevent immediate expiration */
if (timeout <= 0)
goto out_fc;
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index a68b21603ea9..6ba3ea39e928 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -31,7 +31,11 @@ struct nfs_net {
unsigned short nfs_callback_tcpport;
unsigned short nfs_callback_tcpport6;
int cb_users[NFS4_MAX_MINOR_VERSION + 1];
-#endif
+#endif /* CONFIG_NFS_V4 */
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+ struct list_head nfs4_data_server_cache;
+ spinlock_t nfs4_data_server_lock;
+#endif /* CONFIG_NFS_V4_1 */
struct nfs_netns_client *nfs_client;
spinlock_t nfs_client_lock;
ktime_t boot_time;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 18d8f6529f61..a126eb31f62f 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu)
switch (status) {
case 0:
- status = nfs_refresh_inode(inode, res.fattr);
+ nfs_refresh_inode(inode, res.fattr);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b0c8a39c2bbd..0d7310c1ee0c 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -120,6 +120,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags))
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0c3bc98cd999..a4cb67573aa7 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
res = -ERESTARTSYS;
- } while (!fatal_signal_pending(current));
+ } while (!fatal_signal_pending(current) && !nfs_current_task_exiting());
return res;
}
@@ -578,13 +578,13 @@ out:
return status;
}
-static int
+static struct dentry *
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
- struct dentry *d_alias;
- int status = -ENOMEM;
+ struct dentry *ret = ERR_PTR(-ENOMEM);
+ int status;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -592,8 +592,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
if (data == NULL)
goto out;
- status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
- if (status)
+ ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode,
+ &default_acl, &acl));
+ if (IS_ERR(ret))
goto out;
data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
@@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
data->arg.mkdir.len = dentry->d_name.len;
data->arg.mkdir.sattr = sattr;
- d_alias = nfs3_do_create(dir, dentry, data);
- status = PTR_ERR_OR_ZERO(d_alias);
+ ret = nfs3_do_create(dir, dentry, data);
- if (status != 0)
+ if (IS_ERR(ret))
goto out_release_acls;
- if (d_alias)
- dentry = d_alias;
+ if (ret)
+ dentry = ret;
status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+ if (status) {
+ dput(ret);
+ ret = ERR_PTR(status);
+ }
- dput(d_alias);
out_release_acls:
posix_acl_release(acl);
posix_acl_release(default_acl);
out:
nfs3_free_createdata(data);
- dprintk("NFS reply mkdir: %d\n", status);
- return status;
+ dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret));
+ return ret;
}
static int
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 0282d93c8bcc..aafd15a4afce 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t);
ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t,
struct nl4_server *, nfs4_stateid *, bool);
int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
+int nfs42_proc_zero_range(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 1924c4a2077b..01c01f45358b 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -21,6 +21,8 @@
#define NFSDBG_FACILITY NFSDBG_PROC
static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
+static int nfs42_proc_offload_status(struct file *file, nfs4_stateid *stateid,
+ u64 *copied);
static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr)
{
@@ -144,7 +146,8 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == -EOPNOTSUPP)
- NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
+ NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE |
+ NFS_CAP_ZERO_RANGE);
inode_unlock(inode);
return err;
@@ -167,12 +170,50 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
- NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
+ NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE |
+ NFS_CAP_ZERO_RANGE);
inode_unlock(inode);
return err;
}
+int nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE],
+ };
+ struct inode *inode = file_inode(filep);
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+
+ err = nfs42_proc_fallocate(&msg, filep, offset, len);
+ if (err == 0)
+ truncate_pagecache_range(inode, offset, (offset + len) -1);
+ if (err == -EOPNOTSUPP)
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE;
+
+ inode_unlock(inode);
+ return err;
+}
+
+static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server,
+ struct nfs_server *src_server,
+ struct nfs4_copy_state *copy)
+{
+ spin_lock(&dst_server->nfs_client->cl_lock);
+ list_del_init(&copy->copies);
+ spin_unlock(&dst_server->nfs_client->cl_lock);
+ if (dst_server != src_server) {
+ spin_lock(&src_server->nfs_client->cl_lock);
+ list_del_init(&copy->src_copies);
+ spin_unlock(&src_server->nfs_client->cl_lock);
+ }
+}
+
static int handle_async_copy(struct nfs42_copy_res *res,
struct nfs_server *dst_server,
struct nfs_server *src_server,
@@ -182,9 +223,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
bool *restart)
{
struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
- int status = NFS4_OK;
struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
struct nfs_open_context *src_ctx = nfs_file_open_context(src);
+ struct nfs_client *clp = dst_server->nfs_client;
+ unsigned long timeout = 3 * HZ;
+ int status = NFS4_OK;
+ u64 copied;
copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
if (!copy)
@@ -222,15 +266,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
spin_unlock(&src_server->nfs_client->cl_lock);
}
- status = wait_for_completion_interruptible(&copy->completion);
- spin_lock(&dst_server->nfs_client->cl_lock);
- list_del_init(&copy->copies);
- spin_unlock(&dst_server->nfs_client->cl_lock);
- if (dst_server != src_server) {
- spin_lock(&src_server->nfs_client->cl_lock);
- list_del_init(&copy->src_copies);
- spin_unlock(&src_server->nfs_client->cl_lock);
- }
+wait:
+ status = wait_for_completion_interruptible_timeout(&copy->completion,
+ timeout);
+ if (!status)
+ goto timeout;
+ nfs4_copy_dequeue_callback(dst_server, src_server, copy);
if (status == -ERESTARTSYS) {
goto out_cancel;
} else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
@@ -240,6 +281,7 @@ static int handle_async_copy(struct nfs42_copy_res *res,
}
out:
res->write_res.count = copy->count;
+ /* Copy out the updated write verifier provided by CB_OFFLOAD. */
memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
status = -copy->error;
@@ -251,6 +293,39 @@ out_cancel:
if (!nfs42_files_from_same_server(src, dst))
nfs42_do_offload_cancel_async(src, src_stateid);
goto out_free;
+timeout:
+ timeout <<= 1;
+ if (timeout > (clp->cl_lease_time >> 1))
+ timeout = clp->cl_lease_time >> 1;
+ status = nfs42_proc_offload_status(dst, &copy->stateid, &copied);
+ if (status == -EINPROGRESS)
+ goto wait;
+ nfs4_copy_dequeue_callback(dst_server, src_server, copy);
+ switch (status) {
+ case 0:
+ /* The server recognized the copy stateid, so it hasn't
+ * rebooted. Don't overwrite the verifier returned in the
+ * COPY result. */
+ res->write_res.count = copied;
+ goto out_free;
+ case -EREMOTEIO:
+ /* COPY operation failed on the server. */
+ status = -EOPNOTSUPP;
+ res->write_res.count = copied;
+ goto out_free;
+ case -EBADF:
+ /* Server did not recognize the copy stateid. It has
+ * probably restarted and lost the plot. */
+ res->write_res.count = 0;
+ status = -EOPNOTSUPP;
+ break;
+ case -EOPNOTSUPP:
+ /* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when
+ * it has signed up for an async COPY, so server is not
+ * spec-compliant. */
+ res->write_res.count = 0;
+ }
+ goto out_free;
}
static int process_copy_commit(struct file *dst, loff_t pos_dst,
@@ -582,6 +657,108 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
return status;
}
+static int
+_nfs42_proc_offload_status(struct nfs_server *server, struct file *file,
+ struct nfs42_offload_data *data)
+{
+ struct nfs_open_context *ctx = nfs_file_open_context(file);
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_STATUS],
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = ctx->cred,
+ };
+ int status;
+
+ status = nfs4_call_sync(server->client, server, &msg,
+ &data->args.osa_seq_args,
+ &data->res.osr_seq_res, 1);
+ trace_nfs4_offload_status(&data->args, status);
+ switch (status) {
+ case 0:
+ break;
+
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ /*
+ * Server does not recognize the COPY stateid. CB_OFFLOAD
+ * could have purged it, or server might have rebooted.
+ * Since COPY stateids don't have an associated inode,
+ * avoid triggering state recovery.
+ */
+ status = -EBADF;
+ break;
+ case -NFS4ERR_NOTSUPP:
+ case -ENOTSUPP:
+ case -EOPNOTSUPP:
+ server->caps &= ~NFS_CAP_OFFLOAD_STATUS;
+ status = -EOPNOTSUPP;
+ break;
+ }
+
+ return status;
+}
+
+/**
+ * nfs42_proc_offload_status - Poll completion status of an async copy operation
+ * @dst: handle of file being copied into
+ * @stateid: copy stateid (from async COPY result)
+ * @copied: OUT: number of bytes copied so far
+ *
+ * Return values:
+ * %0: Server returned an NFS4_OK completion status
+ * %-EINPROGRESS: Server returned no completion status
+ * %-EREMOTEIO: Server returned an error completion status
+ * %-EBADF: Server did not recognize the copy stateid
+ * %-EOPNOTSUPP: Server does not support OFFLOAD_STATUS
+ * %-ERESTARTSYS: Wait interrupted by signal
+ *
+ * Other negative errnos indicate the client could not complete the
+ * request.
+ */
+static int
+nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied)
+{
+ struct inode *inode = file_inode(dst);
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs4_exception exception = {
+ .inode = inode,
+ };
+ struct nfs42_offload_data *data;
+ int status;
+
+ if (!(server->caps & NFS_CAP_OFFLOAD_STATUS))
+ return -EOPNOTSUPP;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+ data->seq_server = server;
+ data->args.osa_src_fh = NFS_FH(inode);
+ memcpy(&data->args.osa_stateid, stateid,
+ sizeof(data->args.osa_stateid));
+ exception.stateid = &data->args.osa_stateid;
+ do {
+ status = _nfs42_proc_offload_status(server, dst, data);
+ if (status == -EOPNOTSUPP)
+ goto out;
+ status = nfs4_handle_exception(server, status, &exception);
+ } while (exception.retry);
+ if (status)
+ goto out;
+
+ *copied = data->res.osr_count;
+ if (!data->res.complete_count)
+ status = -EINPROGRESS;
+ else if (data->res.osr_complete != NFS_OK)
+ status = -EREMOTEIO;
+
+out:
+ kfree(data);
+ return status;
+}
+
static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
struct nfs42_copy_notify_args *args,
struct nfs42_copy_notify_res *res)
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 5072d7ea72e9..4cc915d5741d 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -35,6 +35,11 @@
#define encode_offload_cancel_maxsz (op_encode_hdr_maxsz + \
XDR_QUADLEN(NFS4_STATEID_SIZE))
#define decode_offload_cancel_maxsz (op_decode_hdr_maxsz)
+#define encode_offload_status_maxsz (op_encode_hdr_maxsz + \
+ XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_offload_status_maxsz (op_decode_hdr_maxsz + \
+ 2 /* osr_count */ + \
+ 2 /* osr_complete */)
#define encode_copy_notify_maxsz (op_encode_hdr_maxsz + \
XDR_QUADLEN(NFS4_STATEID_SIZE) + \
1 + /* nl4_type */ \
@@ -143,6 +148,14 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_offload_cancel_maxsz)
+#define NFS4_enc_offload_status_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_offload_status_maxsz)
+#define NFS4_dec_offload_status_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_offload_status_maxsz)
#define NFS4_enc_copy_notify_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -161,6 +174,18 @@
decode_putfh_maxsz + \
decode_deallocate_maxsz + \
decode_getattr_maxsz)
+#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_deallocate_maxsz + \
+ encode_allocate_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_deallocate_maxsz + \
+ decode_allocate_maxsz + \
+ decode_getattr_maxsz)
#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -345,6 +370,14 @@ static void encode_offload_cancel(struct xdr_stream *xdr,
encode_nfs4_stateid(xdr, &args->osa_stateid);
}
+static void encode_offload_status(struct xdr_stream *xdr,
+ const struct nfs42_offload_status_args *args,
+ struct compound_hdr *hdr)
+{
+ encode_op_hdr(xdr, OP_OFFLOAD_STATUS, decode_offload_status_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->osa_stateid);
+}
+
static void encode_copy_notify(struct xdr_stream *xdr,
const struct nfs42_copy_notify_args *args,
struct compound_hdr *hdr)
@@ -570,6 +603,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req,
}
/*
+ * Encode OFFLOAD_STATUS request
+ */
+static void nfs4_xdr_enc_offload_status(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_offload_status_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->osa_seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->osa_seq_args, &hdr);
+ encode_putfh(xdr, args->osa_src_fh, &hdr);
+ encode_offload_status(xdr, args, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode COPY_NOTIFY request
*/
static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req,
@@ -609,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req,
}
/*
+ * Encode ZERO_RANGE request
+ */
+static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const void *data)
+{
+ const struct nfs42_falloc_args *args = data;
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->falloc_fh, &hdr);
+ encode_deallocate(xdr, args, &hdr);
+ encode_allocate(xdr, args, &hdr);
+ encode_getfattr(xdr, args->falloc_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
+/*
* Encode READ_PLUS request
*/
static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req,
@@ -921,6 +994,26 @@ static int decode_offload_cancel(struct xdr_stream *xdr,
return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL);
}
+static int decode_offload_status(struct xdr_stream *xdr,
+ struct nfs42_offload_status_res *res)
+{
+ ssize_t result;
+ int status;
+
+ status = decode_op_hdr(xdr, OP_OFFLOAD_STATUS);
+ if (status)
+ return status;
+ /* osr_count */
+ if (xdr_stream_decode_u64(xdr, &res->osr_count) < 0)
+ return -EIO;
+ /* osr_complete<1> */
+ result = xdr_stream_decode_uint32_array(xdr, &res->osr_complete, 1);
+ if (result < 0)
+ return -EIO;
+ res->complete_count = result;
+ return 0;
+}
+
static int decode_copy_notify(struct xdr_stream *xdr,
struct nfs42_copy_notify_res *res)
{
@@ -1371,6 +1464,32 @@ out:
}
/*
+ * Decode OFFLOAD_STATUS response
+ */
+static int nfs4_xdr_dec_offload_status(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_offload_status_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->osr_seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_offload_status(xdr, res);
+
+out:
+ return status;
+}
+
+/*
* Decode COPY_NOTIFY response
*/
static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp,
@@ -1425,6 +1544,37 @@ out:
}
/*
+ * Decode ZERO_RANGE request
+ */
+static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ void *data)
+{
+ struct nfs42_falloc_res *res = data;
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_deallocate(xdr, res);
+ if (status)
+ goto out;
+ status = decode_allocate(xdr, res);
+ if (status)
+ goto out;
+ decode_getfattr(xdr, res->falloc_fattr, res->falloc_server);
+out:
+ return status;
+}
+
+/*
* Decode READ_PLUS request
*/
static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7d383d29a995..c34c89af9c7d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -63,12 +63,11 @@ struct nfs4_minor_version_ops {
bool (*match_stateid)(const nfs4_stateid *,
const nfs4_stateid *);
int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
- struct nfs_fsinfo *);
+ struct nfs_fattr *);
void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
int (*test_and_free_expired)(struct nfs_server *,
- const nfs4_stateid *,
- const struct cred *);
+ nfs4_stateid *, const struct cred *);
struct nfs_seqid *
(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
void (*session_trunk)(struct rpc_clnt *clnt,
@@ -297,7 +296,8 @@ extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int, int);
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, const struct cred *, struct nfs4_setclientid_res *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, const struct cred *);
-extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
+extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *,
+ struct nfs_fattr *, bool);
extern int nfs4_proc_bind_conn_to_session(struct nfs_client *, const struct cred *cred);
extern int nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cred);
extern int nfs4_destroy_clientid(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 83378f69b35e..6fddf43d729c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -233,6 +233,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
if (test_bit(NFS_CS_PNFS, &cl_init->init_flags))
__set_bit(NFS_CS_PNFS, &clp->cl_flags);
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags))
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags);
/*
* Set up the connection to the server before we add add to the
* global list.
@@ -800,6 +802,7 @@ static void nfs4_destroy_server(struct nfs_server *server)
unset_pnfs_layoutdriver(server);
nfs4_purge_state_owners(server, &freeme);
nfs4_free_state_owners(&freeme);
+ kfree(server->delegation_hash_table);
}
/*
@@ -893,52 +896,40 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr,
* Set up an NFS4 client
*/
static int nfs4_set_client(struct nfs_server *server,
- const char *hostname,
- const struct sockaddr_storage *addr,
- const size_t addrlen,
- const char *ip_addr,
- int proto, const struct rpc_timeout *timeparms,
- u32 minorversion, unsigned int nconnect,
- unsigned int max_connect,
- struct net *net,
- struct xprtsec_parms *xprtsec)
+ struct nfs_client_initdata *cl_init)
{
- struct nfs_client_initdata cl_init = {
- .hostname = hostname,
- .addr = addr,
- .addrlen = addrlen,
- .ip_addr = ip_addr,
- .nfs_mod = &nfs_v4,
- .proto = proto,
- .minorversion = minorversion,
- .net = net,
- .timeparms = timeparms,
- .cred = server->cred,
- .xprtsec = *xprtsec,
- };
struct nfs_client *clp;
- if (minorversion == 0)
- __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags);
- else
- cl_init.max_connect = max_connect;
- switch (proto) {
+ cl_init->nfs_mod = &nfs_v4;
+ cl_init->cred = server->cred;
+
+ if (cl_init->minorversion == 0) {
+ __set_bit(NFS_CS_REUSEPORT, &cl_init->init_flags);
+ cl_init->max_connect = 0;
+ }
+
+ switch (cl_init->proto) {
case XPRT_TRANSPORT_RDMA:
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_TCP_TLS:
- cl_init.nconnect = nconnect;
+ break;
+ default:
+ cl_init->nconnect = 0;
}
if (server->flags & NFS_MOUNT_NORESVPORT)
- __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ __set_bit(NFS_CS_NORESVPORT, &cl_init->init_flags);
if (server->options & NFS_OPTION_MIGRATION)
- __set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
+ __set_bit(NFS_CS_MIGRATION, &cl_init->init_flags);
if (test_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status))
- __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
- server->port = rpc_get_port((struct sockaddr *)addr);
+ __set_bit(NFS_CS_TSM_POSSIBLE, &cl_init->init_flags);
+ server->port = rpc_get_port((struct sockaddr *)cl_init->addr);
+
+ if (server->flags & NFS_MOUNT_NETUNREACH_FATAL)
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init);
+ clp = nfs_get_client(cl_init);
if (IS_ERR(clp))
return PTR_ERR(clp);
@@ -1011,6 +1002,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+ if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags))
+ __set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
__set_bit(NFS_CS_PNFS, &cl_init.init_flags);
cl_init.max_connect = NFS_MAX_TRANSPORTS;
@@ -1081,29 +1074,15 @@ static void nfs4_session_limit_xasize(struct nfs_server *server)
#endif
}
-void nfs4_server_set_init_caps(struct nfs_server *server)
-{
- /* Set the basic capabilities */
- server->caps |= server->nfs_client->cl_mvops->init_caps;
- if (server->flags & NFS_MOUNT_NORDIRPLUS)
- server->caps &= ~NFS_CAP_READDIRPLUS;
- if (server->nfs_client->cl_proto == XPRT_TRANSPORT_RDMA)
- server->caps &= ~NFS_CAP_READ_PLUS;
-
- /*
- * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
- * authentication.
- */
- if (nfs4_disable_idmapping &&
- server->client->cl_auth->au_flavor == RPC_AUTH_UNIX)
- server->caps |= NFS_CAP_UIDGID_NOMAP;
-}
-
static int nfs4_server_common_setup(struct nfs_server *server,
struct nfs_fh *mntfh, bool auth_probe)
{
int error;
+ error = nfs4_delegation_hash_alloc(server);
+ if (error)
+ return error;
+
/* data servers support only a subset of NFSv4.1 */
if (is_ds_only_client(server->nfs_client))
return -EPROTONOSUPPORT;
@@ -1111,14 +1090,14 @@ static int nfs4_server_common_setup(struct nfs_server *server,
/* We must ensure the session is initialised first */
error = nfs4_init_session(server->nfs_client);
if (error < 0)
- goto out;
+ return error;
- nfs4_server_set_init_caps(server);
+ nfs_server_set_init_caps(server);
/* Probe the root fh to retrieve its FSID and filehandle */
error = nfs4_get_rootfh(server, mntfh, auth_probe);
if (error < 0)
- goto out;
+ return error;
dprintk("Server FSID: %llx:%llx\n",
(unsigned long long) server->fsid.major,
@@ -1127,7 +1106,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
error = nfs_probe_server(server, mntfh);
if (error < 0)
- goto out;
+ return error;
nfs4_session_limit_rwsize(server);
nfs4_session_limit_xasize(server);
@@ -1138,8 +1117,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
nfs_server_insert_lists(server);
server->mount_time = jiffies;
server->destroy = nfs4_destroy_server;
-out:
- return error;
+ return 0;
}
/*
@@ -1149,6 +1127,19 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
struct rpc_timeout timeparms;
+ struct nfs_client_initdata cl_init = {
+ .hostname = ctx->nfs_server.hostname,
+ .addr = &ctx->nfs_server._address,
+ .addrlen = ctx->nfs_server.addrlen,
+ .ip_addr = ctx->client_address,
+ .proto = ctx->nfs_server.protocol,
+ .minorversion = ctx->minorversion,
+ .net = fc->net_ns,
+ .timeparms = &timeparms,
+ .xprtsec = ctx->xprtsec,
+ .nconnect = ctx->nfs_server.nconnect,
+ .max_connect = ctx->nfs_server.max_connect,
+ };
int error;
nfs_init_timeout_values(&timeparms, ctx->nfs_server.protocol,
@@ -1168,18 +1159,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc)
ctx->selected_flavor = RPC_AUTH_UNIX;
/* Get a client record */
- error = nfs4_set_client(server,
- ctx->nfs_server.hostname,
- &ctx->nfs_server._address,
- ctx->nfs_server.addrlen,
- ctx->client_address,
- ctx->nfs_server.protocol,
- &timeparms,
- ctx->minorversion,
- ctx->nfs_server.nconnect,
- ctx->nfs_server.max_connect,
- fc->net_ns,
- &ctx->xprtsec);
+ error = nfs4_set_client(server, &cl_init);
if (error < 0)
return error;
@@ -1239,18 +1219,28 @@ error:
struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
{
struct nfs_fs_context *ctx = nfs_fc2context(fc);
- struct nfs_client *parent_client;
- struct nfs_server *server, *parent_server;
- int proto, error;
+ struct nfs_server *parent_server = NFS_SB(ctx->clone_data.sb);
+ struct nfs_client *parent_client = parent_server->nfs_client;
+ struct nfs_client_initdata cl_init = {
+ .hostname = ctx->nfs_server.hostname,
+ .addr = &ctx->nfs_server._address,
+ .addrlen = ctx->nfs_server.addrlen,
+ .ip_addr = parent_client->cl_ipaddr,
+ .minorversion = parent_client->cl_mvops->minor_version,
+ .net = parent_client->cl_net,
+ .timeparms = parent_server->client->cl_timeout,
+ .xprtsec = parent_client->cl_xprtsec,
+ .nconnect = parent_client->cl_nconnect,
+ .max_connect = parent_client->cl_max_connect,
+ };
+ struct nfs_server *server;
bool auth_probe;
+ int error;
server = nfs_alloc_server();
if (!server)
return ERR_PTR(-ENOMEM);
- parent_server = NFS_SB(ctx->clone_data.sb);
- parent_client = parent_server->nfs_client;
-
server->cred = get_cred(parent_server->cred);
/* Initialise the client representation from the parent server */
@@ -1259,38 +1249,17 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc)
/* Get a client representation */
#if IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA)
rpc_set_port(&ctx->nfs_server.address, NFS_RDMA_PORT);
- error = nfs4_set_client(server,
- ctx->nfs_server.hostname,
- &ctx->nfs_server._address,
- ctx->nfs_server.addrlen,
- parent_client->cl_ipaddr,
- XPRT_TRANSPORT_RDMA,
- parent_server->client->cl_timeout,
- parent_client->cl_mvops->minor_version,
- parent_client->cl_nconnect,
- parent_client->cl_max_connect,
- parent_client->cl_net,
- &parent_client->cl_xprtsec);
+ cl_init.proto = XPRT_TRANSPORT_RDMA;
+ error = nfs4_set_client(server, &cl_init);
if (!error)
goto init_server;
#endif /* IS_ENABLED(CONFIG_SUNRPC_XPRT_RDMA) */
- proto = XPRT_TRANSPORT_TCP;
+ cl_init.proto = XPRT_TRANSPORT_TCP;
if (parent_client->cl_xprtsec.policy != RPC_XPRTSEC_NONE)
- proto = XPRT_TRANSPORT_TCP_TLS;
+ cl_init.proto = XPRT_TRANSPORT_TCP_TLS;
rpc_set_port(&ctx->nfs_server.address, NFS_PORT);
- error = nfs4_set_client(server,
- ctx->nfs_server.hostname,
- &ctx->nfs_server._address,
- ctx->nfs_server.addrlen,
- parent_client->cl_ipaddr,
- proto,
- parent_server->client->cl_timeout,
- parent_client->cl_mvops->minor_version,
- parent_client->cl_nconnect,
- parent_client->cl_max_connect,
- parent_client->cl_net,
- &parent_client->cl_xprtsec);
+ error = nfs4_set_client(server, &cl_init);
if (error < 0)
goto error;
@@ -1346,6 +1315,19 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
char buf[INET6_ADDRSTRLEN + 1];
struct sockaddr_storage address;
struct sockaddr *localaddr = (struct sockaddr *)&address;
+ struct nfs_client_initdata cl_init = {
+ .hostname = hostname,
+ .addr = sap,
+ .addrlen = salen,
+ .ip_addr = buf,
+ .proto = clp->cl_proto,
+ .minorversion = clp->cl_minorversion,
+ .net = net,
+ .timeparms = clnt->cl_timeout,
+ .xprtsec = clp->cl_xprtsec,
+ .nconnect = clp->cl_nconnect,
+ .max_connect = clp->cl_max_connect,
+ };
int error;
error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
@@ -1361,11 +1343,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname,
nfs_server_remove_lists(server);
set_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
- error = nfs4_set_client(server, hostname, sap, salen, buf,
- clp->cl_proto, clnt->cl_timeout,
- clp->cl_minorversion,
- clp->cl_nconnect, clp->cl_max_connect,
- net, &clp->cl_xprtsec);
+ error = nfs4_set_client(server, &cl_init);
clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status);
if (error != 0) {
nfs_server_insert_lists(server);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1cd9652f3c28..1d6b5f4230c9 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)))
+ switch (mode) {
+ case 0:
+ case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+ case FALLOC_FL_ZERO_RANGE:
+ break;
+ default:
return -EOPNOTSUPP;
+ }
ret = inode_newsize_ok(inode, offset + len);
if (ret < 0)
@@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
if (mode & FALLOC_FL_PUNCH_HOLE)
return nfs42_proc_deallocate(filep, offset, len);
+ else if (mode & FALLOC_FL_ZERO_RANGE)
+ return nfs42_proc_zero_range(filep, offset ,len);
return nfs42_proc_allocate(filep, offset, len);
}
@@ -245,7 +253,6 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
struct nfs_server *server = NFS_SERVER(dst_inode);
struct inode *src_inode = file_inode(src_file);
unsigned int bs = server->clone_blksize;
- bool same_inode = false;
int ret;
/* NFS does not support deduplication. */
@@ -267,20 +274,8 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
goto out;
}
- if (src_inode == dst_inode)
- same_inode = true;
-
/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
- if (same_inode) {
- inode_lock(src_inode);
- } else if (dst_inode < src_inode) {
- inode_lock_nested(dst_inode, I_MUTEX_PARENT);
- inode_lock_nested(src_inode, I_MUTEX_CHILD);
- } else {
- inode_lock_nested(src_inode, I_MUTEX_PARENT);
- inode_lock_nested(dst_inode, I_MUTEX_CHILD);
- }
-
+ lock_two_nondirectories(src_inode, dst_inode);
/* flush all pending writes on both src and dst so that server
* has the latest data */
ret = nfs_sync_inode(src_inode);
@@ -298,15 +293,7 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
out_unlock:
- if (same_inode) {
- inode_unlock(src_inode);
- } else if (dst_inode < src_inode) {
- inode_unlock(src_inode);
- inode_unlock(dst_inode);
- } else {
- inode_unlock(dst_inode);
- inode_unlock(src_inode);
- }
+ unlock_two_nondirectories(src_inode, dst_inode);
out:
return ret < 0 ? ret : count;
}
@@ -448,7 +435,7 @@ static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease,
const struct file_operations nfs4_file_operations = {
.read_iter = nfs_file_read,
.write_iter = nfs_file_write,
- .mmap = nfs_file_mmap,
+ .mmap_prepare = nfs_file_mmap_prepare,
.open = nfs4_file_open,
.flush = nfs4_file_flush,
.release = nfs_file_release,
diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c
index 1a69479a3a59..e67ea345de69 100644
--- a/fs/nfs/nfs4getroot.c
+++ b/fs/nfs/nfs4getroot.c
@@ -12,30 +12,28 @@
int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_probe)
{
- struct nfs_fsinfo fsinfo;
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
int ret = -ENOMEM;
- fsinfo.fattr = nfs_alloc_fattr();
- if (fsinfo.fattr == NULL)
+ if (fattr == NULL)
goto out;
/* Start by getting the root filehandle from the server */
- ret = nfs4_proc_get_rootfh(server, mntfh, &fsinfo, auth_probe);
+ ret = nfs4_proc_get_rootfh(server, mntfh, fattr, auth_probe);
if (ret < 0) {
dprintk("nfs4_get_rootfh: getroot error = %d\n", -ret);
goto out;
}
- if (!(fsinfo.fattr->valid & NFS_ATTR_FATTR_TYPE)
- || !S_ISDIR(fsinfo.fattr->mode)) {
+ if (!(fattr->valid & NFS_ATTR_FATTR_TYPE) || !S_ISDIR(fattr->mode)) {
printk(KERN_ERR "nfs4_get_rootfh:"
" getroot encountered non-directory\n");
ret = -ENOTDIR;
goto out;
}
- memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid));
+ memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
out:
- nfs_free_fattr(fsinfo.fattr);
+ nfs_free_fattr(fattr);
return ret;
}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 25a7c771cfd8..00932500fce4 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -424,26 +424,16 @@ static void nfs_idmap_pipe_destroy(struct dentry *dir,
struct rpc_pipe_dir_object *pdo)
{
struct idmap *idmap = pdo->pdo_data;
- struct rpc_pipe *pipe = idmap->idmap_pipe;
- if (pipe->dentry) {
- rpc_unlink(pipe->dentry);
- pipe->dentry = NULL;
- }
+ rpc_unlink(idmap->idmap_pipe);
}
static int nfs_idmap_pipe_create(struct dentry *dir,
struct rpc_pipe_dir_object *pdo)
{
struct idmap *idmap = pdo->pdo_data;
- struct rpc_pipe *pipe = idmap->idmap_pipe;
- struct dentry *dentry;
- dentry = rpc_mkpipe_dentry(dir, "idmap", idmap, pipe);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- pipe->dentry = dentry;
- return 0;
+ return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe);
}
static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index df9669d4ded7..7d2b67e06cc3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -105,7 +105,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp,
bool is_privileged);
static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *,
const struct cred *);
-static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
const struct cred *, bool);
#endif
@@ -133,6 +133,7 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
if (err)
return NULL;
+ label->lsmid = shim.id;
label->label = shim.context;
label->len = shim.len;
return label;
@@ -145,7 +146,7 @@ nfs4_label_release_security(struct nfs4_label *label)
if (label) {
shim.context = label->label;
shim.len = label->len;
- shim.id = LSM_ID_UNDEF;
+ shim.id = label->lsmid;
security_release_secctx(&shim);
}
}
@@ -194,6 +195,9 @@ static int nfs4_map_errors(int err)
return -EBUSY;
case -NFS4ERR_NOT_SAME:
return -ENOTSYNC;
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ break;
default:
dprintk("%s could not handle NFSv4 error %d\n",
__func__, -err);
@@ -218,6 +222,7 @@ const u32 nfs4_fattr_bitmap[3] = {
| FATTR4_WORD1_RAWDEV
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_CREATE
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY
| FATTR4_WORD1_MOUNTED_ON_FILEID,
@@ -239,6 +244,7 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
| FATTR4_WORD1_RAWDEV
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_CREATE
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY,
FATTR4_WORD2_MDSTHRESHOLD
@@ -319,16 +325,19 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src,
if (!(cache_validity & NFS_INO_INVALID_OTHER))
dst[1] &= ~(FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP);
+ if (!(cache_validity & NFS_INO_INVALID_BTIME))
+ dst[1] &= ~FATTR4_WORD1_TIME_CREATE;
+
if (nfs_have_delegated_mtime(inode)) {
if (!(cache_validity & NFS_INO_INVALID_ATIME))
- dst[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET);
if (!(cache_validity & NFS_INO_INVALID_MTIME))
- dst[1] &= ~FATTR4_WORD1_TIME_MODIFY;
+ dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET);
if (!(cache_validity & NFS_INO_INVALID_CTIME))
- dst[1] &= ~FATTR4_WORD1_TIME_METADATA;
+ dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET);
} else if (nfs_have_delegated_atime(inode)) {
if (!(cache_validity & NFS_INO_INVALID_ATIME))
- dst[1] &= ~FATTR4_WORD1_TIME_ACCESS;
+ dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET);
}
}
@@ -442,6 +451,8 @@ static int nfs4_delay_killable(long *timeout)
{
might_sleep();
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(nfs4_update_delay(timeout));
if (!__fatal_signal_pending(current))
@@ -453,6 +464,8 @@ static int nfs4_delay_interruptible(long *timeout)
{
might_sleep();
+ if (unlikely(nfs_current_task_exiting()))
+ return -EINTR;
__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
schedule_timeout(nfs4_update_delay(timeout));
if (!signal_pending(current))
@@ -663,6 +676,15 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
struct nfs_client *clp = server->nfs_client;
int ret;
+ if ((task->tk_rpc_status == -ENETDOWN ||
+ task->tk_rpc_status == -ENETUNREACH) &&
+ task->tk_flags & RPC_TASK_NETUNREACH_FATAL) {
+ exception->delay = 0;
+ exception->recovering = 0;
+ exception->retry = 0;
+ return -EIO;
+ }
+
ret = nfs4_do_handle_exception(server, errorcode, exception);
if (exception->delay) {
int ret2 = nfs4_exception_should_retrans(server, exception);
@@ -1290,7 +1312,8 @@ nfs4_update_changeattr_locked(struct inode *inode,
NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL |
NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER |
NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK |
- NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR;
+ NFS_INO_INVALID_MODE | NFS_INO_INVALID_BTIME |
+ NFS_INO_INVALID_XATTR;
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
}
nfsi->attrtimeo_timestamp = jiffies;
@@ -1773,7 +1796,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
rcu_read_unlock();
trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
- if (!fatal_signal_pending(current)) {
+ if (!fatal_signal_pending(current) &&
+ !nfs_current_task_exiting()) {
if (schedule_timeout(5*HZ) == 0)
status = -EAGAIN;
else
@@ -2885,16 +2909,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
}
static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
- const nfs4_stateid *stateid,
- const struct cred *cred)
+ nfs4_stateid *stateid, const struct cred *cred)
{
return -NFS4ERR_BAD_STATEID;
}
#if defined(CONFIG_NFS_V4_1)
static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
- const nfs4_stateid *stateid,
- const struct cred *cred)
+ nfs4_stateid *stateid, const struct cred *cred)
{
int status;
@@ -2903,6 +2925,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
break;
case NFS4_INVALID_STATEID_TYPE:
case NFS4_SPECIAL_STATEID_TYPE:
+ case NFS4_FREED_STATEID_TYPE:
return -NFS4ERR_BAD_STATEID;
case NFS4_REVOKED_STATEID_TYPE:
goto out_free;
@@ -3153,9 +3176,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
if (d_really_is_negative(dentry)) {
struct dentry *alias;
d_drop(dentry);
- alias = d_exact_alias(dentry, state->inode);
- if (!alias)
- alias = d_splice_alias(igrab(state->inode), dentry);
+ alias = d_splice_alias(igrab(state->inode), dentry);
/* d_splice_alias() can't fail here - it's a non-directory */
if (alias) {
dput(ctx->dentry);
@@ -3577,7 +3598,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
write_sequnlock(&state->seqlock);
trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
- if (fatal_signal_pending(current))
+ if (fatal_signal_pending(current) || nfs_current_task_exiting())
status = -EINTR;
else
if (schedule_timeout(5*HZ) != 0)
@@ -3906,8 +3927,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
{
+ struct dentry *dentry = ctx->dentry;
if (ctx->state == NULL)
return;
+ if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+ nfs4_inode_set_return_delegation_on_close(d_inode(dentry));
if (is_sync)
nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx));
else
@@ -3957,8 +3981,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
FATTR4_WORD0_CASE_INSENSITIVE |
FATTR4_WORD0_CASE_PRESERVING;
if (minorversion)
- bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT |
- FATTR4_WORD2_OPEN_ARGUMENTS;
+ bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT;
+ if (minorversion > 1)
+ bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS;
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
@@ -4028,6 +4053,10 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->fattr_valid &= ~NFS_ATTR_FATTR_CTIME;
if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_MTIME;
+ if (!(res.attr_bitmask[1] & FATTR4_WORD1_TIME_CREATE))
+ server->fattr_valid &= ~NFS_ATTR_FATTR_BTIME;
memcpy(server->attr_bitmask_nl, res.attr_bitmask,
sizeof(server->attr_bitmask));
server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
@@ -4063,7 +4092,7 @@ int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
};
int err;
- nfs4_server_set_init_caps(server);
+ nfs_server_set_init_caps(server);
do {
err = nfs4_handle_exception(server,
_nfs4_server_capabilities(server, fhandle),
@@ -4211,15 +4240,18 @@ out:
}
static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info)
+ struct nfs_fattr *fattr)
{
- u32 bitmask[3];
+ u32 bitmask[3] = {
+ [0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE |
+ FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID,
+ };
struct nfs4_lookup_root_arg args = {
.bitmask = bitmask,
};
struct nfs4_lookup_res res = {
.server = server,
- .fattr = info->fattr,
+ .fattr = fattr,
.fh = fhandle,
};
struct rpc_message msg = {
@@ -4228,27 +4260,20 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
.rpc_resp = &res,
};
- bitmask[0] = nfs4_fattr_bitmap[0];
- bitmask[1] = nfs4_fattr_bitmap[1];
- /*
- * Process the label in the upcoming getfattr
- */
- bitmask[2] = nfs4_fattr_bitmap[2] & ~FATTR4_WORD2_SECURITY_LABEL;
-
- nfs_fattr_init(info->fattr);
+ nfs_fattr_init(fattr);
return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
}
static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info)
+ struct nfs_fattr *fattr)
{
struct nfs4_exception exception = {
.interruptible = true,
};
int err;
do {
- err = _nfs4_lookup_root(server, fhandle, info);
- trace_nfs4_lookup_root(server, fhandle, info->fattr, err);
+ err = _nfs4_lookup_root(server, fhandle, fattr);
+ trace_nfs4_lookup_root(server, fhandle, fattr, err);
switch (err) {
case 0:
case -NFS4ERR_WRONGSEC:
@@ -4261,8 +4286,9 @@ out:
return err;
}
-static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info, rpc_authflavor_t flavor)
+static int nfs4_lookup_root_sec(struct nfs_server *server,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr,
+ rpc_authflavor_t flavor)
{
struct rpc_auth_create_args auth_args = {
.pseudoflavor = flavor,
@@ -4272,7 +4298,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
auth = rpcauth_create(&auth_args, server->client);
if (IS_ERR(auth))
return -EACCES;
- return nfs4_lookup_root(server, fhandle, info);
+ return nfs4_lookup_root(server, fhandle, fattr);
}
/*
@@ -4285,7 +4311,7 @@ static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandl
* negative errno value.
*/
static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info)
+ struct nfs_fattr *fattr)
{
/* Per 3530bis 15.33.5 */
static const rpc_authflavor_t flav_array[] = {
@@ -4301,8 +4327,9 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
if (server->auth_info.flavor_len > 0) {
/* try each flavor specified by user */
for (i = 0; i < server->auth_info.flavor_len; i++) {
- status = nfs4_lookup_root_sec(server, fhandle, info,
- server->auth_info.flavors[i]);
+ status = nfs4_lookup_root_sec(
+ server, fhandle, fattr,
+ server->auth_info.flavors[i]);
if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
continue;
break;
@@ -4310,7 +4337,7 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
} else {
/* no flavors specified by user, try default list */
for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
- status = nfs4_lookup_root_sec(server, fhandle, info,
+ status = nfs4_lookup_root_sec(server, fhandle, fattr,
flav_array[i]);
if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
continue;
@@ -4334,28 +4361,22 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
* nfs4_proc_get_rootfh - get file handle for server's pseudoroot
* @server: initialized nfs_server handle
* @fhandle: we fill in the pseudo-fs root file handle
- * @info: we fill in an FSINFO struct
+ * @fattr: we fill in a bare bones struct fattr
* @auth_probe: probe the auth flavours
*
* Returns zero on success, or a negative errno.
*/
int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info,
- bool auth_probe)
+ struct nfs_fattr *fattr, bool auth_probe)
{
int status = 0;
if (!auth_probe)
- status = nfs4_lookup_root(server, fhandle, info);
+ status = nfs4_lookup_root(server, fhandle, fattr);
if (auth_probe || status == NFS4ERR_WRONGSEC)
- status = server->nfs_client->cl_mvops->find_root_sec(server,
- fhandle, info);
-
- if (status == 0)
- status = nfs4_server_capabilities(server, fhandle);
- if (status == 0)
- status = nfs4_do_fsinfo(server, fhandle, info);
+ status = server->nfs_client->cl_mvops->find_root_sec(
+ server, fhandle, fattr);
return nfs4_map_errors(status);
}
@@ -5135,9 +5156,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
&data->arg.seq_args, &data->res.seq_res, 1);
if (status == 0) {
spin_lock(&dir->i_lock);
- /* Creating a directory bumps nlink in the parent */
- if (data->arg.ftype == NF4DIR)
- nfs4_inc_nlink_locked(dir);
nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
data->res.fattr->time_start,
NFS_INO_INVALID_DATA);
@@ -5147,6 +5165,31 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
return status;
}
+static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry,
+ struct nfs4_createdata *data, int *statusp)
+{
+ struct dentry *ret;
+
+ *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+ &data->arg.seq_args, &data->res.seq_res, 1);
+
+ if (*statusp)
+ return NULL;
+
+ spin_lock(&dir->i_lock);
+ /* Creating a directory bumps nlink in the parent */
+ nfs4_inc_nlink_locked(dir);
+ nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+ data->res.fattr->time_start,
+ NFS_INO_INVALID_DATA);
+ spin_unlock(&dir->i_lock);
+ ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+ if (!IS_ERR(ret))
+ return ret;
+ *statusp = PTR_ERR(ret);
+ return NULL;
+}
+
static void nfs4_free_createdata(struct nfs4_createdata *data)
{
nfs4_label_free(data->fattr.label);
@@ -5203,32 +5246,35 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
return err;
}
-static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr, struct nfs4_label *label)
+static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr,
+ struct nfs4_label *label, int *statusp)
{
struct nfs4_createdata *data;
- int status = -ENOMEM;
+ struct dentry *ret = NULL;
+ *statusp = -ENOMEM;
data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
if (data == NULL)
goto out;
data->arg.label = label;
- status = nfs4_do_create(dir, dentry, data);
+ ret = nfs4_do_mkdir(dir, dentry, data, statusp);
nfs4_free_createdata(data);
out:
- return status;
+ return ret;
}
-static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
- struct iattr *sattr)
+static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+ struct iattr *sattr)
{
struct nfs_server *server = NFS_SERVER(dir);
struct nfs4_exception exception = {
.interruptible = true,
};
struct nfs4_label l, *label;
+ struct dentry *alias;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5236,14 +5282,16 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
sattr->ia_mode &= ~current_umask();
do {
- err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+ alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err);
trace_nfs4_mkdir(dir, &dentry->d_name, err);
- err = nfs4_handle_exception(NFS_SERVER(dir), err,
- &exception);
+ if (err)
+ alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir),
+ err,
+ &exception));
} while (exception.retry);
nfs4_label_release_security(label);
- return err;
+ return alias;
}
static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg,
@@ -5735,6 +5783,8 @@ void nfs4_bitmask_set(__u32 bitmask[], const __u32 src[],
bitmask[1] |= FATTR4_WORD1_TIME_MODIFY;
if (cache_validity & NFS_INO_INVALID_BLOCKS)
bitmask[1] |= FATTR4_WORD1_SPACE_USED;
+ if (cache_validity & NFS_INO_INVALID_BTIME)
+ bitmask[1] |= FATTR4_WORD1_TIME_CREATE;
if (cache_validity & NFS_INO_INVALID_SIZE)
bitmask[0] |= FATTR4_WORD0_SIZE;
@@ -6173,6 +6223,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen,
struct nfs_server *server = NFS_SERVER(inode);
int ret;
+ if (unlikely(NFS_FH(inode)->size == 0))
+ return -ENODATA;
if (!nfs4_server_supports_acls(server, type))
return -EOPNOTSUPP;
ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE);
@@ -6247,6 +6299,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf,
{
struct nfs4_exception exception = { };
int err;
+
+ if (unlikely(NFS_FH(inode)->size == 0))
+ return -ENODATA;
do {
err = __nfs4_proc_set_acl(inode, buf, buflen, type);
trace_nfs4_set_acl(inode, err);
@@ -6269,7 +6324,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
size_t buflen)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs4_label label = {0, 0, buflen, buf};
+ struct nfs4_label label = {0, 0, 0, buflen, buf};
u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
struct nfs_fattr fattr = {
@@ -6374,7 +6429,7 @@ static int nfs4_do_set_security_label(struct inode *inode,
static int
nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
{
- struct nfs4_label ilabel = {0, 0, buflen, (char *)buf };
+ struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf };
struct nfs_fattr *fattr;
int status;
@@ -7045,10 +7100,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
struct nfs4_unlockdata *p;
struct nfs4_state *state = lsp->ls_state;
struct inode *inode = state->inode;
+ struct nfs_lock_context *l_ctx;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return NULL;
+ l_ctx = nfs_get_lock_context(ctx);
+ if (!IS_ERR(l_ctx)) {
+ p->l_ctx = l_ctx;
+ } else {
+ kfree(p);
+ return NULL;
+ }
p->arg.fh = NFS_FH(inode);
p->arg.fl = &p->fl;
p->arg.seqid = seqid;
@@ -7056,7 +7119,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->lsp = lsp;
/* Ensure we don't close file until we're done freeing locks! */
p->ctx = get_nfs_open_context(ctx);
- p->l_ctx = nfs_get_lock_context(ctx);
locks_init_lock(&p->fl);
locks_copy_lock(&p->fl, fl);
p->server = NFS_SERVER(inode);
@@ -9573,7 +9635,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
return;
trace_nfs4_sequence(clp, task->tk_status);
- if (task->tk_status < 0 && !task->tk_client->cl_shutdown) {
+ if (task->tk_status < 0 && clp->cl_cons_state >= 0) {
dprintk("%s ERROR %d\n", __func__, task->tk_status);
if (refcount_read(&clp->cl_count) == 1)
return;
@@ -10281,10 +10343,10 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
* Use the state managment nfs_client cl_rpcclient, which uses krb5i (if
* possible) as per RFC3530bis and RFC5661 Security Considerations sections
*/
-static int
-_nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info,
- struct nfs4_secinfo_flavors *flavors, bool use_integrity)
+static int _nfs41_proc_secinfo_no_name(struct nfs_server *server,
+ struct nfs_fh *fhandle,
+ struct nfs4_secinfo_flavors *flavors,
+ bool use_integrity)
{
struct nfs41_secinfo_no_name_args args = {
.style = SECINFO_STYLE_CURRENT_FH,
@@ -10328,9 +10390,9 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int
-nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info, struct nfs4_secinfo_flavors *flavors)
+static int nfs41_proc_secinfo_no_name(struct nfs_server *server,
+ struct nfs_fh *fhandle,
+ struct nfs4_secinfo_flavors *flavors)
{
struct nfs4_exception exception = {
.interruptible = true,
@@ -10342,7 +10404,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
/* try to use integrity protection with machine cred */
if (_nfs4_is_integrity_protected(server->nfs_client))
- err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+ err = _nfs41_proc_secinfo_no_name(server, fhandle,
flavors, true);
/*
@@ -10352,7 +10414,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
* the current filesystem's rpc_client and the user cred.
*/
if (err == -NFS4ERR_WRONGSEC)
- err = _nfs41_proc_secinfo_no_name(server, fhandle, info,
+ err = _nfs41_proc_secinfo_no_name(server, fhandle,
flavors, false);
switch (err) {
@@ -10368,9 +10430,8 @@ out:
return err;
}
-static int
-nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
- struct nfs_fsinfo *info)
+static int nfs41_find_root_sec(struct nfs_server *server,
+ struct nfs_fh *fhandle, struct nfs_fattr *fattr)
{
int err;
struct page *page;
@@ -10386,14 +10447,14 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
}
flavors = page_address(page);
- err = nfs41_proc_secinfo_no_name(server, fhandle, info, flavors);
+ err = nfs41_proc_secinfo_no_name(server, fhandle, flavors);
/*
* Fall back on "guess and check" method if
* the server doesn't support SECINFO_NO_NAME
*/
if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
- err = nfs4_find_root_sec(server, fhandle, info);
+ err = nfs4_find_root_sec(server, fhandle, fattr);
goto out_freepage;
}
if (err)
@@ -10418,8 +10479,8 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
flavor = RPC_AUTH_MAXFLAVOR;
if (flavor != RPC_AUTH_MAXFLAVOR) {
- err = nfs4_lookup_root_sec(server, fhandle,
- info, flavor);
+ err = nfs4_lookup_root_sec(server, fhandle, fattr,
+ flavor);
if (!err)
break;
}
@@ -10566,7 +10627,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
* Note: this function is always asynchronous.
*/
static int nfs41_free_stateid(struct nfs_server *server,
- const nfs4_stateid *stateid,
+ nfs4_stateid *stateid,
const struct cred *cred,
bool privileged)
{
@@ -10606,6 +10667,7 @@ static int nfs41_free_stateid(struct nfs_server *server,
if (IS_ERR(task))
return PTR_ERR(task);
rpc_put_task(task);
+ stateid->type = NFS4_FREED_STATEID_TYPE;
return 0;
}
@@ -10621,6 +10683,8 @@ nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
static bool nfs41_match_stateid(const nfs4_stateid *s1,
const nfs4_stateid *s2)
{
+ trace_nfs41_match_stateid(s1, s2);
+
if (s1->type != s2->type)
return false;
@@ -10638,6 +10702,8 @@ static bool nfs41_match_stateid(const nfs4_stateid *s1,
static bool nfs4_match_stateid(const nfs4_stateid *s1,
const nfs4_stateid *s2)
{
+ trace_nfs4_match_stateid(s1, s2);
+
return nfs4_stateid_match(s1, s2);
}
@@ -10772,12 +10838,14 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_OFFLOAD_CANCEL
| NFS_CAP_COPY_NOTIFY
| NFS_CAP_DEALLOCATE
+ | NFS_CAP_ZERO_RANGE
| NFS_CAP_SEEK
| NFS_CAP_LAYOUTSTATS
| NFS_CAP_CLONE
| NFS_CAP_LAYOUTERROR
| NFS_CAP_READ_PLUS
- | NFS_CAP_MOVEABLE,
+ | NFS_CAP_MOVEABLE
+ | NFS_CAP_OFFLOAD_STATUS,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
@@ -10806,7 +10874,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
- ssize_t error, error2, error3;
+ ssize_t error, error2, error3, error4 = 0;
size_t left = size;
error = generic_listxattr(dentry, list, left);
@@ -10829,8 +10897,18 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left);
if (error3 < 0)
return error3;
+ if (list) {
+ list += error3;
+ left -= error3;
+ }
+
+ if (!nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
+ error4 = security_inode_listsecurity(d_inode(dentry), list, left);
+ if (error4 < 0)
+ return error4;
+ }
- error += error2 + error3;
+ error += error2 + error3 + error4;
if (size && error > size)
return -ERANGE;
return error;
@@ -10882,6 +10960,26 @@ static const struct inode_operations nfs4_file_inode_operations = {
.listxattr = nfs4_listxattr,
};
+static struct nfs_server *nfs4_clone_server(struct nfs_server *source,
+ struct nfs_fh *fh, struct nfs_fattr *fattr,
+ rpc_authflavor_t flavor)
+{
+ struct nfs_server *server;
+ int error;
+
+ server = nfs_clone_server(source, fh, fattr, flavor);
+ if (IS_ERR(server))
+ return server;
+
+ error = nfs4_delegation_hash_alloc(server);
+ if (error) {
+ nfs_free_server(server);
+ return ERR_PTR(error);
+ }
+
+ return server;
+}
+
const struct nfs_rpc_ops nfs_v4_clientops = {
.version = 4, /* protocol version */
.dentry_ops = &nfs4_dentry_operations,
@@ -10934,7 +11032,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.init_client = nfs4_init_client,
.free_client = nfs4_free_client,
.create_server = nfs4_create_server,
- .clone_server = nfs_clone_server,
+ .clone_server = nfs4_clone_server,
.discover_trunking = nfs4_discover_trunking,
.enable_swap = nfs4_enable_swap,
.disable_swap = nfs4_disable_swap,
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 351616c61df5..f9c291e2165c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
}
-#ifdef CONFIG_CRC32
/*
* nfs_session_id_hash - calculate the crc32 hash for the session id
* @session - pointer to session
*/
#define nfs_session_id_hash(sess_id) \
(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
-#else
-#define nfs_session_id_hash(session) (0)
-#endif
#else /* defined(CONFIG_NFS_V4_1) */
static inline int nfs4_init_session(struct nfs_client *clp)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 542cdf71229f..7612e977e80b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1198,7 +1198,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
struct rpc_clnt *clnt = clp->cl_rpcclient;
bool swapon = false;
- if (clnt->cl_shutdown)
+ if (clp->cl_cons_state < 0)
return;
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
@@ -1403,7 +1403,7 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
clp->cl_hostname);
nfs4_schedule_state_manager(clp);
- return 0;
+ return clp->cl_cons_state < 0 ? clp->cl_cons_state : 0;
}
EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
@@ -2739,7 +2739,15 @@ out_error:
pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
" with error %d\n", section_sep, section,
clp->cl_hostname, -status);
- ssleep(1);
+ switch (status) {
+ case -ENETDOWN:
+ case -ENETUNREACH:
+ nfs_mark_client_ready(clp, -EIO);
+ break;
+ default:
+ ssleep(1);
+ break;
+ }
out_drain:
memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index 389941ccc9c9..987c92d6364b 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -26,11 +26,13 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_done);
EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_done);
EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_read_pagelist);
EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_ds_connect);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bl_ext_tree_prepare_commit);
EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg);
EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_reg_err);
EXPORT_TRACEPOINT_SYMBOL_GPL(bl_pr_key_unreg);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 22c973316f0b..9776d220cec3 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -14,6 +14,8 @@
#include <trace/misc/fs.h>
#include <trace/misc/nfs.h>
+#include "delegation.h"
+
#define show_nfs_fattr_flags(valid) \
__print_flags((unsigned long)valid, "|", \
{ NFS_ATTR_FATTR_TYPE, "TYPE" }, \
@@ -30,7 +32,8 @@
{ NFS_ATTR_FATTR_CTIME, "CTIME" }, \
{ NFS_ATTR_FATTR_CHANGE, "CHANGE" }, \
{ NFS_ATTR_FATTR_OWNER_NAME, "OWNER_NAME" }, \
- { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" })
+ { NFS_ATTR_FATTR_GROUP_NAME, "GROUP_NAME" }, \
+ { NFS_ATTR_FATTR_BTIME, "BTIME" })
DECLARE_EVENT_CLASS(nfs4_clientid_event,
TP_PROTO(
@@ -273,6 +276,32 @@ TRACE_EVENT(nfs4_cb_offload,
show_nfs_stable_how(__entry->cb_how)
)
);
+
+TRACE_EVENT(pnfs_ds_connect,
+ TP_PROTO(
+ char *ds_remotestr,
+ int status
+ ),
+
+ TP_ARGS(ds_remotestr, status),
+
+ TP_STRUCT__entry(
+ __string(ds_ips, ds_remotestr)
+ __field(int, status)
+ ),
+
+ TP_fast_assign(
+ __assign_str(ds_ips);
+ __entry->status = status;
+ ),
+
+ TP_printk(
+ "ds_ips=%s, status=%d",
+ __get_str(ds_ips),
+ __entry->status
+ )
+);
+
#endif /* CONFIG_NFS_V4_1 */
TRACE_EVENT(nfs4_setup_sequence,
@@ -956,6 +985,52 @@ DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
TP_ARGS(inode, fmode))
DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_set_delegation);
DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_reclaim_delegation);
+DEFINE_NFS4_SET_DELEGATION_EVENT(nfs4_detach_delegation);
+
+#define show_delegation_flags(flags) \
+ __print_flags(flags, "|", \
+ { BIT(NFS_DELEGATION_NEED_RECLAIM), "NEED_RECLAIM" }, \
+ { BIT(NFS_DELEGATION_RETURN), "RETURN" }, \
+ { BIT(NFS_DELEGATION_RETURN_IF_CLOSED), "RETURN_IF_CLOSED" }, \
+ { BIT(NFS_DELEGATION_REFERENCED), "REFERENCED" }, \
+ { BIT(NFS_DELEGATION_RETURNING), "RETURNING" }, \
+ { BIT(NFS_DELEGATION_REVOKED), "REVOKED" }, \
+ { BIT(NFS_DELEGATION_TEST_EXPIRED), "TEST_EXPIRED" }, \
+ { BIT(NFS_DELEGATION_INODE_FREEING), "INODE_FREEING" }, \
+ { BIT(NFS_DELEGATION_RETURN_DELAYED), "RETURN_DELAYED" })
+
+DECLARE_EVENT_CLASS(nfs4_delegation_event,
+ TP_PROTO(
+ const struct nfs_delegation *delegation
+ ),
+
+ TP_ARGS(delegation),
+
+ TP_STRUCT__entry(
+ __field(u32, fhandle)
+ __field(unsigned int, fmode)
+ __field(unsigned long, flags)
+ ),
+
+ TP_fast_assign(
+ __entry->fhandle = nfs_fhandle_hash(NFS_FH(delegation->inode));
+ __entry->fmode = delegation->type;
+ __entry->flags = delegation->flags;
+ ),
+
+ TP_printk(
+ "fhandle=0x%08x fmode=%s flags=%s",
+ __entry->fhandle, show_fs_fmode_flags(__entry->fmode),
+ show_delegation_flags(__entry->flags)
+ )
+);
+#define DEFINE_NFS4_DELEGATION_EVENT(name) \
+ DEFINE_EVENT(nfs4_delegation_event, name, \
+ TP_PROTO( \
+ const struct nfs_delegation *delegation \
+ ), \
+ TP_ARGS(delegation))
+DEFINE_NFS4_DELEGATION_EVENT(nfs_delegation_need_return);
TRACE_EVENT(nfs4_delegreturn_exit,
TP_PROTO(
@@ -1449,6 +1524,63 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
+#define show_stateid_type(type) \
+ __print_symbolic(type, \
+ { NFS4_INVALID_STATEID_TYPE, "INVALID" }, \
+ { NFS4_SPECIAL_STATEID_TYPE, "SPECIAL" }, \
+ { NFS4_OPEN_STATEID_TYPE, "OPEN" }, \
+ { NFS4_LOCK_STATEID_TYPE, "LOCK" }, \
+ { NFS4_DELEGATION_STATEID_TYPE, "DELEGATION" }, \
+ { NFS4_LAYOUT_STATEID_TYPE, "LAYOUT" }, \
+ { NFS4_PNFS_DS_STATEID_TYPE, "PNFS_DS" }, \
+ { NFS4_REVOKED_STATEID_TYPE, "REVOKED" }, \
+ { NFS4_FREED_STATEID_TYPE, "FREED" })
+
+DECLARE_EVENT_CLASS(nfs4_match_stateid_event,
+ TP_PROTO(
+ const nfs4_stateid *s1,
+ const nfs4_stateid *s2
+ ),
+
+ TP_ARGS(s1, s2),
+
+ TP_STRUCT__entry(
+ __field(int, s1_seq)
+ __field(int, s2_seq)
+ __field(u32, s1_hash)
+ __field(u32, s2_hash)
+ __field(int, s1_type)
+ __field(int, s2_type)
+ ),
+
+ TP_fast_assign(
+ __entry->s1_seq = s1->seqid;
+ __entry->s1_hash = nfs_stateid_hash(s1);
+ __entry->s1_type = s1->type;
+ __entry->s2_seq = s2->seqid;
+ __entry->s2_hash = nfs_stateid_hash(s2);
+ __entry->s2_type = s2->type;
+ ),
+
+ TP_printk(
+ "s1=%s:%x:%u s2=%s:%x:%u",
+ show_stateid_type(__entry->s1_type),
+ __entry->s1_hash, __entry->s1_seq,
+ show_stateid_type(__entry->s2_type),
+ __entry->s2_hash, __entry->s2_seq
+ )
+);
+
+#define DEFINE_NFS4_MATCH_STATEID_EVENT(name) \
+ DEFINE_EVENT(nfs4_match_stateid_event, name, \
+ TP_PROTO( \
+ const nfs4_stateid *s1, \
+ const nfs4_stateid *s2 \
+ ), \
+ TP_ARGS(s1, s2))
+DEFINE_NFS4_MATCH_STATEID_EVENT(nfs41_match_stateid);
+DEFINE_NFS4_MATCH_STATEID_EVENT(nfs4_match_stateid);
+
DECLARE_EVENT_CLASS(nfs4_idmap_event,
TP_PROTO(
const char *name,
@@ -2051,13 +2183,15 @@ TRACE_EVENT(fl_getdevinfo,
DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_PROTO(
- const struct nfs_pgio_header *hdr
+ const struct nfs_pgio_header *hdr,
+ int error
),
- TP_ARGS(hdr),
+ TP_ARGS(hdr, error),
TP_STRUCT__entry(
__field(unsigned long, error)
+ __field(unsigned long, nfs_error)
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
@@ -2073,7 +2207,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_fast_assign(
const struct inode *inode = hdr->inode;
- __entry->error = hdr->res.op_status;
+ __entry->error = -error;
+ __entry->nfs_error = hdr->res.op_status;
__entry->fhandle = nfs_fhandle_hash(hdr->args.fh);
__entry->fileid = NFS_FILEID(inode);
__entry->dev = inode->i_sb->s_dev;
@@ -2088,7 +2223,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
+ "offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s "
+ "nfs_error=%lu (%s)",
-__entry->error,
show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -2096,28 +2232,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
__entry->fhandle,
__entry->offset, __entry->count,
__entry->stateid_seq, __entry->stateid_hash,
- __get_str(dstaddr)
+ __get_str(dstaddr), __entry->nfs_error,
+ show_nfs4_status(__entry->nfs_error)
)
);
#define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \
DEFINE_EVENT(nfs4_flexfiles_io_event, name, \
TP_PROTO( \
- const struct nfs_pgio_header *hdr \
+ const struct nfs_pgio_header *hdr, \
+ int error \
), \
- TP_ARGS(hdr))
+ TP_ARGS(hdr, error))
DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error);
DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error);
TRACE_EVENT(ff_layout_commit_error,
TP_PROTO(
- const struct nfs_commit_data *data
+ const struct nfs_commit_data *data,
+ int error
),
- TP_ARGS(data),
+ TP_ARGS(data, error),
TP_STRUCT__entry(
__field(unsigned long, error)
+ __field(unsigned long, nfs_error)
__field(dev_t, dev)
__field(u32, fhandle)
__field(u64, fileid)
@@ -2131,7 +2271,8 @@ TRACE_EVENT(ff_layout_commit_error,
TP_fast_assign(
const struct inode *inode = data->inode;
- __entry->error = data->res.op_status;
+ __entry->error = -error;
+ __entry->nfs_error = data->res.op_status;
__entry->fhandle = nfs_fhandle_hash(data->args.fh);
__entry->fileid = NFS_FILEID(inode);
__entry->dev = inode->i_sb->s_dev;
@@ -2142,14 +2283,49 @@ TRACE_EVENT(ff_layout_commit_error,
TP_printk(
"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%llu count=%u dstaddr=%s",
+ "offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)",
-__entry->error,
show_nfs4_status(__entry->error),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
__entry->offset, __entry->count,
- __get_str(dstaddr)
+ __get_str(dstaddr), __entry->nfs_error,
+ show_nfs4_status(__entry->nfs_error)
+ )
+);
+
+TRACE_EVENT(bl_ext_tree_prepare_commit,
+ TP_PROTO(
+ int ret,
+ size_t count,
+ u64 lwb,
+ bool not_all_ranges
+ ),
+
+ TP_ARGS(ret, count, lwb, not_all_ranges),
+
+ TP_STRUCT__entry(
+ __field(int, ret)
+ __field(size_t, count)
+ __field(u64, lwb)
+ __field(bool, not_all_ranges)
+ ),
+
+ TP_fast_assign(
+ __entry->ret = ret;
+ __entry->count = count;
+ __entry->lwb = lwb;
+ __entry->not_all_ranges = not_all_ranges;
+ ),
+
+ TP_printk(
+ "ret=%d, found %zu ranges, lwb=%llu%s",
+ __entry->ret,
+ __entry->count,
+ __entry->lwb,
+ __entry->not_all_ranges ? ", not all ranges encoded" :
+ ""
)
);
@@ -2608,7 +2784,7 @@ TRACE_EVENT(nfs4_copy_notify,
)
);
-TRACE_EVENT(nfs4_offload_cancel,
+DECLARE_EVENT_CLASS(nfs4_offload_class,
TP_PROTO(
const struct nfs42_offload_status_args *args,
int error
@@ -2640,6 +2816,15 @@ TRACE_EVENT(nfs4_offload_cancel,
__entry->stateid_seq, __entry->stateid_hash
)
);
+#define DEFINE_NFS4_OFFLOAD_EVENT(name) \
+ DEFINE_EVENT(nfs4_offload_class, name, \
+ TP_PROTO( \
+ const struct nfs42_offload_status_args *args, \
+ int error \
+ ), \
+ TP_ARGS(args, error))
+DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_cancel);
+DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status);
DECLARE_EVENT_CLASS(nfs4_xattr_event,
TP_PROTO(
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e8ac3f615f93..49ff98571fa5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -82,9 +82,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
* we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
*/
#define pagepad_maxsz (1)
-#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2)
-#define lock_owner_id_maxsz (1 + 1 + 4)
-#define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define open_owner_id_maxsz (2 + 1 + 2 + 2)
+#define lock_owner_id_maxsz (2 + 1 + 2)
#define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
#define compound_decode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2))
#define op_encode_hdr_maxsz (1)
@@ -185,7 +184,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define encode_claim_null_maxsz (1 + nfs4_name_maxsz)
#define encode_open_maxsz (op_encode_hdr_maxsz + \
2 + encode_share_access_maxsz + 2 + \
- open_owner_id_maxsz + \
+ 1 + open_owner_id_maxsz + \
encode_opentype_maxsz + \
encode_claim_null_maxsz)
#define decode_space_limit_maxsz (3)
@@ -255,13 +254,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
#define encode_link_maxsz (op_encode_hdr_maxsz + \
nfs4_name_maxsz)
#define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz)
-#define encode_lockowner_maxsz (7)
+#define encode_lockowner_maxsz (2 + 1 + lock_owner_id_maxsz)
+
#define encode_lock_maxsz (op_encode_hdr_maxsz + \
7 + \
1 + encode_stateid_maxsz + 1 + \
encode_lockowner_maxsz)
#define decode_lock_denied_maxsz \
- (8 + decode_lockowner_maxsz)
+ (2 + 2 + 1 + 2 + 1 + lock_owner_id_maxsz)
#define decode_lock_maxsz (op_decode_hdr_maxsz + \
decode_lock_denied_maxsz)
#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \
@@ -617,7 +617,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
encode_lockowner_maxsz)
#define NFS4_dec_release_lockowner_sz \
(compound_decode_hdr_maxsz + \
- decode_lockowner_maxsz)
+ decode_release_lockowner_maxsz)
#define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -1412,7 +1412,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
__be32 *p;
/*
* opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
- * owner 4 = 32
+ * owner 28
*/
encode_nfs4_seqid(xdr, arg->seqid);
encode_share_access(xdr, arg->share_access);
@@ -1623,6 +1623,7 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
| FATTR4_WORD1_RAWDEV
| FATTR4_WORD1_SPACE_USED
| FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_CREATE
| FATTR4_WORD1_TIME_METADATA
| FATTR4_WORD1_TIME_MODIFY;
attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
@@ -4207,6 +4208,24 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
return status;
}
+static int decode_attr_time_create(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
+{
+ int status = 0;
+
+ time->tv_sec = 0;
+ time->tv_nsec = 0;
+ if (unlikely(bitmap[1] & (FATTR4_WORD1_TIME_CREATE - 1U)))
+ return -EIO;
+ if (likely(bitmap[1] & FATTR4_WORD1_TIME_CREATE)) {
+ status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_BTIME;
+ bitmap[1] &= ~FATTR4_WORD1_TIME_CREATE;
+ }
+ dprintk("%s: btime=%lld\n", __func__, time->tv_sec);
+ return status;
+}
+
static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec64 *time)
{
int status = 0;
@@ -4781,6 +4800,11 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = decode_attr_time_create(xdr, bitmap, &fattr->btime);
+ if (status < 0)
+ goto xdr_error;
+ fattr->valid |= status;
+
status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
if (status < 0)
goto xdr_error;
@@ -5077,7 +5101,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
/*
* We create the owner, so we know a proper owner.id length is 4.
*/
-static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
+static int decode_lock_denied(struct xdr_stream *xdr, struct file_lock *fl)
{
uint64_t offset, length, clientid;
__be32 *p;
@@ -7702,6 +7726,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(CLONE, enc_clone, dec_clone),
PROC42(COPY, enc_copy, dec_copy),
PROC42(OFFLOAD_CANCEL, enc_offload_cancel, dec_offload_cancel),
+ PROC42(OFFLOAD_STATUS, enc_offload_status, dec_offload_status),
PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify),
PROC(LOOKUPP, enc_lookupp, dec_lookupp),
PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror),
@@ -7710,6 +7735,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs),
PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr),
PROC42(READ_PLUS, enc_read_plus, dec_read_plus),
+ PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range),
};
static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)];
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 7a058bd8c566..96b1323318c2 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -32,7 +32,8 @@
{ NFS_INO_INVALID_BLOCKS, "INVALID_BLOCKS" }, \
{ NFS_INO_INVALID_XATTR, "INVALID_XATTR" }, \
{ NFS_INO_INVALID_NLINK, "INVALID_NLINK" }, \
- { NFS_INO_INVALID_MODE, "INVALID_MODE" })
+ { NFS_INO_INVALID_MODE, "INVALID_MODE" }, \
+ { NFS_INO_INVALID_BTIME, "INVALID_BTIME" })
#define nfs_show_nfsi_flags(v) \
__print_flags(v, "|", \
@@ -56,6 +57,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event,
__field(u32, fhandle)
__field(u64, fileid)
__field(u64, version)
+ __field(unsigned long, cache_validity)
),
TP_fast_assign(
@@ -64,14 +66,17 @@ DECLARE_EVENT_CLASS(nfs_inode_event,
__entry->fileid = nfsi->fileid;
__entry->fhandle = nfs_fhandle_hash(&nfsi->fh);
__entry->version = inode_peek_iversion_raw(inode);
+ __entry->cache_validity = nfsi->cache_validity;
),
TP_printk(
- "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu ",
+ "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu cache_validity=0x%lx (%s)",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle,
- (unsigned long long)__entry->version
+ (unsigned long long)__entry->version,
+ __entry->cache_validity,
+ nfs_show_cache_validity(__entry->cache_validity)
)
);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 11968dcb7243..6e69ce43a13f 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -253,13 +253,14 @@ nfs_page_group_unlock(struct nfs_page *req)
nfs_page_clear_headlock(req);
}
-/*
- * nfs_page_group_sync_on_bit_locked
+/**
+ * nfs_page_group_sync_on_bit_locked - Test if all requests have @bit set
+ * @req: request in page group
+ * @bit: PG_* bit that is used to sync page group
*
* must be called with page group lock held
*/
-static bool
-nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+bool nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
{
struct nfs_page *head = req->wb_head;
struct nfs_page *tmp;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5f582713bf05..a3135b5af7ee 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -306,7 +306,6 @@ void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
struct inode *inode;
- unsigned long i_state;
if (!lo)
return;
@@ -317,12 +316,11 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
if (!list_empty(&lo->plh_segs))
WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
pnfs_detach_layout_hdr(lo);
- i_state = inode->i_state;
+ /* Notify pnfs_destroy_layout_final() that we're done */
+ if (inode->i_state & (I_FREEING | I_CLEAR))
+ wake_up_var_locked(lo, &inode->i_lock);
spin_unlock(&inode->i_lock);
pnfs_free_layout_hdr(lo);
- /* Notify pnfs_destroy_layout_final() that we're done */
- if (i_state & (I_FREEING | I_CLEAR))
- wake_up_var(lo);
}
}
@@ -745,6 +743,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
return remaining;
}
+static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_return_segs, pls_list)
+ pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
+}
+
static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
struct list_head *free_me,
@@ -801,23 +807,17 @@ void pnfs_destroy_layout(struct nfs_inode *nfsi)
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
-static bool pnfs_layout_removed(struct nfs_inode *nfsi,
- struct pnfs_layout_hdr *lo)
-{
- bool ret;
-
- spin_lock(&nfsi->vfs_inode.i_lock);
- ret = nfsi->layout != lo;
- spin_unlock(&nfsi->vfs_inode.i_lock);
- return ret;
-}
-
void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
{
struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);
+ struct inode *inode = &nfsi->vfs_inode;
- if (lo)
- wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
+ if (lo) {
+ spin_lock(&inode->i_lock);
+ wait_var_event_spinlock(lo, nfsi->layout != lo,
+ &inode->i_lock);
+ spin_unlock(&inode->i_lock);
+ }
}
static bool
@@ -1246,21 +1246,15 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
static void
pnfs_layoutreturn_retry_later_locked(struct pnfs_layout_hdr *lo,
const nfs4_stateid *arg_stateid,
- const struct pnfs_layout_range *range)
+ const struct pnfs_layout_range *range,
+ struct list_head *freeme)
{
- const struct pnfs_layout_segment *lseg;
- u32 seq = be32_to_cpu(arg_stateid->seqid);
-
if (pnfs_layout_is_valid(lo) &&
- nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) {
- list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) {
- if (pnfs_seqid_is_newer(lseg->pls_seq, seq) ||
- !pnfs_should_free_range(&lseg->pls_range, range))
- continue;
- pnfs_set_plh_return_info(lo, range->iomode, seq);
- break;
- }
- }
+ nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
+ pnfs_reset_return_info(lo);
+ else
+ pnfs_mark_layout_stateid_invalid(lo, freeme);
+ pnfs_clear_layoutreturn_waitbit(lo);
}
void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,
@@ -1268,11 +1262,12 @@ void pnfs_layoutreturn_retry_later(struct pnfs_layout_hdr *lo,
const struct pnfs_layout_range *range)
{
struct inode *inode = lo->plh_inode;
+ LIST_HEAD(freeme);
spin_lock(&inode->i_lock);
- pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range);
- pnfs_clear_layoutreturn_waitbit(lo);
+ pnfs_layoutreturn_retry_later_locked(lo, arg_stateid, range, &freeme);
spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
}
void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
@@ -1292,6 +1287,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
pnfs_free_returned_lsegs(lo, &freeme, range, seq);
pnfs_set_layout_stateid(lo, stateid, NULL, true);
+ pnfs_reset_return_info(lo);
} else
pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
@@ -1661,6 +1657,18 @@ int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
/* Was there an RPC level error? If not, retry */
if (task->tk_rpc_status == 0)
break;
+ /*
+ * Is there a fatal network level error?
+ * If so release the layout, but flag the error.
+ */
+ if ((task->tk_rpc_status == -ENETDOWN ||
+ task->tk_rpc_status == -ENETUNREACH) &&
+ task->tk_flags & RPC_TASK_NETUNREACH_FATAL) {
+ *ret = 0;
+ (*respp)->lrs_present = 0;
+ retval = -EIO;
+ break;
+ }
/* If the call was not sent, let caller handle it */
if (!RPC_WAS_SENT(task))
return 0;
@@ -1695,6 +1703,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
struct inode *inode = args->inode;
const nfs4_stateid *res_stateid = NULL;
struct nfs4_xdr_opaque_data *ld_private = args->ld_private;
+ LIST_HEAD(freeme);
switch (ret) {
case -NFS4ERR_BADSESSION:
@@ -1703,9 +1712,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
case -NFS4ERR_NOMATCHING_LAYOUT:
spin_lock(&inode->i_lock);
pnfs_layoutreturn_retry_later_locked(lo, &args->stateid,
- &args->range);
- pnfs_clear_layoutreturn_waitbit(lo);
+ &args->range, &freeme);
spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&freeme);
break;
case 0:
if (res->lrs_present)
@@ -2042,8 +2051,10 @@ static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
if (atomic_dec_and_test(&lo->plh_outstanding) &&
- test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags)) {
+ smp_mb__after_atomic();
wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
+ }
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
@@ -3321,6 +3332,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
struct nfs_inode *nfsi = NFS_I(inode);
loff_t end_pos;
int status;
+ bool mark_as_dirty = false;
if (!pnfs_layoutcommit_outstanding(inode))
return 0;
@@ -3372,19 +3384,23 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
if (ld->prepare_layoutcommit) {
status = ld->prepare_layoutcommit(&data->args);
if (status) {
- put_cred(data->cred);
+ if (status != -ENOSPC)
+ put_cred(data->cred);
spin_lock(&inode->i_lock);
set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
if (end_pos > nfsi->layout->plh_lwb)
nfsi->layout->plh_lwb = end_pos;
- goto out_unlock;
+ if (status != -ENOSPC)
+ goto out_unlock;
+ spin_unlock(&inode->i_lock);
+ mark_as_dirty = true;
}
}
status = nfs4_proc_layoutcommit(data, sync);
out:
- if (status)
+ if (status || mark_as_dirty)
mark_inode_dirty_sync(inode);
dprintk("<-- %s status %d\n", __func__, status);
return status;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 30d2613e912b..91ff877185c8 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -60,6 +60,7 @@ struct nfs4_pnfs_ds {
struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
char *ds_remotestr; /* comma sep list of addrs */
struct list_head ds_addrs;
+ const struct net *ds_net;
struct nfs_client *ds_clp;
refcount_t ds_count;
unsigned long ds_state;
@@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode,
int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
-struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net,
+ struct list_head *dsaddrs,
gfp_t gfp_flags);
void nfs4_pnfs_v3_ds_connect_unload(void);
int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index dbef837e871a..7b32afb29782 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -16,6 +16,8 @@
#include "nfs4session.h"
#include "internal.h"
#include "pnfs.h"
+#include "netns.h"
+#include "nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -504,14 +506,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
/*
* Data server cache
*
- * Data servers can be mapped to different device ids.
- * nfs4_pnfs_ds reference counting
+ * Data servers can be mapped to different device ids, but should
+ * never be shared between net namespaces.
+ *
+ * nfs4_pnfs_ds reference counting:
* - set to 1 on allocation
* - incremented when a device id maps a data server already in the cache.
* - decremented when deviceid is removed from the cache.
*/
-static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
-static LIST_HEAD(nfs4_data_server_cache);
/* Debug routines */
static void
@@ -604,11 +606,11 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1,
* Lookup DS by addresses. nfs4_ds_cache_lock is held
*/
static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(const struct list_head *dsaddrs)
+_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs)
{
struct nfs4_pnfs_ds *ds;
- list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+ list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node)
if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
return ds;
return NULL;
@@ -653,10 +655,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds)
void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
{
- if (refcount_dec_and_lock(&ds->ds_count,
- &nfs4_ds_cache_lock)) {
+ struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id);
+
+ if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) {
list_del_init(&ds->ds_node);
- spin_unlock(&nfs4_ds_cache_lock);
+ spin_unlock(&nn->nfs4_data_server_lock);
destroy_ds(ds);
}
}
@@ -716,8 +719,9 @@ out_err:
* uncached and return cached struct nfs4_pnfs_ds.
*/
struct nfs4_pnfs_ds *
-nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags)
{
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
char *remotestr;
@@ -733,16 +737,17 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
/* this is only used for debugging, so it's ok if its NULL */
remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
- spin_lock(&nfs4_ds_cache_lock);
- tmp_ds = _data_server_lookup_locked(dsaddrs);
+ spin_lock(&nn->nfs4_data_server_lock);
+ tmp_ds = _data_server_lookup_locked(nn, dsaddrs);
if (tmp_ds == NULL) {
INIT_LIST_HEAD(&ds->ds_addrs);
list_splice_init(dsaddrs, &ds->ds_addrs);
ds->ds_remotestr = remotestr;
refcount_set(&ds->ds_count, 1);
INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_net = net;
ds->ds_clp = NULL;
- list_add(&ds->ds_node, &nfs4_data_server_cache);
+ list_add(&ds->ds_node, &nn->nfs4_data_server_cache);
dprintk("%s add new data server %s\n", __func__,
ds->ds_remotestr);
} else {
@@ -754,7 +759,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
refcount_read(&tmp_ds->ds_count));
ds = tmp_ds;
}
- spin_unlock(&nfs4_ds_cache_lock);
+ spin_unlock(&nn->nfs4_data_server_lock);
out:
return ds;
}
@@ -826,10 +831,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
.servername = clp->cl_hostname,
.connect_timeout = connect_timeout,
.reconnect_timeout = connect_timeout,
+ .xprtsec = clp->cl_xprtsec,
};
- if (da->da_transport != clp->cl_proto)
+ if (da->da_transport != clp->cl_proto &&
+ clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
continue;
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
+ xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
+
if (da->da_addr.ss_family != clp->cl_addr.ss_family)
continue;
/* Add this address as an alias */
@@ -837,6 +848,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_test_and_add_xprt, NULL);
continue;
}
+ if (da->da_transport == XPRT_TRANSPORT_TCP &&
+ mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS)
+ da->da_transport = XPRT_TRANSPORT_TCP_TLS;
clp = get_v3_ds_connect(mds_srv,
&da->da_addr,
da->da_addrlen, da->da_transport,
@@ -994,8 +1008,10 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
err = nfs4_wait_ds_connect(ds);
if (err || ds->ds_clp)
goto out;
- if (nfs4_test_deviceid_unavailable(devid))
- return -ENODEV;
+ if (nfs4_test_deviceid_unavailable(devid)) {
+ err = -ENODEV;
+ goto out;
+ }
} while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0);
if (ds->ds_clp)
@@ -1025,11 +1041,12 @@ out:
if (!ds->ds_clp || !nfs_client_init_is_complete(ds->ds_clp)) {
WARN_ON_ONCE(ds->ds_clp ||
!nfs4_test_deviceid_unavailable(devid));
- return -EINVAL;
- }
- err = nfs_client_init_status(ds->ds_clp);
+ err = -EINVAL;
+ } else
+ err = nfs_client_init_status(ds->ds_clp);
}
+ trace_pnfs_ds_connect(ds->ds_remotestr, err);
return err;
}
EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77920a2e3cef..63e71310b9f6 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -446,13 +446,14 @@ out:
return status;
}
-static int
+static struct dentry *
nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
struct nfs_createdata *data;
struct rpc_message msg = {
.rpc_proc = &nfs_procedures[NFSPROC_MKDIR],
};
+ struct dentry *alias = NULL;
int status = -ENOMEM;
dprintk("NFS call mkdir %pd\n", dentry);
@@ -464,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0);
nfs_mark_for_revalidate(dir);
- if (status == 0)
- status = nfs_instantiate(dentry, data->res.fh, data->res.fattr);
+ if (status == 0) {
+ alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+ status = PTR_ERR_OR_ZERO(alias);
+ } else
+ alias = ERR_PTR(status);
nfs_free_createdata(data);
out:
dprintk("NFS reply mkdir: %d\n", status);
- return status;
+ return alias;
}
static int
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 81bd1b9aba17..3c1fa320b3f1 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio)
{
folio_zero_segment(folio, 0, folio_size(folio));
folio_mark_uptodate(folio);
- folio_unlock(folio);
+ if (nfs_netfs_folio_unlock(folio))
+ folio_unlock(folio);
return 0;
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index aeb715b4a690..72dee6f3050e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -454,8 +454,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
{ NFS_MOUNT_NONLM, ",nolock", "" },
{ NFS_MOUNT_NOACL, ",noacl", "" },
{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
+ { NFS_MOUNT_FORCE_RDIRPLUS, ",rdirplus=force", "" },
{ NFS_MOUNT_UNSHARED, ",nosharecache", "" },
{ NFS_MOUNT_NORESVPORT, ",noresvport", "" },
+ { NFS_MOUNT_NETUNREACH_FATAL,
+ ",fatal_neterrors=ENETDOWN:ENETUNREACH",
+ ",fatal_neterrors=none" },
{ 0, NULL, NULL }
};
const struct proc_nfs_info *nfs_infop;
@@ -1048,6 +1052,16 @@ int nfs_reconfigure(struct fs_context *fc)
sync_filesystem(sb);
/*
+ * The SB_RDONLY flag has been removed from the superblock during
+ * mounts to prevent interference between different filesystems.
+ * Similarly, it is also necessary to ignore the SB_RDONLY flag
+ * during reconfiguration; otherwise, it may also result in the
+ * creation of redundant superblocks when mounting a directory with
+ * different rw and ro flags multiple times.
+ */
+ fc->sb_flags_mask &= ~SB_RDONLY;
+
+ /*
* Userspace mount programs that send binary options generally send
* them populated with default values. We have no way to know which
* ones were explicitly specified. Fall back to legacy behavior and
@@ -1169,7 +1183,7 @@ static int nfs_set_super(struct super_block *s, struct fs_context *fc)
struct nfs_server *server = fc->s_fs_info;
int ret;
- s->s_d_op = server->nfs_client->rpc_ops->dentry_ops;
+ set_default_d_op(s, server->nfs_client->rpc_ops->dentry_ops);
ret = set_anon_super(s, server);
if (ret == 0)
server->s_dev = s->s_dev;
@@ -1304,8 +1318,17 @@ int nfs_get_tree_common(struct fs_context *fc)
if (IS_ERR(server))
return PTR_ERR(server);
+ /*
+ * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a
+ * superblock among each filesystem that mounts sub-directories
+ * belonging to a single exported root path.
+ * To prevent interference between different filesystems, the
+ * SB_RDONLY flag should be removed from the superblock.
+ */
if (server->flags & NFS_MOUNT_UNSHARED)
compare_super = NULL;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
/* -o noac implies -o sync */
if (server->flags & NFS_MOUNT_NOAC)
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 1c62a5a9f51d..58146e935402 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -40,31 +40,31 @@ static const char *nfs_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page;
+ struct folio *folio;
void *err;
if (!dentry) {
err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
if (err)
return err;
- page = find_get_page(inode->i_mapping, 0);
- if (!page)
+ folio = filemap_get_folio(inode->i_mapping, 0);
+ if (IS_ERR(folio))
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
- put_page(page);
+ if (!folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
if (err)
return err;
- page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler,
+ folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler,
NULL);
- if (IS_ERR(page))
- return ERR_CAST(page);
+ if (IS_ERR(folio))
+ return ERR_CAST(folio);
}
- set_delayed_call(done, page_put_link, page);
- return page_address(page);
+ set_delayed_call(done, page_put_link, folio);
+ return folio_address(folio);
}
/*
diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c
index 7b59a40d40c0..545148d42dcc 100644
--- a/fs/nfs/sysfs.c
+++ b/fs/nfs/sysfs.c
@@ -14,6 +14,7 @@
#include <linux/rcupdate.h>
#include <linux/lockd/lockd.h>
+#include "internal.h"
#include "nfs4_fs.h"
#include "netns.h"
#include "sysfs.h"
@@ -228,6 +229,25 @@ static void shutdown_client(struct rpc_clnt *clnt)
rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL);
}
+/*
+ * Shut down the nfs_client only once all the superblocks
+ * have been shut down.
+ */
+static void shutdown_nfs_client(struct nfs_client *clp)
+{
+ struct nfs_server *server;
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ if (!(server->flags & NFS_MOUNT_SHUTDOWN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+ nfs_mark_client_ready(clp, -EIO);
+ shutdown_client(clp->cl_rpcclient);
+}
+
static ssize_t
shutdown_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
@@ -259,7 +279,6 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
server->flags |= NFS_MOUNT_SHUTDOWN;
shutdown_client(server->client);
- shutdown_client(server->nfs_client->cl_rpcclient);
if (!IS_ERR(server->client_acl))
shutdown_client(server->client_acl);
@@ -267,11 +286,44 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr,
if (server->nlm_host)
shutdown_client(server->nlm_host->h_rpcclnt);
out:
+ shutdown_nfs_client(server->nfs_client);
return count;
}
static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown);
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+static ssize_t
+implid_domain_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid;
+
+ if (!impl_id || strlen(impl_id->domain) == 0)
+ return 0; //sysfs_emit(buf, "");
+ return sysfs_emit(buf, "%s\n", impl_id->domain);
+}
+
+static struct kobj_attribute nfs_sysfs_attr_implid_domain = __ATTR_RO(implid_domain);
+
+
+static ssize_t
+implid_name_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid;
+
+ if (!impl_id || strlen(impl_id->name) == 0)
+ return 0; //sysfs_emit(buf, "");
+ return sysfs_emit(buf, "%s\n", impl_id->name);
+}
+
+static struct kobj_attribute nfs_sysfs_attr_implid_name = __ATTR_RO(implid_name);
+
+#endif /* IS_ENABLED(CONFIG_NFS_V4_1) */
+
#define RPC_CLIENT_NAME_SIZE 64
void nfs_sysfs_link_rpc_client(struct nfs_server *server,
@@ -309,6 +361,59 @@ static struct kobj_type nfs_sb_ktype = {
.child_ns_type = nfs_netns_object_child_ns_type,
};
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+static void nfs_sysfs_add_nfsv41_server(struct nfs_server *server)
+{
+ int ret;
+
+ if (!server->nfs_client->cl_implid)
+ return;
+
+ ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_domain.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+
+ ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_name.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+}
+#else /* CONFIG_NFS_V4_1 */
+static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server)
+{
+}
+#endif /* CONFIG_NFS_V4_1 */
+
+#if IS_ENABLED(CONFIG_NFS_LOCALIO)
+
+static ssize_t
+localio_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ struct nfs_server *server = container_of(kobj, struct nfs_server, kobj);
+ bool localio = nfs_server_is_local(server->nfs_client);
+ return sysfs_emit(buf, "%d\n", localio);
+}
+
+static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio);
+
+static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server)
+{
+ int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr,
+ nfs_netns_server_namespace(&server->kobj));
+ if (ret < 0)
+ pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
+ server->s_sysfs_id, ret);
+}
+#else
+static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server)
+{
+}
+#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */
+
void nfs_sysfs_add_server(struct nfs_server *server)
{
int ret;
@@ -325,6 +430,9 @@ void nfs_sysfs_add_server(struct nfs_server *server)
if (ret < 0)
pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n",
server->s_sysfs_id, ret);
+
+ nfs_sysfs_add_nfsv41_server(server);
+ nfs_sysfs_add_nfs_localio_server(server);
}
EXPORT_SYMBOL_GPL(nfs_sysfs_add_server);
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index bf77399696a7..b55467911648 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -464,18 +464,17 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
sdentry = NULL;
do {
- int slen;
dput(sdentry);
sillycounter++;
- slen = scnprintf(silly, sizeof(silly),
- SILLYNAME_PREFIX "%0*llx%0*x",
- SILLYNAME_FILEID_LEN, fileid,
- SILLYNAME_COUNTER_LEN, sillycounter);
+ scnprintf(silly, sizeof(silly),
+ SILLYNAME_PREFIX "%0*llx%0*x",
+ SILLYNAME_FILEID_LEN, fileid,
+ SILLYNAME_COUNTER_LEN, sillycounter);
dfprintk(VFS, "NFS: trying to rename %pd to %s\n",
dentry, silly);
- sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+ sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
/*
* N.B. Better to return EBUSY here ... it could be
* dangerous to delete the file while it's in use.
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index aa3d8bea3ec0..8b7c04737967 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -153,20 +153,10 @@ nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
}
}
-static int
-nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+static void nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
{
- int ret;
-
- if (!test_bit(PG_REMOVE, &req->wb_flags))
- return 0;
- ret = nfs_page_group_lock(req);
- if (ret)
- return ret;
if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
nfs_page_set_inode_ref(req, inode);
- nfs_page_group_unlock(req);
- return 0;
}
/**
@@ -579,23 +569,24 @@ retry:
while (!nfs_lock_request(head)) {
ret = nfs_wait_on_request(head);
- if (ret < 0)
+ if (ret < 0) {
+ nfs_release_request(head);
return ERR_PTR(ret);
+ }
}
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ goto out_unlock;
+
/* Ensure that nobody removed the request before we locked it */
if (head != folio->private) {
+ nfs_page_group_unlock(head);
nfs_unlock_and_release_request(head);
goto retry;
}
- ret = nfs_cancel_remove_inode(head, inode);
- if (ret < 0)
- goto out_unlock;
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto out_unlock;
+ nfs_cancel_remove_inode(head, inode);
/* lock each request in the page group */
for (subreq = head->wb_this_page;
@@ -630,19 +621,19 @@ static void nfs_write_error(struct nfs_page *req, int error)
* Find an associated nfs write request, and prepare to flush it out
* May return an error if the user signalled nfs_wait_on_request().
*/
-static int nfs_page_async_flush(struct folio *folio,
- struct writeback_control *wbc,
- struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
+ struct nfs_pageio_descriptor *pgio)
{
struct nfs_page *req;
- int ret = 0;
+ int ret;
+
+ nfs_pageio_cond_complete(pgio, folio->index);
req = nfs_lock_and_join_requests(folio);
if (!req)
- goto out;
- ret = PTR_ERR(req);
+ return 0;
if (IS_ERR(req))
- goto out;
+ return PTR_ERR(req);
nfs_folio_set_writeback(folio);
WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags));
@@ -652,7 +643,6 @@ static int nfs_page_async_flush(struct folio *folio,
if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
- ret = 0;
if (!nfs_pageio_add_request(pgio, req)) {
ret = pgio->pg_error;
/*
@@ -660,28 +650,20 @@ static int nfs_page_async_flush(struct folio *folio,
*/
if (nfs_error_is_fatal_on_server(ret))
goto out_launder;
- if (wbc->sync_mode == WB_SYNC_NONE)
- ret = AOP_WRITEPAGE_ACTIVATE;
folio_redirty_for_writepage(wbc, folio);
nfs_redirty_request(req);
pgio->pg_error = 0;
- } else
- nfs_add_stats(folio->mapping->host,
- NFSIOS_WRITEPAGES, 1);
-out:
- return ret;
+ return ret;
+ }
+
+ nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1);
+ return 0;
+
out_launder:
nfs_write_error(req, ret);
return 0;
}
-static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc,
- struct nfs_pageio_descriptor *pgio)
-{
- nfs_pageio_cond_complete(pgio, folio->index);
- return nfs_page_async_flush(folio, wbc, pgio);
-}
-
/*
* Write an mmapped page to the server.
*/
@@ -701,17 +683,6 @@ static int nfs_writepage_locked(struct folio *folio,
return err;
}
-static int nfs_writepages_callback(struct folio *folio,
- struct writeback_control *wbc, void *data)
-{
- int ret;
-
- ret = nfs_do_writepage(folio, wbc, data);
- if (ret != AOP_WRITEPAGE_ACTIVATE)
- folio_unlock(folio);
- return ret;
-}
-
static void nfs_io_completion_commit(void *inode)
{
nfs_commit_inode(inode, 0);
@@ -738,7 +709,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate ||
- wbc->for_background || wbc->for_sync || wbc->for_reclaim) {
+ wbc->for_background || wbc->for_sync) {
ioc = nfs_io_completion_alloc(GFP_KERNEL);
if (ioc)
nfs_io_completion_init(ioc, nfs_io_completion_commit,
@@ -747,11 +718,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
}
do {
+ struct folio *folio = NULL;
+
nfs_pageio_init_write(&pgio, inode, priority, false,
&nfs_async_write_completion_ops);
pgio.pg_io_completion = ioc;
- err = write_cache_pages(mapping, wbc, nfs_writepages_callback,
- &pgio);
+ while ((folio = writeback_iter(mapping, wbc, folio, &err))) {
+ err = nfs_do_writepage(folio, wbc, &pgio);
+ folio_unlock(folio);
+ }
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR)
@@ -800,7 +775,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req));
- if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ nfs_page_group_lock(req);
+ if (nfs_page_group_sync_on_bit_locked(req, PG_REMOVE)) {
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio->mapping;
@@ -812,6 +788,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
}
spin_unlock(&mapping->i_private_lock);
}
+ nfs_page_group_unlock(req);
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
atomic_long_dec(&nfsi->nrequests);
@@ -2127,8 +2104,12 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst,
* that we can safely release the inode reference while holding
* the folio lock.
*/
- if (folio_test_private(src))
- return -EBUSY;
+ if (folio_test_private(src)) {
+ if (mode == MIGRATE_SYNC)
+ nfs_wb_folio(src->mapping->host, src);
+ if (folio_test_private(src))
+ return -EBUSY;
+ }
if (folio_test_private_2(src)) { /* [DEPRECATED] */
if (mode == MIGRATE_ASYNC)
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index ea382b75b26c..e2eaac14fd8e 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -42,7 +42,7 @@ struct nfsacl_encode_desc {
};
struct nfsacl_simple_acl {
- struct posix_acl acl;
+ struct posix_acl_hdr acl;
struct posix_acl_entry ace[4];
};
@@ -112,7 +112,8 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
xdr_encode_word(buf, base, entries))
return -EINVAL;
if (encode_entries && acl && acl->a_count == 3) {
- struct posix_acl *acl2 = &aclbuf.acl;
+ struct posix_acl *acl2 =
+ container_of(&aclbuf.acl, struct posix_acl, hdr);
/* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
* invoked in contexts where a memory allocation failure is
@@ -177,7 +178,8 @@ bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode,
return false;
if (encode_entries && acl && acl->a_count == 3) {
- struct posix_acl *acl2 = &aclbuf.acl;
+ struct posix_acl *acl2 =
+ container_of(&aclbuf.acl, struct posix_acl, hdr);
/* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
* invoked in contexts where a memory allocation failure is
diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
index 6a0bdea6d644..dd715cdb6c04 100644
--- a/fs/nfs_common/nfslocalio.c
+++ b/fs/nfs_common/nfslocalio.c
@@ -151,8 +151,7 @@ EXPORT_SYMBOL_GPL(nfs_localio_enable_client);
*/
static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid)
{
- LIST_HEAD(local_files);
- struct nfs_file_localio *nfl, *tmp;
+ struct nfs_file_localio *nfl;
spin_lock(&nfs_uuid->lock);
if (unlikely(!rcu_access_pointer(nfs_uuid->net))) {
@@ -166,17 +165,41 @@ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid)
nfs_uuid->dom = NULL;
}
- list_splice_init(&nfs_uuid->files, &local_files);
- spin_unlock(&nfs_uuid->lock);
-
/* Walk list of files and ensure their last references dropped */
- list_for_each_entry_safe(nfl, tmp, &local_files, list) {
- nfs_close_local_fh(nfl);
+
+ while ((nfl = list_first_entry_or_null(&nfs_uuid->files,
+ struct nfs_file_localio,
+ list)) != NULL) {
+ /* If nfs_uuid is already NULL, nfs_close_local_fh is
+ * closing and we must wait, else we unlink and close.
+ */
+ if (rcu_access_pointer(nfl->nfs_uuid) == NULL) {
+ /* nfs_close_local_fh() is doing the
+ * close and we must wait. until it unlinks
+ */
+ wait_var_event_spinlock(nfs_uuid,
+ list_first_entry_or_null(
+ &nfs_uuid->files,
+ struct nfs_file_localio,
+ list) != nfl,
+ &nfs_uuid->lock);
+ continue;
+ }
+
+ /* Remove nfl from nfs_uuid->files list */
+ list_del_init(&nfl->list);
+ spin_unlock(&nfs_uuid->lock);
+
+ nfs_to_nfsd_file_put_local(&nfl->ro_file);
+ nfs_to_nfsd_file_put_local(&nfl->rw_file);
cond_resched();
- }
- spin_lock(&nfs_uuid->lock);
- BUG_ON(!list_empty(&nfs_uuid->files));
+ spin_lock(&nfs_uuid->lock);
+ /* Now we can allow racing nfs_close_local_fh() to
+ * skip the locking.
+ */
+ store_release_wake_up(&nfl->nfs_uuid, RCU_INITIALIZER(NULL));
+ }
/* Remove client from nn->local_clients */
if (nfs_uuid->list_lock) {
@@ -219,15 +242,20 @@ void nfs_localio_invalidate_clients(struct list_head *nn_local_clients,
}
EXPORT_SYMBOL_GPL(nfs_localio_invalidate_clients);
-static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl)
+static int nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl)
{
+ int ret = 0;
+
/* Add nfl to nfs_uuid->files if it isn't already */
spin_lock(&nfs_uuid->lock);
- if (list_empty(&nfl->list)) {
+ if (rcu_access_pointer(nfs_uuid->net) == NULL) {
+ ret = -ENXIO;
+ } else if (list_empty(&nfl->list)) {
rcu_assign_pointer(nfl->nfs_uuid, nfs_uuid);
list_add_tail(&nfl->list, &nfs_uuid->files);
}
spin_unlock(&nfs_uuid->lock);
+ return ret;
}
/*
@@ -237,6 +265,7 @@ static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl
struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
struct rpc_clnt *rpc_clnt, const struct cred *cred,
const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl,
+ struct nfsd_file __rcu **pnf,
const fmode_t fmode)
{
struct net *net;
@@ -260,12 +289,13 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid,
}
rcu_read_unlock();
/* We have an implied reference to net thanks to nfsd_net_try_get */
- localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt,
- cred, nfs_fh, fmode);
- if (IS_ERR(localio))
- nfs_to_nfsd_net_put(net);
- else
- nfs_uuid_add_file(uuid, nfl);
+ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, cred,
+ nfs_fh, pnf, fmode);
+ if (!IS_ERR(localio) && nfs_uuid_add_file(uuid, nfl) < 0) {
+ /* Delete the cached file when racing with nfs_uuid_put() */
+ nfs_to_nfsd_file_put_local(pnf);
+ }
+ nfs_to_nfsd_net_put(net);
return localio;
}
@@ -273,8 +303,6 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh);
void nfs_close_local_fh(struct nfs_file_localio *nfl)
{
- struct nfsd_file *ro_nf = NULL;
- struct nfsd_file *rw_nf = NULL;
nfs_uuid_t *nfs_uuid;
rcu_read_lock();
@@ -285,28 +313,39 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl)
return;
}
- ro_nf = rcu_access_pointer(nfl->ro_file);
- rw_nf = rcu_access_pointer(nfl->rw_file);
- if (ro_nf || rw_nf) {
- spin_lock(&nfs_uuid->lock);
- if (ro_nf)
- ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1);
- if (rw_nf)
- rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1);
-
- /* Remove nfl from nfs_uuid->files list */
- RCU_INIT_POINTER(nfl->nfs_uuid, NULL);
- list_del_init(&nfl->list);
+ spin_lock(&nfs_uuid->lock);
+ if (!rcu_access_pointer(nfl->nfs_uuid)) {
+ /* nfs_uuid_put has finished here */
spin_unlock(&nfs_uuid->lock);
rcu_read_unlock();
-
- if (ro_nf)
- nfs_to_nfsd_file_put_local(ro_nf);
- if (rw_nf)
- nfs_to_nfsd_file_put_local(rw_nf);
return;
}
+ if (list_empty(&nfl->list)) {
+ /* nfs_uuid_put() has started closing files, wait for it
+ * to finished
+ */
+ spin_unlock(&nfs_uuid->lock);
+ rcu_read_unlock();
+ wait_var_event(&nfl->nfs_uuid,
+ rcu_access_pointer(nfl->nfs_uuid) == NULL);
+ return;
+ }
+ /* tell nfs_uuid_put() to wait for us */
+ RCU_INIT_POINTER(nfl->nfs_uuid, NULL);
+ spin_unlock(&nfs_uuid->lock);
rcu_read_unlock();
+
+ nfs_to_nfsd_file_put_local(&nfl->ro_file);
+ nfs_to_nfsd_file_put_local(&nfl->rw_file);
+
+ /* Remove nfl from nfs_uuid->files list and signal nfs_uuid_put()
+ * that we are done. The moment we drop the spinlock the
+ * nfs_uuid could be freed.
+ */
+ spin_lock(&nfs_uuid->lock);
+ list_del_init(&nfl->list);
+ wake_up_var_locked(nfs_uuid, &nfs_uuid->lock);
+ spin_unlock(&nfs_uuid->lock);
}
EXPORT_SYMBOL_GPL(nfs_close_local_fh);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index c0bd1509ccd4..879e0b104d1c 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -4,6 +4,7 @@ config NFSD
depends on INET
depends on FILE_LOCKING
depends on FSNOTIFY
+ select CRC32
select LOCKD
select SUNRPC
select EXPORTFS
@@ -76,8 +77,8 @@ config NFSD_V4
select FS_POSIX_ACL
select RPCSEC_GSS_KRB5
select CRYPTO
+ select CRYPTO_LIB_SHA256
select CRYPTO_MD5
- select CRYPTO_SHA256
select GRACE_PERIOD
select NFS_V4_2_SSC_HELPER if NFS_V4_2
help
@@ -172,6 +173,16 @@ config NFSD_LEGACY_CLIENT_TRACKING
recoverydir, or spawn a process directly using a usermodehelper
upcall.
- These legacy client tracking methods have proven to be probelmatic
+ These legacy client tracking methods have proven to be problematic
and will be removed in the future. Say Y here if you need support
for them in the interim.
+
+config NFSD_V4_DELEG_TIMESTAMPS
+ bool "Support delegated timestamps"
+ depends on NFSD_V4
+ default n
+ help
+ NFSD implements delegated timestamps according to
+ draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This
+ is currently an experimental feature and is therefore left disabled
+ by default.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 2f687619f65b..55744bb786c9 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -24,6 +24,7 @@ nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
nfsd-$(CONFIG_NFS_LOCALIO) += localio.o
+nfsd-$(CONFIG_DEBUG_FS) += debugfs.o
.PHONY: xdrgen
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 08a20e5bcf7f..19078a043e85 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -178,11 +178,13 @@ nfsd4_block_proc_layoutcommit(struct inode *inode,
{
struct iomap *iomaps;
int nr_iomaps;
+ __be32 nfserr;
- nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
- if (nr_iomaps < 0)
- return nfserrno(nr_iomaps);
+ nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, &nr_iomaps,
+ i_blocksize(inode));
+ if (nfserr != nfs_ok)
+ return nfserr;
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
@@ -316,11 +318,13 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
{
struct iomap *iomaps;
int nr_iomaps;
+ __be32 nfserr;
- nr_iomaps = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
- lcp->lc_up_len, &iomaps, i_blocksize(inode));
- if (nr_iomaps < 0)
- return nfserrno(nr_iomaps);
+ nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, &nr_iomaps,
+ i_blocksize(inode));
+ if (nfserr != nfs_ok)
+ return nfserr;
return nfsd4_block_commit_blocks(inode, lcp, iomaps, nr_iomaps);
}
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index ce78f74715ee..bcf21fde9120 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -112,35 +112,46 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
return 0;
}
-int
+/**
+ * nfsd4_block_decode_layoutupdate - decode the block layout extent array
+ * @p: pointer to the xdr data
+ * @len: number of bytes to decode
+ * @iomapp: pointer to store the decoded extent array
+ * @nr_iomapsp: pointer to store the number of extents
+ * @block_size: alignment of extent offset and length
+ *
+ * This function decodes the opaque field of the layoutupdate4 structure
+ * in a layoutcommit request for the block layout driver. The field is
+ * actually an array of extents sent by the client. It also checks that
+ * the file offset, storage offset and length of each extent are aligned
+ * by @block_size.
+ *
+ * Return values:
+ * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid
+ * %nfserr_bad_xdr: The encoded array in @p is invalid
+ * %nfserr_inval: An unaligned extent found
+ * %nfserr_delay: Failed to allocate memory for @iomapp
+ */
+__be32
nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size)
+ int *nr_iomapsp, u32 block_size)
{
struct iomap *iomaps;
u32 nr_iomaps, i;
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
- return -EINVAL;
- }
+ if (len < sizeof(u32))
+ return nfserr_bad_xdr;
len -= sizeof(u32);
- if (len % PNFS_BLOCK_EXTENT_SIZE) {
- dprintk("%s: extent array invalid: %u\n", __func__, len);
- return -EINVAL;
- }
+ if (len % PNFS_BLOCK_EXTENT_SIZE)
+ return nfserr_bad_xdr;
nr_iomaps = be32_to_cpup(p++);
- if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) {
- dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, nr_iomaps);
- return -EINVAL;
- }
+ if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE)
+ return nfserr_bad_xdr;
iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
- if (!iomaps) {
- dprintk("%s: failed to allocate extent array\n", __func__);
- return -ENOMEM;
- }
+ if (!iomaps)
+ return nfserr_delay;
for (i = 0; i < nr_iomaps; i++) {
struct pnfs_block_extent bex;
@@ -150,26 +161,18 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
p = xdr_decode_hyper(p, &bex.foff);
if (bex.foff & (block_size - 1)) {
- dprintk("%s: unaligned offset 0x%llx\n",
- __func__, bex.foff);
goto fail;
}
p = xdr_decode_hyper(p, &bex.len);
if (bex.len & (block_size - 1)) {
- dprintk("%s: unaligned length 0x%llx\n",
- __func__, bex.foff);
goto fail;
}
p = xdr_decode_hyper(p, &bex.soff);
if (bex.soff & (block_size - 1)) {
- dprintk("%s: unaligned disk offset 0x%llx\n",
- __func__, bex.soff);
goto fail;
}
bex.es = be32_to_cpup(p++);
if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
- dprintk("%s: incorrect extent state %d\n",
- __func__, bex.es);
goto fail;
}
@@ -178,59 +181,71 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
}
*iomapp = iomaps;
- return nr_iomaps;
+ *nr_iomapsp = nr_iomaps;
+ return nfs_ok;
fail:
kfree(iomaps);
- return -EINVAL;
+ return nfserr_inval;
}
-int
+/**
+ * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array
+ * @p: pointer to the xdr data
+ * @len: number of bytes to decode
+ * @iomapp: pointer to store the decoded extent array
+ * @nr_iomapsp: pointer to store the number of extents
+ * @block_size: alignment of extent offset and length
+ *
+ * This function decodes the opaque field of the layoutupdate4 structure
+ * in a layoutcommit request for the scsi layout driver. The field is
+ * actually an array of extents sent by the client. It also checks that
+ * the offset and length of each extent are aligned by @block_size.
+ *
+ * Return values:
+ * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid
+ * %nfserr_bad_xdr: The encoded array in @p is invalid
+ * %nfserr_inval: An unaligned extent found
+ * %nfserr_delay: Failed to allocate memory for @iomapp
+ */
+__be32
nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size)
+ int *nr_iomapsp, u32 block_size)
{
struct iomap *iomaps;
u32 nr_iomaps, expected, i;
- if (len < sizeof(u32)) {
- dprintk("%s: extent array too small: %u\n", __func__, len);
- return -EINVAL;
- }
+ if (len < sizeof(u32))
+ return nfserr_bad_xdr;
nr_iomaps = be32_to_cpup(p++);
expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE;
- if (len != expected) {
- dprintk("%s: extent array size mismatch: %u/%u\n",
- __func__, len, expected);
- return -EINVAL;
- }
+ if (len != expected)
+ return nfserr_bad_xdr;
iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
- if (!iomaps) {
- dprintk("%s: failed to allocate extent array\n", __func__);
- return -ENOMEM;
- }
+ if (!iomaps)
+ return nfserr_delay;
for (i = 0; i < nr_iomaps; i++) {
u64 val;
p = xdr_decode_hyper(p, &val);
if (val & (block_size - 1)) {
- dprintk("%s: unaligned offset 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].offset = val;
p = xdr_decode_hyper(p, &val);
if (val & (block_size - 1)) {
- dprintk("%s: unaligned length 0x%llx\n", __func__, val);
goto fail;
}
iomaps[i].length = val;
}
*iomapp = iomaps;
- return nr_iomaps;
+ *nr_iomapsp = nr_iomaps;
+ return nfs_ok;
fail:
kfree(iomaps);
- return -EINVAL;
+ return nfserr_inval;
}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index 4e28ac8f1127..15b3569f3d9a 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
const struct nfsd4_getdeviceinfo *gdp);
__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
const struct nfsd4_layoutget *lgp);
-int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
-int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
- u32 block_size);
+__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len,
+ struct iomap **iomapp, int *nr_iomapsp, u32 block_size);
+__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len,
+ struct iomap **iomapp, int *nr_iomapsp, u32 block_size);
#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
new file mode 100644
index 000000000000..84b0c8b559dc
--- /dev/null
+++ b/fs/nfsd/debugfs.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include "nfsd.h"
+
+static struct dentry *nfsd_top_dir __read_mostly;
+
+/*
+ * /sys/kernel/debug/nfsd/disable-splice-read
+ *
+ * Contents:
+ * %0: NFS READ is allowed to use page splicing
+ * %1: NFS READ uses only iov iter read
+ *
+ * The default value of this setting is zero (page splicing is
+ * allowed). This setting takes immediate effect for all NFS
+ * versions, all exports, and in all NFSD net namespaces.
+ */
+
+static int nfsd_dsr_get(void *data, u64 *val)
+{
+ *val = nfsd_disable_splice_read ? 1 : 0;
+ return 0;
+}
+
+static int nfsd_dsr_set(void *data, u64 val)
+{
+ nfsd_disable_splice_read = (val > 0) ? true : false;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
+
+void nfsd_debugfs_exit(void)
+{
+ debugfs_remove_recursive(nfsd_top_dir);
+ nfsd_top_dir = NULL;
+}
+
+void nfsd_debugfs_init(void)
+{
+ nfsd_top_dir = debugfs_create_dir("nfsd", NULL);
+
+ debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
+ nfsd_top_dir, NULL, &nfsd_dsr_fops);
+}
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 0363720280d4..cadfc2bae60e 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -82,8 +82,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
int len;
struct auth_domain *dom = NULL;
int err;
- int fsidtype;
- char *ep;
+ u8 fsidtype;
struct svc_expkey key;
struct svc_expkey *ek = NULL;
@@ -109,10 +108,9 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
err = -EINVAL;
if (qword_get(&mesg, buf, PAGE_SIZE) <= 0)
goto out;
- fsidtype = simple_strtoul(buf, &ep, 10);
- if (*ep)
+ if (kstrtou8(buf, 10, &fsidtype))
goto out;
- dprintk("found fsidtype %d\n", fsidtype);
+ dprintk("found fsidtype %u\n", fsidtype);
if (key_len(fsidtype)==0) /* invalid type */
goto out;
if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0)
@@ -1124,7 +1122,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp,
test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
goto ok;
}
- goto denied;
+ if (!may_bypass_gss)
+ goto denied;
ok:
/* legacy gss-only clients are always OK: */
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 4d92b99c1ffd..b9c0adb3ce09 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -88,7 +88,7 @@ struct svc_expkey {
struct cache_head h;
struct auth_domain * ek_client;
- int ek_fsidtype;
+ u8 ek_fsidtype;
u32 ek_fsid[6];
struct path ek_path;
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 0e552d873eaa..732abf6b92a5 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -319,15 +319,14 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
}
-
-static bool nfsd_file_lru_add(struct nfsd_file *nf)
+static void nfsd_file_lru_add(struct nfsd_file *nf)
{
- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) {
+ refcount_inc(&nf->nf_ref);
+ if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru))
trace_nfsd_file_lru_add(nf);
- return true;
- }
- return false;
+ else
+ WARN_ON(1);
+ nfsd_file_schedule_laundrette();
}
static bool nfsd_file_lru_remove(struct nfsd_file *nf)
@@ -363,51 +362,57 @@ nfsd_file_put(struct nfsd_file *nf)
if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- /*
- * If this is the last reference (nf_ref == 1), then try to
- * transfer it to the LRU.
- */
- if (refcount_dec_not_one(&nf->nf_ref))
- return;
-
- /* Try to add it to the LRU. If that fails, decrement. */
- if (nfsd_file_lru_add(nf)) {
- /* If it's still hashed, we're done */
- if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
- nfsd_file_schedule_laundrette();
- return;
- }
-
- /*
- * We're racing with unhashing, so try to remove it from
- * the LRU. If removal fails, then someone else already
- * has our reference.
- */
- if (!nfsd_file_lru_remove(nf))
- return;
- }
+ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ set_bit(NFSD_FILE_RECENT, &nf->nf_flags);
}
+
if (refcount_dec_and_test(&nf->nf_ref))
nfsd_file_free(nf);
}
/**
* nfsd_file_put_local - put nfsd_file reference and arm nfsd_net_put in caller
- * @nf: nfsd_file of which to put the reference
+ * @pnf: nfsd_file of which to put the reference
*
* First save the associated net to return to caller, then put
* the reference of the nfsd_file.
*/
struct net *
-nfsd_file_put_local(struct nfsd_file *nf)
+nfsd_file_put_local(struct nfsd_file __rcu **pnf)
{
- struct net *net = nf->nf_net;
+ struct nfsd_file *nf;
+ struct net *net = NULL;
- nfsd_file_put(nf);
+ nf = unrcu_pointer(xchg(pnf, NULL));
+ if (nf) {
+ net = nf->nf_net;
+ nfsd_file_put(nf);
+ }
return net;
}
/**
+ * nfsd_file_get_local - get nfsd_file reference and reference to net
+ * @nf: nfsd_file of which to put the reference
+ *
+ * Get reference to both the nfsd_file and nf->nf_net.
+ */
+struct nfsd_file *
+nfsd_file_get_local(struct nfsd_file *nf)
+{
+ struct net *net = nf->nf_net;
+
+ if (nfsd_net_try_get(net)) {
+ nf = nfsd_file_get(nf);
+ if (!nf)
+ nfsd_net_put(net);
+ } else {
+ nf = NULL;
+ }
+ return nf;
+}
+
+/**
* nfsd_file_file - get the backing file of an nfsd_file
* @nf: nfsd_file of which to access the backing file.
*
@@ -446,11 +451,20 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
struct nfsd_file, nf_gc);
struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+ struct svc_serv *serv;
spin_lock(&l->lock);
list_move_tail(&nf->nf_gc, &l->freeme);
spin_unlock(&l->lock);
- svc_wake_up(nn->nfsd_serv);
+
+ /*
+ * The filecache laundrette is shut down after the
+ * nn->nfsd_serv pointer is cleared, but before the
+ * svc_serv is freed.
+ */
+ serv = nn->nfsd_serv;
+ if (serv)
+ svc_wake_up(serv);
}
}
@@ -521,13 +535,12 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
}
/*
- * Put the reference held on behalf of the LRU. If it wasn't the last
- * one, then just remove it from the LRU and ignore it.
+ * Put the reference held on behalf of the LRU if it is the last
+ * reference, else rotate.
*/
- if (!refcount_dec_and_test(&nf->nf_ref)) {
+ if (!refcount_dec_if_one(&nf->nf_ref)) {
trace_nfsd_file_gc_in_use(nf);
- list_lru_isolate(lru, &nf->nf_lru);
- return LRU_REMOVED;
+ return LRU_ROTATE;
}
/* Refcount went to zero. Unhash it and queue it to the dispose list */
@@ -539,14 +552,54 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
return LRU_REMOVED;
}
+static enum lru_status
+nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru,
+ void *arg)
+{
+ struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
+
+ if (test_and_clear_bit(NFSD_FILE_RECENT, &nf->nf_flags)) {
+ /*
+ * "REFERENCED" really means "should be at the end of the
+ * LRU. As we are putting it there we can clear the flag.
+ */
+ clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+ trace_nfsd_file_gc_aged(nf);
+ return LRU_ROTATE;
+ }
+ return nfsd_file_lru_cb(item, lru, arg);
+}
+
+/* If the shrinker runs between calls to list_lru_walk_node() in
+ * nfsd_file_gc(), the "remaining" count will be wrong. This could
+ * result in premature freeing of some files. This may not matter much
+ * but is easy to fix with this spinlock which temporarily disables
+ * the shrinker.
+ */
+static DEFINE_SPINLOCK(nfsd_gc_lock);
static void
nfsd_file_gc(void)
{
+ unsigned long ret = 0;
LIST_HEAD(dispose);
- unsigned long ret;
+ int nid;
+
+ spin_lock(&nfsd_gc_lock);
+ for_each_node_state(nid, N_NORMAL_MEMORY) {
+ unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid);
- ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
- &dispose, list_lru_count(&nfsd_file_lru));
+ while (remaining > 0) {
+ unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH);
+
+ remaining -= nr;
+ ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb,
+ &dispose, &nr);
+ if (nr)
+ /* walk aborted early */
+ remaining = 0;
+ }
+ }
+ spin_unlock(&nfsd_gc_lock);
trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
nfsd_file_dispose_list_delayed(&dispose);
}
@@ -554,9 +607,9 @@ nfsd_file_gc(void)
static void
nfsd_file_gc_worker(struct work_struct *work)
{
- nfsd_file_gc();
if (list_lru_count(&nfsd_file_lru))
- nfsd_file_schedule_laundrette();
+ nfsd_file_gc();
+ nfsd_file_schedule_laundrette();
}
static unsigned long
@@ -571,8 +624,12 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
LIST_HEAD(dispose);
unsigned long ret;
+ if (!spin_trylock(&nfsd_gc_lock))
+ return SHRINK_STOP;
+
ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
nfsd_file_lru_cb, &dispose);
+ spin_unlock(&nfsd_gc_lock);
trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
nfsd_file_dispose_list_delayed(&dispose);
return ret;
@@ -677,17 +734,12 @@ nfsd_file_close_inode(struct inode *inode)
void
nfsd_file_close_inode_sync(struct inode *inode)
{
- struct nfsd_file *nf;
LIST_HEAD(dispose);
trace_nfsd_file_close(inode);
nfsd_file_queue_for_close(inode, &dispose);
- while (!list_empty(&dispose)) {
- nf = list_first_entry(&dispose, struct nfsd_file, nf_gc);
- list_del_init(&nf->nf_gc);
- nfsd_file_free(nf);
- }
+ nfsd_file_dispose_list(&dispose);
}
static int
@@ -1049,16 +1101,8 @@ retry:
nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc);
rcu_read_unlock();
- if (nf) {
- /*
- * If the nf is on the LRU then it holds an extra reference
- * that must be put if it's removed. It had better not be
- * the last one however, since we should hold another.
- */
- if (nfsd_file_lru_remove(nf))
- refcount_dec(&nf->nf_ref);
+ if (nf)
goto wait_for_construction;
- }
new = nfsd_file_alloc(net, inode, need, want_gc);
if (!new) {
@@ -1152,6 +1196,9 @@ open_file:
*/
if (status != nfs_ok || inode->i_nlink == 0)
nfsd_file_unhash(nf);
+ else if (want_gc)
+ nfsd_file_lru_add(nf);
+
clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags);
if (status == nfs_ok)
goto out;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index d5db6b34ba30..722b26c71e45 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -4,6 +4,12 @@
#include <linux/fsnotify_backend.h>
/*
+ * Limit the time that the list_lru_one lock is held during
+ * an LRU scan.
+ */
+#define NFSD_FILE_GC_BATCH (16UL)
+
+/*
* This is the fsnotify_mark container that nfsd attaches to the files that it
* is holding open. Note that we have a separate refcount here aside from the
* one in the fsnotify_mark. We only want a single fsnotify_mark attached to
@@ -38,6 +44,7 @@ struct nfsd_file {
#define NFSD_FILE_PENDING (1)
#define NFSD_FILE_REFERENCED (2)
#define NFSD_FILE_GC (3)
+#define NFSD_FILE_RECENT (4)
unsigned long nf_flags;
refcount_t nf_ref;
unsigned char nf_may;
@@ -55,7 +62,8 @@ void nfsd_file_cache_shutdown(void);
int nfsd_file_cache_start_net(struct net *net);
void nfsd_file_cache_shutdown_net(struct net *net);
void nfsd_file_put(struct nfsd_file *nf);
-struct net *nfsd_file_put_local(struct nfsd_file *nf);
+struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf);
+struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf);
struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
struct file *nfsd_file_file(struct nfsd_file *nf);
void nfsd_file_close_inode_sync(struct inode *inode);
diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c
index 238647fa379e..cb237f1b902a 100644
--- a/fs/nfsd/localio.c
+++ b/fs/nfsd/localio.c
@@ -24,21 +24,6 @@
#include "filecache.h"
#include "cache.h"
-static const struct nfsd_localio_operations nfsd_localio_ops = {
- .nfsd_net_try_get = nfsd_net_try_get,
- .nfsd_net_put = nfsd_net_put,
- .nfsd_open_local_fh = nfsd_open_local_fh,
- .nfsd_file_put_local = nfsd_file_put_local,
- .nfsd_file_get = nfsd_file_get,
- .nfsd_file_put = nfsd_file_put,
- .nfsd_file_file = nfsd_file_file,
-};
-
-void nfsd_localio_ops_init(void)
-{
- nfs_to = &nfsd_localio_ops;
-}
-
/**
* nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file
*
@@ -47,6 +32,7 @@ void nfsd_localio_ops_init(void)
* @rpc_clnt: rpc_clnt that the client established
* @cred: cred that the client established
* @nfs_fh: filehandle to lookup
+ * @pnf: place to find the nfsd_file, or store it if it was non-NULL
* @fmode: fmode_t to use for open
*
* This function maps a local fh to a path on a local filesystem.
@@ -57,10 +43,11 @@ void nfsd_localio_ops_init(void)
* set. Caller (NFS client) is responsible for calling nfsd_net_put and
* nfsd_file_put (via nfs_to_nfsd_file_put_local).
*/
-struct nfsd_file *
+static struct nfsd_file *
nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
struct rpc_clnt *rpc_clnt, const struct cred *cred,
- const struct nfs_fh *nfs_fh, const fmode_t fmode)
+ const struct nfs_fh *nfs_fh, struct nfsd_file __rcu **pnf,
+ const fmode_t fmode)
{
int mayflags = NFSD_MAY_LOCALIO;
struct svc_cred rq_cred;
@@ -71,6 +58,15 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
if (nfs_fh->size > NFS4_FHSIZE)
return ERR_PTR(-EINVAL);
+ if (!nfsd_net_try_get(net))
+ return ERR_PTR(-ENXIO);
+
+ rcu_read_lock();
+ localio = nfsd_file_get(rcu_dereference(*pnf));
+ rcu_read_unlock();
+ if (localio)
+ return localio;
+
/* nfs_fh -> svc_fh */
fh_init(&fh, NFS4_FHSIZE);
fh.fh_handle.fh_size = nfs_fh->size;
@@ -92,9 +88,48 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom,
if (rq_cred.cr_group_info)
put_group_info(rq_cred.cr_group_info);
+ if (!IS_ERR(localio)) {
+ struct nfsd_file *new;
+ if (!nfsd_net_try_get(net)) {
+ nfsd_file_put(localio);
+ nfsd_net_put(net);
+ return ERR_PTR(-ENXIO);
+ }
+ nfsd_file_get(localio);
+ again:
+ new = unrcu_pointer(cmpxchg(pnf, NULL, RCU_INITIALIZER(localio)));
+ if (new) {
+ /* Some other thread installed an nfsd_file */
+ if (nfsd_file_get(new) == NULL)
+ goto again;
+ /*
+ * Drop the ref we were going to install (both file and
+ * net) and the one we were going to return (only file).
+ */
+ nfsd_file_put(localio);
+ nfsd_net_put(net);
+ nfsd_file_put(localio);
+ localio = new;
+ }
+ } else
+ nfsd_net_put(net);
+
return localio;
}
-EXPORT_SYMBOL_GPL(nfsd_open_local_fh);
+
+static const struct nfsd_localio_operations nfsd_localio_ops = {
+ .nfsd_net_try_get = nfsd_net_try_get,
+ .nfsd_net_put = nfsd_net_put,
+ .nfsd_open_local_fh = nfsd_open_local_fh,
+ .nfsd_file_put_local = nfsd_file_put_local,
+ .nfsd_file_get_local = nfsd_file_get_local,
+ .nfsd_file_file = nfsd_file_file,
+};
+
+void nfsd_localio_ops_init(void)
+{
+ nfs_to = &nfsd_localio_ops;
+}
/*
* UUID_IS_LOCAL XDR functions
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4e3be7201b1c..5fb202acb0fd 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -84,6 +84,8 @@ out:
fail:
posix_acl_release(resp->acl_access);
posix_acl_release(resp->acl_default);
+ resp->acl_access = NULL;
+ resp->acl_default = NULL;
goto out;
}
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 5e34e98db969..7b5433bd3019 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -76,6 +76,8 @@ out:
fail:
posix_acl_release(resp->acl_access);
posix_acl_release(resp->acl_default);
+ resp->acl_access = NULL;
+ resp->acl_default = NULL;
goto out;
}
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 372bdcf5e07a..b6d03e1ef5f7 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -14,6 +14,7 @@
#include "xdr3.h"
#include "vfs.h"
#include "filecache.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -69,8 +70,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: GETATTR(3) %s\n",
- SVCFH_fmt(&argp->fh));
+ trace_nfsd_vfs_getattr(rqstp, &argp->fh);
fh_copy(&resp->fh, &argp->fh);
resp->status = fh_verify(rqstp, &resp->fh, 0,
@@ -220,7 +220,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
struct nfsd3_writeargs *argp = rqstp->rq_argp;
struct nfsd3_writeres *resp = rqstp->rq_resp;
unsigned long cnt = argp->len;
- unsigned int nvecs;
dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
SVCFH_fmt(&argp->fh),
@@ -235,10 +234,8 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
- nvecs = svc_fill_write_vector(rqstp, &argp->payload);
-
resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, nvecs, &cnt,
+ &argp->payload, &cnt,
resp->committed, resp->verf);
resp->count = cnt;
resp->status = nfsd3_map_status(resp->status);
@@ -266,6 +263,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 status;
int host_err;
+ trace_nfsd_vfs_create(rqstp, fhp, S_IFREG, argp->name, argp->len);
+
if (isdotent(argp->name, argp->len))
return nfserr_exist;
if (!(iap->ia_valid & ATTR_MODE))
@@ -284,7 +283,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
inode_lock_nested(inode, I_MUTEX_PARENT);
- child = lookup_one_len(argp->name, parent, argp->len);
+ child = lookup_one(&nop_mnt_idmap,
+ &QSTR_LEN(argp->name, argp->len),
+ parent);
if (IS_ERR(child)) {
status = nfserrno(PTR_ERR(child));
goto out;
@@ -380,11 +381,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
struct nfsd3_diropres *resp = rqstp->rq_resp;
svc_fh *dirfhp, *newfhp;
- dprintk("nfsd: CREATE(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
dirfhp = fh_copy(&resp->dirfh, &argp->fh);
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
@@ -405,11 +401,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
.na_iattr = &argp->attrs,
};
- dprintk("nfsd: MKDIR(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
argp->attrs.ia_valid &= ~ATTR_SIZE;
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
@@ -445,11 +436,6 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
goto out;
}
- dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
- SVCFH_fmt(&argp->ffh),
- argp->flen, argp->fname,
- argp->tlen, argp->tname);
-
fh_copy(&resp->dirfh, &argp->ffh);
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
@@ -474,11 +460,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
int type;
dev_t rdev = 0;
- dprintk("nfsd: MKNOD(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
@@ -511,11 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: REMOVE(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
/* Unlink. -S_IFDIR means file must not be a directory */
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
@@ -533,11 +509,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: RMDIR(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
argp->name, argp->len);
@@ -551,15 +522,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp)
struct nfsd3_renameargs *argp = rqstp->rq_argp;
struct nfsd3_renameres *resp = rqstp->rq_resp;
- dprintk("nfsd: RENAME(3) %s %.*s ->\n",
- SVCFH_fmt(&argp->ffh),
- argp->flen,
- argp->fname);
- dprintk("nfsd: -> %s %.*s\n",
- SVCFH_fmt(&argp->tfh),
- argp->tlen,
- argp->tname);
-
fh_copy(&resp->ffh, &argp->ffh);
fh_copy(&resp->tfh, &argp->tfh);
resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
@@ -574,13 +536,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
struct nfsd3_linkargs *argp = rqstp->rq_argp;
struct nfsd3_linkres *resp = rqstp->rq_resp;
- dprintk("nfsd: LINK(3) %s ->\n",
- SVCFH_fmt(&argp->ffh));
- dprintk("nfsd: -> %s %.*s\n",
- SVCFH_fmt(&argp->tfh),
- argp->tlen,
- argp->tname);
-
fh_copy(&resp->fh, &argp->ffh);
fh_copy(&resp->tfh, &argp->tfh);
resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
@@ -606,7 +561,7 @@ static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
- xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
+ xdr_init_encode_pages(xdr, buf);
}
/*
@@ -619,9 +574,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
struct nfsd3_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
- SVCFH_fmt(&argp->fh),
- argp->count, (u32) argp->cookie);
+ trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie);
nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
@@ -653,9 +606,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
struct nfsd3_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
- SVCFH_fmt(&argp->fh),
- argp->count, (u32) argp->cookie);
+ trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie);
nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
@@ -696,9 +647,6 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_fsstatres *resp = rqstp->rq_resp;
- dprintk("nfsd: FSSTAT(3) %s\n",
- SVCFH_fmt(&argp->fh));
-
resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
resp->status = nfsd3_map_status(resp->status);
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index a7a07470c1f8..ef4971d71ac4 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1001,7 +1001,9 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_positive_unlocked(name, dparent, namlen);
+ dchild = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR_LEN(name, namlen),
+ dparent);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 50e468bdb8d4..e00b2aea8da2 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -46,8 +46,6 @@
#define NFSDDBG_FACILITY NFSDDBG_PROC
-static void nfsd4_mark_cb_fault(struct nfs4_client *clp);
-
#define NFSPROC4_CB_NULL 0
#define NFSPROC4_CB_COMPOUND 1
@@ -101,15 +99,15 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap,
if (bitmap[0] & FATTR4_WORD0_CHANGE)
if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0)
- return -NFSERR_BAD_XDR;
+ return -EIO;
if (bitmap[0] & FATTR4_WORD0_SIZE)
if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0)
- return -NFSERR_BAD_XDR;
+ return -EIO;
if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) {
fattr4_time_deleg_access access;
if (!xdrgen_decode_fattr4_time_deleg_access(xdr, &access))
- return -NFSERR_BAD_XDR;
+ return -EIO;
fattr->ncf_cb_atime.tv_sec = access.seconds;
fattr->ncf_cb_atime.tv_nsec = access.nseconds;
@@ -118,7 +116,7 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap,
fattr4_time_deleg_modify modify;
if (!xdrgen_decode_fattr4_time_deleg_modify(xdr, &modify))
- return -NFSERR_BAD_XDR;
+ return -EIO;
fattr->ncf_cb_mtime.tv_sec = modify.seconds;
fattr->ncf_cb_mtime.tv_nsec = modify.nseconds;
@@ -419,6 +417,29 @@ static u32 highest_slotid(struct nfsd4_session *ses)
return idx;
}
+static void
+encode_referring_call4(struct xdr_stream *xdr,
+ const struct nfsd4_referring_call *rc)
+{
+ encode_uint32(xdr, rc->rc_sequenceid);
+ encode_uint32(xdr, rc->rc_slotid);
+}
+
+static void
+encode_referring_call_list4(struct xdr_stream *xdr,
+ const struct nfsd4_referring_call_list *rcl)
+{
+ struct nfsd4_referring_call *rc;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+ xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ encode_uint32(xdr, rcl->__nr_referring_calls);
+ list_for_each_entry(rc, &rcl->rcl_referring_calls, __list)
+ encode_referring_call4(xdr, rc);
+}
+
/*
* CB_SEQUENCE4args
*
@@ -436,6 +457,7 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
struct nfs4_cb_compound_hdr *hdr)
{
struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
+ struct nfsd4_referring_call_list *rcl;
__be32 *p;
if (hdr->minorversion == 0)
@@ -444,12 +466,16 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
encode_sessionid4(xdr, session);
- p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 4);
*p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */
*p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */
*p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
*p++ = xdr_zero; /* csa_cachethis */
- xdr_encode_empty_array(p); /* csa_referring_call_lists */
+
+ /* csa_referring_call_lists */
+ encode_uint32(xdr, cb->cb_nr_referring_call_list);
+ list_for_each_entry(rcl, &cb->cb_referring_call_list, __list)
+ encode_referring_call_list4(xdr, rcl);
hdr->nops++;
}
@@ -679,18 +705,18 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp,
return status;
status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status);
- if (unlikely(status || cb->cb_seq_status))
+ if (unlikely(status || cb->cb_status))
return status;
if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0)
- return -NFSERR_BAD_XDR;
+ return -EIO;
if (xdr_stream_decode_u32(xdr, &attrlen) < 0)
- return -NFSERR_BAD_XDR;
+ return -EIO;
maxlen = sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize);
if (bitmap[2] != 0)
maxlen += (sizeof(ncf->ncf_cb_mtime.tv_sec) +
sizeof(ncf->ncf_cb_mtime.tv_nsec)) * 2;
if (attrlen > maxlen)
- return -NFSERR_BAD_XDR;
+ return -EIO;
status = decode_cb_fattr4(xdr, bitmap, ncf);
return status;
}
@@ -1064,6 +1090,17 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
return queue_work(clp->cl_callback_wq, &cb->cb_work);
}
+static void nfsd4_requeue_cb(struct rpc_task *task, struct nfsd4_callback *cb)
+{
+ struct nfs4_client *clp = cb->cb_clp;
+
+ if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
+ trace_nfsd_cb_restart(clp, cb);
+ task->tk_status = 0;
+ set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags);
+ }
+}
+
static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
{
atomic_inc(&clp->cl_cb_inflight);
@@ -1301,15 +1338,113 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
trace_nfsd_cb_destroy(clp, cb);
nfsd41_cb_release_slot(cb);
+ if (test_bit(NFSD4_CALLBACK_WAKE, &cb->cb_flags))
+ clear_and_wake_up_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags);
+ else
+ clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags);
+
if (cb->cb_ops && cb->cb_ops->release)
cb->cb_ops->release(cb);
nfsd41_cb_inflight_end(clp);
}
-/*
- * TODO: cb_sequence should support referring call lists, cachethis,
- * and mark callback channel down on communication errors.
+/**
+ * nfsd41_cb_referring_call - add a referring call to a callback operation
+ * @cb: context of callback to add the rc to
+ * @sessionid: referring call's session ID
+ * @slotid: referring call's session slot index
+ * @seqno: referring call's slot sequence number
+ *
+ * Caller serializes access to @cb.
+ *
+ * NB: If memory allocation fails, the referring call is not added.
*/
+void nfsd41_cb_referring_call(struct nfsd4_callback *cb,
+ struct nfs4_sessionid *sessionid,
+ u32 slotid, u32 seqno)
+{
+ struct nfsd4_referring_call_list *rcl;
+ struct nfsd4_referring_call *rc;
+ bool found;
+
+ might_sleep();
+
+ found = false;
+ list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) {
+ if (!memcmp(rcl->rcl_sessionid.data, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN)) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rcl = kmalloc(sizeof(*rcl), GFP_KERNEL);
+ if (!rcl)
+ return;
+ memcpy(rcl->rcl_sessionid.data, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN);
+ rcl->__nr_referring_calls = 0;
+ INIT_LIST_HEAD(&rcl->rcl_referring_calls);
+ list_add(&rcl->__list, &cb->cb_referring_call_list);
+ cb->cb_nr_referring_call_list++;
+ }
+
+ found = false;
+ list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) {
+ if (rc->rc_sequenceid == seqno && rc->rc_slotid == slotid) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rc = kmalloc(sizeof(*rc), GFP_KERNEL);
+ if (!rc)
+ goto out;
+ rc->rc_sequenceid = seqno;
+ rc->rc_slotid = slotid;
+ rcl->__nr_referring_calls++;
+ list_add(&rc->__list, &rcl->rcl_referring_calls);
+ }
+
+out:
+ if (!rcl->__nr_referring_calls) {
+ cb->cb_nr_referring_call_list--;
+ list_del(&rcl->__list);
+ kfree(rcl);
+ }
+}
+
+/**
+ * nfsd41_cb_destroy_referring_call_list - release referring call info
+ * @cb: context of a callback that has completed
+ *
+ * Callers who allocate referring calls using nfsd41_cb_referring_call() must
+ * release those resources by calling nfsd41_cb_destroy_referring_call_list.
+ *
+ * Caller serializes access to @cb.
+ */
+void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb)
+{
+ struct nfsd4_referring_call_list *rcl;
+ struct nfsd4_referring_call *rc;
+
+ while (!list_empty(&cb->cb_referring_call_list)) {
+ rcl = list_first_entry(&cb->cb_referring_call_list,
+ struct nfsd4_referring_call_list,
+ __list);
+
+ while (!list_empty(&rcl->rcl_referring_calls)) {
+ rc = list_first_entry(&rcl->rcl_referring_calls,
+ struct nfsd4_referring_call,
+ __list);
+ list_del(&rc->__list);
+ kfree(rc);
+ }
+ list_del(&rcl->__list);
+ kfree(rcl);
+ }
+}
+
static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
@@ -1328,30 +1463,14 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
rpc_call_start(task);
}
+/* Returns true if CB_COMPOUND processing should continue */
static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb)
{
- struct nfs4_client *clp = cb->cb_clp;
- struct nfsd4_session *session = clp->cl_cb_session;
- bool ret = true;
-
- if (!clp->cl_minorversion) {
- /*
- * If the backchannel connection was shut down while this
- * task was queued, we need to resubmit it after setting up
- * a new backchannel connection.
- *
- * Note that if we lost our callback connection permanently
- * the submission code will error out, so we don't need to
- * handle that case here.
- */
- if (RPC_SIGNALLED(task))
- goto need_restart;
-
- return true;
- }
+ struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
+ bool ret = false;
if (cb->cb_held_slot < 0)
- goto need_restart;
+ goto requeue;
/* This is the operation status code for CB_SEQUENCE */
trace_nfsd_cb_seq_status(task, cb);
@@ -1365,11 +1484,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
* (sequence ID, cached reply) MUST NOT change.
*/
++session->se_cb_seq_nr[cb->cb_held_slot];
+ ret = true;
break;
case -ESERVERFAULT:
- ++session->se_cb_seq_nr[cb->cb_held_slot];
+ /*
+ * Call succeeded, but the session, slot index, or slot
+ * sequence number in the response do not match the same
+ * in the server's call. The sequence information is thus
+ * untrustworthy.
+ */
nfsd4_mark_cb_fault(cb->cb_clp);
- ret = false;
break;
case 1:
/*
@@ -1381,43 +1505,42 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
fallthrough;
case -NFS4ERR_BADSESSION:
nfsd4_mark_cb_fault(cb->cb_clp);
- ret = false;
- goto need_restart;
+ goto requeue;
case -NFS4ERR_DELAY:
cb->cb_seq_status = 1;
- if (!rpc_restart_call(task))
- goto out;
-
+ if (RPC_SIGNALLED(task) || !rpc_restart_call(task))
+ goto requeue;
rpc_delay(task, 2 * HZ);
return false;
+ case -NFS4ERR_SEQ_MISORDERED:
case -NFS4ERR_BADSLOT:
+ /*
+ * A SEQ_MISORDERED or BADSLOT error means that the client and
+ * server are out of sync as to the backchannel parameters. Mark
+ * the backchannel faulty and restart the RPC, but leak the slot
+ * so that it's no longer used.
+ */
+ nfsd4_mark_cb_fault(cb->cb_clp);
+ cb->cb_held_slot = -1;
goto retry_nowait;
- case -NFS4ERR_SEQ_MISORDERED:
- if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) {
- session->se_cb_seq_nr[cb->cb_held_slot] = 1;
- goto retry_nowait;
- }
- break;
default:
nfsd4_mark_cb_fault(cb->cb_clp);
}
trace_nfsd_cb_free_slot(task, cb);
nfsd41_cb_release_slot(cb);
-
- if (RPC_SIGNALLED(task))
- goto need_restart;
-out:
return ret;
retry_nowait:
- if (rpc_restart_call_prepare(task))
- ret = false;
- goto out;
-need_restart:
- if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
- trace_nfsd_cb_restart(clp, cb);
- task->tk_status = 0;
- cb->cb_need_restart = true;
+ /*
+ * RPC_SIGNALLED() means that the rpc_client is being torn down and
+ * (possibly) recreated. Requeue the call in that case.
+ */
+ if (!RPC_SIGNALLED(task)) {
+ if (rpc_restart_call_prepare(task))
+ return false;
}
+requeue:
+ nfsd41_cb_release_slot(cb);
+ nfsd4_requeue_cb(task, cb);
return false;
}
@@ -1428,8 +1551,21 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
trace_nfsd_cb_rpc_done(clp);
- if (!nfsd4_cb_sequence_done(task, cb))
+ if (!clp->cl_minorversion) {
+ /*
+ * If the backchannel connection was shut down while this
+ * task was queued, we need to resubmit it after setting up
+ * a new backchannel connection.
+ *
+ * Note that if we lost our callback connection permanently
+ * the submission code will error out, so we don't need to
+ * handle that case here.
+ */
+ if (RPC_SIGNALLED(task))
+ nfsd4_requeue_cb(task, cb);
+ } else if (!nfsd4_cb_sequence_done(task, cb)) {
return;
+ }
if (cb->cb_status) {
WARN_ONCE(task->tk_status,
@@ -1462,7 +1598,7 @@ static void nfsd4_cb_release(void *calldata)
trace_nfsd_cb_rpc_release(cb->cb_clp);
- if (cb->cb_need_restart)
+ if (test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags))
nfsd4_queue_cb(cb);
else
nfsd41_destroy_cb(cb);
@@ -1575,7 +1711,7 @@ nfsd4_run_cb_work(struct work_struct *work)
container_of(work, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
- int flags;
+ int flags, ret;
trace_nfsd_cb_start(clp);
@@ -1583,8 +1719,11 @@ nfsd4_run_cb_work(struct work_struct *work)
nfsd4_process_cb_update(cb);
clnt = clp->cl_cb_client;
- if (!clnt) {
- /* Callback channel broken, or client killed; give up: */
+ if (!clnt || clp->cl_state == NFSD4_COURTESY) {
+ /*
+ * Callback channel broken, client killed or
+ * nfs4_client in courtesy state; give up.
+ */
nfsd41_destroy_cb(cb);
return;
}
@@ -1598,16 +1737,19 @@ nfsd4_run_cb_work(struct work_struct *work)
return;
}
- if (cb->cb_need_restart) {
- cb->cb_need_restart = false;
- } else {
+ if (!test_and_clear_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) {
if (cb->cb_ops && cb->cb_ops->prepare)
cb->cb_ops->prepare(cb);
}
+
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
- rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
- cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
+ ret = rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
+ cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
+ if (ret != 0) {
+ set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags);
+ nfsd4_queue_cb(cb);
+ }
}
void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
@@ -1617,11 +1759,13 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
cb->cb_msg.rpc_argp = cb;
cb->cb_msg.rpc_resp = cb;
+ cb->cb_flags = 0;
cb->cb_ops = ops;
INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
cb->cb_status = 0;
- cb->cb_need_restart = false;
cb->cb_held_slot = -1;
+ cb->cb_nr_referring_call_list = 0;
+ INIT_LIST_HEAD(&cb->cb_referring_call_list);
}
/**
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index fbfddd3c4c94..aea905fcaf87 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -65,7 +65,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp)
return;
map->fsid_type = fh->fh_fsid_type;
- memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+ memcpy(&map->fsid, fh_fsid(fh), fsid_len);
spin_lock(&nfsd_devid_lock);
if (fhp->fh_export->ex_devid_map)
@@ -75,7 +75,7 @@ nfsd4_alloc_devid_map(const struct svc_fh *fhp)
list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
if (old->fsid_type != fh->fh_fsid_type)
continue;
- if (memcmp(old->fsid, fh->fh_fsid,
+ if (memcmp(old->fsid, fh_fsid(fh),
key_len(old->fsid_type)))
continue;
@@ -344,9 +344,10 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
- refcount_inc(&ls->ls_stid.sc_count);
- nfsd4_run_cb(&ls->ls_recall);
-
+ if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ls->ls_recall.cb_flags)) {
+ refcount_inc(&ls->ls_stid.sc_count);
+ nfsd4_run_cb(&ls->ls_recall);
+ }
out_unlock:
spin_unlock(&ls->ls_lock);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index f6e06c779d09..71b428efcbb5 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -266,7 +266,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
inode_lock_nested(inode, I_MUTEX_PARENT);
- child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
+ child = lookup_one(&nop_mnt_idmap,
+ &QSTR_LEN(open->op_fname, open->op_fnamelen),
+ parent);
if (IS_ERR(child)) {
status = nfserrno(PTR_ERR(child));
goto out;
@@ -876,6 +878,8 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_getattr *getattr = &u->getattr;
__be32 status;
+ trace_nfsd_vfs_getattr(rqstp, &cstate->current_fh);
+
status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
if (status)
return status;
@@ -998,6 +1002,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
u64 cookie = readdir->rd_cookie;
static const nfs4_verifier zeroverf;
+ trace_nfsd_vfs_readdir(rqstp, &cstate->current_fh,
+ readdir->rd_maxcount, readdir->rd_cookie);
+
/* no need to check permission - this will be done in nfsd_readdir() */
if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
@@ -1211,7 +1218,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd_file *nf = NULL;
__be32 status = nfs_ok;
unsigned long cnt;
- int nvecs;
if (write->wr_offset > (u64)OFFSET_MAX ||
write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX)
@@ -1226,13 +1232,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
write->wr_how_written = write->wr_stable_how;
-
- nvecs = svc_fill_write_vector(rqstp, &write->wr_payload);
- WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
-
status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
- write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
- write->wr_how_written,
+ write->wr_offset, &write->wr_payload,
+ &cnt, write->wr_how_written,
(__be32 *)write->wr_verifier.data);
nfsd_file_put(nf);
@@ -1379,8 +1381,11 @@ static void nfs4_put_copy(struct nfsd4_copy *copy)
static void nfsd4_stop_copy(struct nfsd4_copy *copy)
{
trace_nfsd_copy_async_cancel(copy);
- if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
+ if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags)) {
kthread_stop(copy->copy_task);
+ copy->nfserr = nfs_ok;
+ set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags);
+ }
nfs4_put_copy(copy);
}
@@ -1709,10 +1714,11 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
switch (task->tk_status) {
case -NFS4ERR_DELAY:
if (cbo->co_retries--) {
- rpc_delay(task, 1 * HZ);
+ rpc_delay(task, HZ / 5);
return 0;
}
}
+ nfsd41_cb_destroy_referring_call_list(cb);
return 1;
}
@@ -1845,9 +1851,12 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy)
nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
NFSPROC4_CLNT_CB_OFFLOAD);
+ nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid,
+ cbo->co_referring_slotid,
+ cbo->co_referring_seqno);
trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
&cbo->co_fh, copy->cp_count, copy->nfserr);
- nfsd4_run_cb(&cbo->co_cb);
+ nfsd4_try_run_cb(&cbo->co_cb);
}
/**
@@ -1908,13 +1917,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd42_write_res *result;
__be32 status;
- /*
- * Currently, async COPY is not reliable. Force all COPY
- * requests to be synchronous to avoid client application
- * hangs waiting for COPY completion.
- */
- nfsd4_copy_set_sync(copy, true);
-
result = &copy->cp_res;
nfsd_copy_write_verifier((__be32 *)&result->wr_verifier.data, nn);
@@ -1961,6 +1963,11 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid,
sizeof(result->cb_stateid));
dup_copy_fields(copy, async_copy);
+ memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data,
+ cstate->session->se_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ async_copy->cp_cb_offload.co_referring_slotid = cstate->slot->sl_index;
+ async_copy->cp_cb_offload.co_referring_seqno = cstate->slot->sl_seqid;
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
async_copy, "%s", "copy thread");
if (IS_ERR(async_copy->copy_task))
@@ -2828,20 +2835,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
rqstp->rq_lease_breaker = (void **)&cstate->clp;
- trace_nfsd_compound(rqstp, args->tag, args->taglen, args->client_opcnt);
+ trace_nfsd_compound(rqstp, args->tag, args->taglen, args->opcnt);
while (!status && resp->opcnt < args->opcnt) {
op = &args->ops[resp->opcnt++];
- if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
- /* If there are still more operations to process,
- * stop here and report NFS4ERR_RESOURCE. */
- if (cstate->minorversion == 0 &&
- args->client_opcnt > resp->opcnt) {
- op->status = nfserr_resource;
- goto encode_op;
- }
- }
-
/*
* The XDR decode routines may have pre-set op->status;
* for example, if there is a miscellaneous XDR error
@@ -2918,7 +2915,7 @@ encode_op:
status = op->status;
}
- trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
+ trace_nfsd_compound_status(args->opcnt, resp->opcnt,
status, nfsd4_op_name(op->opnum));
nfsd4_cstate_clear_replay(cstate);
@@ -3766,7 +3763,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
u32 opiter;
- if (!cstate->minorversion)
+ if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] ||
+ cstate->minorversion == 0)
return false;
if (cstate->spo_must_allowed)
@@ -3832,7 +3830,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_ressize = sizeof(struct nfsd4_compoundres),
.pc_release = nfsd4_release_compoundargs,
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = NFSD_BUFSIZE/4,
+ .pc_xdrressize = 3+NFSSVC_MAXBLKSIZE/4,
.pc_name = "COMPOUND",
},
};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 28f4d5311c40..2231192ec33f 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -33,6 +33,7 @@
*/
#include <crypto/hash.h>
+#include <crypto/sha2.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
@@ -218,7 +219,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
/* lock the parent */
inode_lock(d_inode(dir));
- dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
@@ -233,9 +234,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
* as well be forgiving and just succeed silently.
*/
goto out_put;
- status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU);
+ if (IS_ERR(dentry))
+ status = PTR_ERR(dentry);
out_put:
- dput(dentry);
+ if (!status)
+ dput(dentry);
out_unlock:
inode_unlock(d_inode(dir));
if (status == 0) {
@@ -313,7 +317,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
if (!status) {
struct dentry *dentry;
- dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
+ dentry = lookup_one(&nop_mnt_idmap,
+ &QSTR(entry->name), dir);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
break;
@@ -336,16 +341,16 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
}
static int
-nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
+nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn)
{
struct dentry *dir, *dentry;
int status;
- dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
+ dprintk("NFSD: nfsd4_unlink_clid_dir. name %s\n", name);
dir = nn->rec_file->f_path.dentry;
inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
- dentry = lookup_one_len(name, dir, namlen);
+ dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir);
if (IS_ERR(dentry)) {
status = PTR_ERR(dentry);
goto out_unlock;
@@ -405,7 +410,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
if (status < 0)
goto out_drop_write;
- status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
+ status = nfsd4_unlink_clid_dir(dname, nn);
nfs4_reset_creds(original_cred);
if (status == 0) {
vfs_fsync(nn->rec_file, 0);
@@ -733,7 +738,6 @@ struct cld_net {
spinlock_t cn_lock;
struct list_head cn_list;
unsigned int cn_xid;
- struct crypto_shash *cn_tfm;
#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
bool cn_has_legacy;
#endif
@@ -946,38 +950,32 @@ static const struct rpc_pipe_ops cld_upcall_ops = {
.destroy_msg = cld_pipe_destroy_msg,
};
-static struct dentry *
+static int
nfsd4_cld_register_sb(struct super_block *sb, struct rpc_pipe *pipe)
{
- struct dentry *dir, *dentry;
+ struct dentry *dir;
+ int err;
dir = rpc_d_lookup_sb(sb, NFSD_PIPE_DIR);
if (dir == NULL)
- return ERR_PTR(-ENOENT);
- dentry = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
+ return -ENOENT;
+ err = rpc_mkpipe_dentry(dir, NFSD_CLD_PIPE, NULL, pipe);
dput(dir);
- return dentry;
+ return err;
}
-static void
-nfsd4_cld_unregister_sb(struct rpc_pipe *pipe)
-{
- if (pipe->dentry)
- rpc_unlink(pipe->dentry);
-}
-
-static struct dentry *
+static int
nfsd4_cld_register_net(struct net *net, struct rpc_pipe *pipe)
{
struct super_block *sb;
- struct dentry *dentry;
+ int err;
sb = rpc_get_sb_net(net);
if (!sb)
- return NULL;
- dentry = nfsd4_cld_register_sb(sb, pipe);
+ return 0;
+ err = nfsd4_cld_register_sb(sb, pipe);
rpc_put_sb_net(net);
- return dentry;
+ return err;
}
static void
@@ -987,7 +985,7 @@ nfsd4_cld_unregister_net(struct net *net, struct rpc_pipe *pipe)
sb = rpc_get_sb_net(net);
if (sb) {
- nfsd4_cld_unregister_sb(pipe);
+ rpc_unlink(pipe);
rpc_put_sb_net(net);
}
}
@@ -997,7 +995,6 @@ static int
__nfsd4_init_cld_pipe(struct net *net)
{
int ret;
- struct dentry *dentry;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct cld_net *cn;
@@ -1018,13 +1015,10 @@ __nfsd4_init_cld_pipe(struct net *net)
spin_lock_init(&cn->cn_lock);
INIT_LIST_HEAD(&cn->cn_list);
- dentry = nfsd4_cld_register_net(net, cn->cn_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
+ ret = nfsd4_cld_register_net(net, cn->cn_pipe);
+ if (unlikely(ret))
goto err_destroy_data;
- }
- cn->cn_pipe->dentry = dentry;
#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
cn->cn_has_legacy = false;
#endif
@@ -1059,8 +1053,6 @@ nfsd4_remove_cld_pipe(struct net *net)
nfsd4_cld_unregister_net(net, cn->cn_pipe);
rpc_destroy_pipe_data(cn->cn_pipe);
- if (cn->cn_tfm)
- crypto_free_shash(cn->cn_tfm);
kfree(nn->cld_net);
nn->cld_net = NULL;
}
@@ -1154,8 +1146,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
struct cld_msg_v2 *cmsg;
- struct crypto_shash *tfm = cn->cn_tfm;
- struct xdr_netobj cksum;
char *principal = NULL;
/* Don't upcall if it's already stored */
@@ -1178,22 +1168,9 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
else if (clp->cl_cred.cr_principal)
principal = clp->cl_cred.cr_principal;
if (principal) {
- cksum.len = crypto_shash_digestsize(tfm);
- cksum.data = kmalloc(cksum.len, GFP_KERNEL);
- if (cksum.data == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal),
- cksum.data);
- if (ret) {
- kfree(cksum.data);
- goto out;
- }
- cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
- memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
- cksum.data, cksum.len);
- kfree(cksum.data);
+ sha256(principal, strlen(principal),
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data);
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE;
} else
cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
@@ -1203,7 +1180,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
}
-out:
free_cld_upcall(cup);
out_err:
if (ret)
@@ -1342,12 +1318,11 @@ found:
static int
nfsd4_cld_check_v2(struct nfs4_client *clp)
{
- struct nfs4_client_reclaim *crp;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
struct cld_net *cn = nn->cld_net;
- int status;
- struct crypto_shash *tfm = cn->cn_tfm;
- struct xdr_netobj cksum;
+#endif
+ struct nfs4_client_reclaim *crp;
char *principal = NULL;
/* did we already find that this client is stable? */
@@ -1363,6 +1338,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
if (cn->cn_has_legacy) {
struct xdr_netobj name;
char dname[HEXDIR_LEN];
+ int status;
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status)
@@ -1385,28 +1361,18 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
return -ENOENT;
found:
if (crp->cr_princhash.len) {
+ u8 digest[SHA256_DIGEST_SIZE];
+
if (clp->cl_cred.cr_raw_principal)
principal = clp->cl_cred.cr_raw_principal;
else if (clp->cl_cred.cr_principal)
principal = clp->cl_cred.cr_principal;
if (principal == NULL)
return -ENOENT;
- cksum.len = crypto_shash_digestsize(tfm);
- cksum.data = kmalloc(cksum.len, GFP_KERNEL);
- if (cksum.data == NULL)
- return -ENOENT;
- status = crypto_shash_tfm_digest(tfm, principal,
- strlen(principal), cksum.data);
- if (status) {
- kfree(cksum.data);
- return -ENOENT;
- }
- if (memcmp(crp->cr_princhash.data, cksum.data,
- crp->cr_princhash.len)) {
- kfree(cksum.data);
+ sha256(principal, strlen(principal), digest);
+ if (memcmp(crp->cr_princhash.data, digest,
+ crp->cr_princhash.len))
return -ENOENT;
- }
- kfree(cksum.data);
}
crp->cr_clp = clp;
return 0;
@@ -1586,7 +1552,6 @@ nfsd4_cld_tracking_init(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
bool running;
int retries = 10;
- struct crypto_shash *tfm;
status = nfs4_cld_state_init(net);
if (status)
@@ -1611,12 +1576,6 @@ nfsd4_cld_tracking_init(struct net *net)
status = -ETIMEDOUT;
goto err_remove;
}
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- status = PTR_ERR(tfm);
- goto err_remove;
- }
- nn->cld_net->cn_tfm = tfm;
status = nfsd4_cld_get_version(nn);
if (status == -EOPNOTSUPP)
@@ -2152,7 +2111,6 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
struct net *net = sb->s_fs_info;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
- struct dentry *dentry;
int ret = 0;
if (!try_module_get(THIS_MODULE))
@@ -2165,16 +2123,10 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
switch (event) {
case RPC_PIPEFS_MOUNT:
- dentry = nfsd4_cld_register_sb(sb, cn->cn_pipe);
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- break;
- }
- cn->cn_pipe->dentry = dentry;
+ ret = nfsd4_cld_register_sb(sb, cn->cn_pipe);
break;
case RPC_PIPEFS_UMOUNT:
- if (cn->cn_pipe->dentry)
- nfsd4_cld_unregister_sb(cn->cn_pipe);
+ rpc_unlink(cn->cn_pipe);
break;
default:
ret = -ENOTSUPP;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b7a0cfd05401..88c347957da5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -633,18 +633,6 @@ find_readable_file(struct nfs4_file *f)
return ret;
}
-static struct nfsd_file *
-find_rw_file(struct nfs4_file *f)
-{
- struct nfsd_file *ret;
-
- spin_lock(&f->fi_lock);
- ret = nfsd_file_get(f->fi_fds[O_RDWR]);
- spin_unlock(&f->fi_lock);
-
- return ret;
-}
-
struct nfsd_file *
find_any_file(struct nfs4_file *f)
{
@@ -946,15 +934,6 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla
spin_lock_init(&stid->sc_lock);
INIT_LIST_HEAD(&stid->sc_cp_list);
- /*
- * It shouldn't be a problem to reuse an opaque stateid value.
- * I don't think it is for 4.1. But with 4.0 I worry that, for
- * example, a stray write retransmission could be accepted by
- * the server when it should have been rejected. Therefore,
- * adopt a trick from the sctp code to attempt to maximize the
- * amount of time until an id is reused, by ensuring they always
- * "increase" (mod INT_MAX):
- */
return stid;
out_free:
kmem_cache_free(slab, stid);
@@ -1050,6 +1029,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
return openlockstateid(stid);
}
+/*
+ * As the sc_free callback of deleg, this may be called by nfs4_put_stid
+ * in nfsd_break_one_deleg.
+ * Considering nfsd_break_one_deleg is called with the flc->flc_lock held,
+ * this function mustn't ever sleep.
+ */
static void nfs4_free_deleg(struct nfs4_stid *stid)
{
struct nfs4_delegation *dp = delegstateid(stid);
@@ -1221,15 +1206,20 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
static void put_deleg_file(struct nfs4_file *fp)
{
+ struct nfsd_file *rnf = NULL;
struct nfsd_file *nf = NULL;
spin_lock(&fp->fi_lock);
- if (--fp->fi_delegees == 0)
+ if (--fp->fi_delegees == 0) {
swap(nf, fp->fi_deleg_file);
+ swap(rnf, fp->fi_rdeleg_file);
+ }
spin_unlock(&fp->fi_lock);
if (nf)
nfsd_file_put(nf);
+ if (rnf)
+ nfs4_file_put_access(fp, NFS4_SHARE_ACCESS_READ);
}
static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
@@ -1378,7 +1368,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
struct nfs4_client *clp = dp->dl_stid.sc_client;
WARN_ON(!list_empty(&dp->dl_recall_lru));
- WARN_ON_ONCE(!(dp->dl_stid.sc_status &
+ WARN_ON_ONCE(dp->dl_stid.sc_client->cl_minorversion > 0 &&
+ !(dp->dl_stid.sc_status &
(SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)));
trace_nfsd_stid_revoke(&dp->dl_stid);
@@ -1989,26 +1980,30 @@ reduce_session_slots(struct nfsd4_session *ses, int dec)
return ret;
}
-/*
- * We don't actually need to cache the rpc and session headers, so we
- * can allocate a little less for each slot:
- */
-static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
+static struct nfsd4_slot *nfsd4_alloc_slot(struct nfsd4_channel_attrs *fattrs,
+ int index, gfp_t gfp)
{
- u32 size;
+ struct nfsd4_slot *slot;
+ size_t size;
- if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ)
- size = 0;
- else
- size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
- return size + sizeof(struct nfsd4_slot);
+ /*
+ * The RPC and NFS session headers are never saved in
+ * the slot reply cache buffer.
+ */
+ size = fattrs->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ ?
+ 0 : fattrs->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+
+ slot = kzalloc(struct_size(slot, sl_data, size), gfp);
+ if (!slot)
+ return NULL;
+ slot->sl_index = index;
+ return slot;
}
static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
struct nfsd4_channel_attrs *battrs)
{
int numslots = fattrs->maxreqs;
- int slotsize = slot_bytes(fattrs);
struct nfsd4_session *new;
struct nfsd4_slot *slot;
int i;
@@ -2017,14 +2012,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
if (!new)
return NULL;
xa_init(&new->se_slots);
- /* allocate each struct nfsd4_slot and data cache in one piece */
- slot = kzalloc(slotsize, GFP_KERNEL);
+
+ slot = nfsd4_alloc_slot(fattrs, 0, GFP_KERNEL);
if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL)))
goto out_free;
for (i = 1; i < numslots; i++) {
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
- slot = kzalloc(slotsize, gfp);
+ slot = nfsd4_alloc_slot(fattrs, i, gfp);
if (!slot)
break;
if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) {
@@ -3168,7 +3163,6 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
{
struct nfs4_client *clp = cb->cb_clp;
- clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
drop_client(clp);
}
@@ -3199,7 +3193,6 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
struct nfs4_delegation *dp =
container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
- clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
nfs4_put_stid(&dp->dl_stid);
}
@@ -3220,11 +3213,15 @@ static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
struct nfs4_delegation *dp =
container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
- if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags))
+ if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncf->ncf_getattr.cb_flags))
return;
+
/* set to proper status when nfsd4_cb_getattr_done runs */
ncf->ncf_cb_status = NFS4ERR_IO;
+ /* ensure that wake_bit is done when RUNNING is cleared */
+ set_bit(NFSD4_CALLBACK_WAKE, &ncf->ncf_getattr.cb_flags);
+
refcount_inc(&dp->dl_stid.sc_count);
nfsd4_run_cb(&ncf->ncf_getattr);
}
@@ -3868,7 +3865,6 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
ca->headerpadsz = 0;
ca->maxreq_sz = min_t(u32, ca->maxreq_sz, maxrpc);
ca->maxresp_sz = min_t(u32, ca->maxresp_sz, maxrpc);
- ca->maxops = min_t(u32, ca->maxops, NFSD_MAX_OPS_PER_COMPOUND);
ca->maxresp_cached = min_t(u32, ca->maxresp_cached,
NFSD_SLOT_CACHE_SIZE + NFSD_MIN_HDR_SEQ_SZ);
ca->maxreqs = min_t(u32, ca->maxreqs, NFSD_MAX_SLOTS_PER_SESSION);
@@ -4402,7 +4398,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfserr_rep_too_big;
if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack))
goto out_put_session;
- svc_reserve(rqstp, buflen);
+ svc_reserve_auth(rqstp, buflen);
status = nfs_ok;
/* Success! accept new slot seqid */
@@ -4438,8 +4434,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* spinlock, and only succeeds if there is
* plenty of memory.
*/
- slot = kzalloc(slot_bytes(&session->se_fchannel),
- GFP_NOWAIT);
+ slot = nfsd4_alloc_slot(&session->se_fchannel, s,
+ GFP_NOWAIT);
prev_slot = xa_load(&session->se_slots, s);
if (xa_is_value(prev_slot) && slot) {
slot->sl_seqid = xa_to_value(prev_slot);
@@ -4459,10 +4455,11 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
} while (slot && --cnt > 0);
}
+
+out:
seq->maxslots = max(session->se_target_maxslots, seq->maxslots);
seq->target_maxslots = session->se_target_maxslots;
-out:
switch (clp->cl_cb_state) {
case NFSD4_CB_DOWN:
seq->status_flags = SEQ4_STATUS_CB_PATH_DOWN;
@@ -4692,10 +4689,16 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
}
status = nfs_ok;
if (conf) {
- old = unconf;
- unhash_client_locked(old);
- nfsd4_change_callback(conf, &unconf->cl_cb_conn);
- } else {
+ if (get_client_locked(conf) == nfs_ok) {
+ old = unconf;
+ unhash_client_locked(old);
+ nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+ } else {
+ conf = NULL;
+ }
+ }
+
+ if (!conf) {
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
status = nfserr_clid_inuse;
@@ -4712,10 +4715,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
}
trace_nfsd_clid_replaced(&old->cl_clientid);
}
+ status = get_client_locked(unconf);
+ if (status != nfs_ok) {
+ old = NULL;
+ goto out;
+ }
move_to_confirmed(unconf);
conf = unconf;
}
- get_client_locked(conf);
spin_unlock(&nn->client_lock);
if (conf == unconf)
fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
@@ -4745,6 +4752,7 @@ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
INIT_LIST_HEAD(&fp->fi_clnt_odstate);
fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle);
fp->fi_deleg_file = NULL;
+ fp->fi_rdeleg_file = NULL;
fp->fi_had_conflict = false;
fp->fi_share_deny = 0;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
@@ -4814,8 +4822,8 @@ out:
static unsigned long
nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
{
- int count;
struct nfsd_net *nn = shrink->private_data;
+ long count;
count = atomic_read(&nn->nfsd_courtesy_clients);
if (!count)
@@ -5413,6 +5421,11 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
{
+ bool queued;
+
+ if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
+ return;
+
/*
* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
@@ -5421,7 +5434,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
* we know it's safe to take a reference.
*/
refcount_inc(&dp->dl_stid.sc_count);
- WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
+ queued = nfsd4_run_cb(&dp->dl_recall);
+ WARN_ON_ONCE(!queued);
+ if (!queued)
+ refcount_dec(&dp->dl_stid.sc_count);
}
/* Called from break_lease() with flc_lock held. */
@@ -5947,11 +5963,23 @@ nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf)
return 0;
}
+#ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS
+static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open)
+{
+ return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS;
+}
+#else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */
+static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open)
+{
+ return false;
+}
+#endif /* CONFIG NFSD_V4_DELEG_TIMESTAMPS */
+
static struct nfs4_delegation *
nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
struct svc_fh *parent)
{
- bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS;
+ bool deleg_ts = nfsd4_want_deleg_timestamps(open);
struct nfs4_client *clp = stp->st_stid.sc_client;
struct nfs4_file *fp = stp->st_stid.sc_file;
struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
@@ -5975,14 +6003,19 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
* "An OPEN_DELEGATE_WRITE delegation allows the client to handle,
* on its own, all opens."
*
- * Furthermore the client can use a write delegation for most READ
- * operations as well, so we require a O_RDWR file here.
+ * Furthermore, section 9.1.2 says:
*
- * Offer a write delegation in the case of a BOTH open, and ensure
- * we get the O_RDWR descriptor.
+ * "In the case of READ, the server may perform the corresponding
+ * check on the access mode, or it may choose to allow READ for
+ * OPEN4_SHARE_ACCESS_WRITE, to accommodate clients whose WRITE
+ * implementation may unavoidably do reads (e.g., due to buffer
+ * cache constraints)."
+ *
+ * We choose to offer a write delegation for OPEN with the
+ * OPEN4_SHARE_ACCESS_WRITE access mode to accommodate such clients.
*/
- if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) == NFS4_SHARE_ACCESS_BOTH) {
- nf = find_rw_file(fp);
+ if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
+ nf = find_writeable_file(fp);
dl_type = deleg_ts ? OPEN_DELEGATE_WRITE_ATTRS_DELEG : OPEN_DELEGATE_WRITE;
}
@@ -5998,6 +6031,15 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (!nf)
return ERR_PTR(-EAGAIN);
+ /*
+ * File delegations and associated locks cannot be recovered if the
+ * export is from an NFS proxy server.
+ */
+ if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) {
+ nfsd_file_put(nf);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
if (nfs4_delegation_exists(clp, fp))
@@ -6104,7 +6146,7 @@ static bool
nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
struct kstat *stat)
{
- struct nfsd_file *nf = find_rw_file(dp->dl_stid.sc_file);
+ struct nfsd_file *nf = find_writeable_file(dp->dl_stid.sc_file);
struct path path;
int rc;
@@ -6123,6 +6165,34 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
}
/*
+ * Add NFS4_SHARE_ACCESS_READ to the write delegation granted on OPEN
+ * with NFS4_SHARE_ACCESS_WRITE by allocating separate nfsd_file and
+ * struct file to be used for read with delegation stateid.
+ *
+ */
+static bool
+nfsd4_add_rdaccess_to_wrdeleg(struct svc_rqst *rqstp, struct nfsd4_open *open,
+ struct svc_fh *fh, struct nfs4_ol_stateid *stp)
+{
+ struct nfs4_file *fp;
+ struct nfsd_file *nf = NULL;
+
+ if ((open->op_share_access & NFS4_SHARE_ACCESS_BOTH) ==
+ NFS4_SHARE_ACCESS_WRITE) {
+ if (nfsd_file_acquire_opened(rqstp, fh, NFSD_MAY_READ, NULL, &nf))
+ return (false);
+ fp = stp->st_stid.sc_file;
+ spin_lock(&fp->fi_lock);
+ __nfs4_file_get_access(fp, NFS4_SHARE_ACCESS_READ);
+ fp = stp->st_stid.sc_file;
+ fp->fi_fds[O_RDONLY] = nf;
+ fp->fi_rdeleg_file = nf;
+ spin_unlock(&fp->fi_lock);
+ }
+ return true;
+}
+
+/*
* The Linux NFS server does not offer write delegations to NFSv4.0
* clients in order to avoid conflicts between write delegations and
* GETATTRs requesting CHANGE or SIZE attributes.
@@ -6147,11 +6217,12 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh,
* open or lock state.
*/
static void
-nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
- struct svc_fh *currentfh)
+nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open,
+ struct nfs4_ol_stateid *stp, struct svc_fh *currentfh,
+ struct svc_fh *fh)
{
- bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS;
struct nfs4_openowner *oo = openowner(stp->st_stateowner);
+ bool deleg_ts = nfsd4_want_deleg_timestamps(open);
struct nfs4_client *clp = stp->st_stid.sc_client;
struct svc_fh *parent = NULL;
struct nfs4_delegation *dp;
@@ -6193,7 +6264,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
- if (!nfs4_delegation_stat(dp, currentfh, &stat)) {
+ if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) ||
+ !nfs4_delegation_stat(dp, currentfh, &stat)) {
nfs4_put_stid(&dp->dl_stid);
destroy_delegation(dp);
goto out_no_deleg;
@@ -6288,6 +6360,20 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ if (dp && nfsd4_is_deleg_cur(open) &&
+ (dp->dl_stid.sc_file != fp)) {
+ /*
+ * RFC8881 section 8.2.4 mandates the server to return
+ * NFS4ERR_BAD_STATEID if the selected table entry does
+ * not match the current filehandle. However returning
+ * NFS4ERR_BAD_STATEID in the OPEN can cause the client
+ * to repeatedly retry the operation with the same
+ * stateid, since the stateid itself is valid. To avoid
+ * this situation NFSD returns NFS4ERR_INVAL instead.
+ */
+ status = nfserr_inval;
+ goto out;
+ }
stp = nfsd4_find_and_lock_existing_open(fp, open);
} else {
open->op_file = NULL;
@@ -6349,7 +6435,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
* Attempt to hand out a delegation. No error return, because the
* OPEN succeeds even if we fail.
*/
- nfs4_open_delegation(open, stp, &resp->cstate.current_fh);
+ nfs4_open_delegation(rqstp, open, stp,
+ &resp->cstate.current_fh, current_fh);
/*
* If there is an existing open stateid, it must be updated and
@@ -6854,38 +6941,34 @@ deleg_reaper(struct nfsd_net *nn)
{
struct list_head *pos, *next;
struct nfs4_client *clp;
- LIST_HEAD(cblist);
spin_lock(&nn->client_lock);
list_for_each_safe(pos, next, &nn->client_lru) {
clp = list_entry(pos, struct nfs4_client, cl_lru);
- if (clp->cl_state != NFSD4_ACTIVE ||
- list_empty(&clp->cl_delegations) ||
- atomic_read(&clp->cl_delegs_in_recall) ||
- test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) ||
- (ktime_get_boottime_seconds() -
- clp->cl_ra_time < 5)) {
+
+ if (clp->cl_state != NFSD4_ACTIVE)
+ continue;
+ if (list_empty(&clp->cl_delegations))
+ continue;
+ if (atomic_read(&clp->cl_delegs_in_recall))
+ continue;
+ if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &clp->cl_ra->ra_cb.cb_flags))
+ continue;
+ if (ktime_get_boottime_seconds() - clp->cl_ra_time < 5)
+ continue;
+ if (clp->cl_cb_state != NFSD4_CB_UP)
continue;
- }
- list_add(&clp->cl_ra_cblist, &cblist);
/* release in nfsd4_cb_recall_any_release */
kref_get(&clp->cl_nfsdfs.cl_ref);
- set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
clp->cl_ra_time = ktime_get_boottime_seconds();
- }
- spin_unlock(&nn->client_lock);
-
- while (!list_empty(&cblist)) {
- clp = list_first_entry(&cblist, struct nfs4_client,
- cl_ra_cblist);
- list_del_init(&clp->cl_ra_cblist);
clp->cl_ra->ra_keep = 0;
clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
BIT(RCA4_TYPE_MASK_WDATA_DLG);
trace_nfsd_cb_recall_any(clp->cl_ra);
nfsd4_run_cb(&clp->cl_ra->ra_cb);
}
+ spin_unlock(&nn->client_lock);
}
static void
@@ -7046,11 +7129,11 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
return_revoked = true;
if (typemask & SC_TYPE_DELEG)
/* Always allow REVOKED for DELEG so we can
- * retturn the appropriate error.
+ * return the appropriate error.
*/
statusmask |= SC_STATUS_REVOKED;
- statusmask |= SC_STATUS_ADMIN_REVOKED;
+ statusmask |= SC_STATUS_ADMIN_REVOKED | SC_STATUS_FREEABLE;
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
CLOSE_STATEID(stateid))
@@ -7089,10 +7172,6 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
switch (s->sc_type) {
case SC_TYPE_DELEG:
- spin_lock(&s->sc_file->fi_lock);
- ret = nfsd_file_get(s->sc_file->fi_deleg_file);
- spin_unlock(&s->sc_file->fi_lock);
- break;
case SC_TYPE_OPEN:
case SC_TYPE_LOCK:
if (flags & RD_STATE)
@@ -7705,9 +7784,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG,
- SC_STATUS_REVOKED | SC_STATUS_FREEABLE,
- &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn);
if (status)
goto out;
dp = delegstateid(s);
@@ -7815,7 +7892,7 @@ nfsd4_lm_notify(struct file_lock *fl)
if (queue) {
trace_nfsd_cb_notify_lock(lo, nbl);
- nfsd4_run_cb(&nbl->nbl_cb);
+ nfsd4_try_run_cb(&nbl->nbl_cb);
}
}
@@ -8133,6 +8210,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0);
if (status != nfs_ok)
return status;
+ if (exportfs_cannot_lock(cstate->current_fh.fh_dentry->d_sb->s_export_op)) {
+ status = nfserr_notsupp;
+ goto out;
+ }
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -8472,6 +8553,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
status = nfserr_lock_range;
goto put_stateid;
}
+ if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) {
+ status = nfserr_notsupp;
+ goto put_file;
+ }
+
file_lock = locks_alloc_lock();
if (!file_lock) {
dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
@@ -9181,8 +9267,8 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
nfs4_cb_getattr(&dp->dl_cb_fattr);
spin_unlock(&ctx->flc_lock);
- wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY,
- TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT);
+ wait_on_bit_timeout(&ncf->ncf_getattr.cb_flags, NFSD4_CALLBACK_RUNNING,
+ TASK_UNINTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT);
if (ncf->ncf_cb_status) {
/* Recall delegation only if client didn't respond */
status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e67420729ecd..ea91bad4eee2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2500,10 +2500,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
return false;
- if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
+ if (xdr_stream_decode_u32(argp->xdr, &argp->opcnt) < 0)
return false;
- argp->opcnt = min_t(u32, argp->client_opcnt,
- NFSD_MAX_OPS_PER_COMPOUND);
if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
@@ -2564,7 +2562,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
/* Sessions make the DRC unnecessary: */
if (argp->minorversion)
cachethis = false;
- svc_reserve(argp->rqstp, max_reply + readbytes);
+ svc_reserve_auth(argp->rqstp, max_reply + readbytes);
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
argp->splice_ok = nfsd_read_splice_ok(argp->rqstp);
@@ -3391,6 +3389,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
}
+/*
+ * Copied from generic_remap_checks/generic_remap_file_range_prep.
+ *
+ * These generic functions use the file system's s_blocksize, but
+ * individual file systems aren't required to use
+ * generic_remap_file_range_prep. Until there is a mechanism for
+ * determining a particular file system's (or file's) clone block
+ * size, this is the best NFSD can do.
+ */
+static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct inode *inode = d_inode(args->dentry);
+
+ return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize);
+}
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args)
@@ -3545,7 +3560,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
[FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
[FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
- [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize,
[FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
[FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
@@ -3812,7 +3827,9 @@ nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_one_positive_unlocked(&nop_mnt_idmap,
+ &QSTR_LEN(name, namlen),
+ cd->rd_fhp->fh_dentry);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ce2a71e4904c..bc6b776fc657 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1436,7 +1436,7 @@ unsigned int nfsd_net_id;
static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
struct netlink_callback *cb,
- struct nfsd_genl_rqstp *rqstp)
+ struct nfsd_genl_rqstp *genl_rqstp)
{
void *hdr;
u32 i;
@@ -1446,22 +1446,22 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
if (!hdr)
return -ENOBUFS;
- if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) ||
- nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) ||
- nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) ||
- nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) ||
- nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) ||
+ if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, genl_rqstp->rq_xid) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, genl_rqstp->rq_flags) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, genl_rqstp->rq_prog) ||
+ nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, genl_rqstp->rq_proc) ||
+ nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, genl_rqstp->rq_vers) ||
nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME,
- ktime_to_us(rqstp->rq_stime),
+ ktime_to_us(genl_rqstp->rq_stime),
NFSD_A_RPC_STATUS_PAD))
return -ENOBUFS;
- switch (rqstp->rq_saddr.sa_family) {
+ switch (genl_rqstp->rq_saddr.sa_family) {
case AF_INET: {
const struct sockaddr_in *s_in, *d_in;
- s_in = (const struct sockaddr_in *)&rqstp->rq_saddr;
- d_in = (const struct sockaddr_in *)&rqstp->rq_daddr;
+ s_in = (const struct sockaddr_in *)&genl_rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in *)&genl_rqstp->rq_daddr;
if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4,
s_in->sin_addr.s_addr) ||
nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4,
@@ -1476,8 +1476,8 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
case AF_INET6: {
const struct sockaddr_in6 *s_in, *d_in;
- s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr;
- d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr;
+ s_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_saddr;
+ d_in = (const struct sockaddr_in6 *)&genl_rqstp->rq_daddr;
if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6,
&s_in->sin6_addr) ||
nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6,
@@ -1491,9 +1491,9 @@ static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb,
}
}
- for (i = 0; i < rqstp->rq_opcnt; i++)
+ for (i = 0; i < genl_rqstp->rq_opcnt; i++)
if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS,
- rqstp->rq_opnum[i]))
+ genl_rqstp->rq_opnum[i]))
return -ENOBUFS;
genlmsg_end(skb, hdr);
@@ -1569,7 +1569,8 @@ int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb,
int j;
args = rqstp->rq_argp;
- genl_rqstp.rq_opcnt = args->opcnt;
+ genl_rqstp.rq_opcnt = min_t(u32, args->opcnt,
+ ARRAY_SIZE(genl_rqstp.rq_opnum));
for (j = 0; j < genl_rqstp.rq_opcnt; j++)
genl_rqstp.rq_opnum[j] =
args->ops[j].opnum;
@@ -1611,7 +1612,7 @@ out_unlock:
*/
int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
{
- int *nthreads, count = 0, nrpools, i, ret = -EOPNOTSUPP, rem;
+ int *nthreads, nrpools = 0, i, ret = -EOPNOTSUPP, rem;
struct net *net = genl_info_net(info);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
const struct nlattr *attr;
@@ -1621,14 +1622,12 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
/* count number of SERVER_THREADS values */
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
- if (nla_type(attr) == NFSD_A_SERVER_THREADS)
- count++;
- }
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+ GENL_HDRLEN, rem)
+ nrpools++;
mutex_lock(&nfsd_mutex);
- nrpools = max(count, nfsd_nrpools(net));
nthreads = kcalloc(nrpools, sizeof(int), GFP_KERNEL);
if (!nthreads) {
ret = -ENOMEM;
@@ -1636,12 +1635,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info)
}
i = 0;
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
- if (nla_type(attr) == NFSD_A_SERVER_THREADS) {
- nthreads[i++] = nla_get_u32(attr);
- if (i >= nrpools)
- break;
- }
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr,
+ GENL_HDRLEN, rem) {
+ nthreads[i++] = nla_get_u32(attr);
+ if (i >= nrpools)
+ break;
}
if (info->attrs[NFSD_A_SERVER_GRACETIME] ||
@@ -1782,14 +1780,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info)
for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++)
nfsd_minorversion(nn, i, NFSD_CLEAR);
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_VERSION_MAX + 1];
u32 major, minor = 0;
bool enabled;
- if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr,
nfsd_version_nl_policy, info->extack) < 0)
continue;
@@ -1917,6 +1913,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
struct svc_serv *serv;
LIST_HEAD(permsocks);
struct nfsd_net *nn;
+ bool delete = false;
int err, rem;
mutex_lock(&nfsd_mutex);
@@ -1939,14 +1936,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
* Walk the list of server_socks from userland and move any that match
* back to sv_permsocks
*/
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
const char *xcl_name;
struct sockaddr *sa;
- if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
nfsd_sock_nl_policy, info->extack) < 0)
continue;
@@ -1977,45 +1972,37 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
}
}
- /* For now, no removing old sockets while server is running */
- if (serv->sv_nrthreads && !list_empty(&permsocks)) {
+ /*
+ * If there are listener transports remaining on the permsocks list,
+ * it means we were asked to remove a listener.
+ */
+ if (!list_empty(&permsocks)) {
list_splice_init(&permsocks, &serv->sv_permsocks);
- spin_unlock_bh(&serv->sv_lock);
- err = -EBUSY;
- goto out_unlock_mtx;
+ delete = true;
}
+ spin_unlock_bh(&serv->sv_lock);
- /* Close the remaining sockets on the permsocks list */
- while (!list_empty(&permsocks)) {
- xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list);
- list_move(&xprt->xpt_list, &serv->sv_permsocks);
-
- /*
- * Newly-created sockets are born with the BUSY bit set. Clear
- * it if there are no threads, since nothing can pick it up
- * in that case.
- */
- if (!serv->sv_nrthreads)
- clear_bit(XPT_BUSY, &xprt->xpt_flags);
-
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- spin_unlock_bh(&serv->sv_lock);
- svc_xprt_close(xprt);
- spin_lock_bh(&serv->sv_lock);
+ /* Do not remove listeners while there are active threads. */
+ if (serv->sv_nrthreads) {
+ err = -EBUSY;
+ goto out_unlock_mtx;
}
- spin_unlock_bh(&serv->sv_lock);
+ /*
+ * Since we can't delete an arbitrary llist entry, destroy the
+ * remaining listeners and recreate the list.
+ */
+ if (delete)
+ svc_xprt_destroy_all(serv, net);
/* walk list of addrs again, open any that still don't exist */
- nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) {
+ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr,
+ GENL_HDRLEN, rem) {
struct nlattr *tb[NFSD_A_SOCK_MAX + 1];
const char *xcl_name;
struct sockaddr *sa;
int ret;
- if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR)
- continue;
-
if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr,
nfsd_sock_nl_policy, info->extack) < 0)
continue;
@@ -2031,6 +2018,9 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info)
xprt = svc_find_listener(serv, xcl_name, net, sa);
if (xprt) {
+ if (delete)
+ WARN_ONCE(1, "Transport type=%s already exists\n",
+ xcl_name);
svc_xprt_put(xprt);
continue;
}
@@ -2204,8 +2194,14 @@ static __net_init int nfsd_net_init(struct net *net)
NFSD_STATS_COUNTERS_NUM);
if (retval)
goto out_repcache_error;
+
memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
nn->nfsd_svcstats.program = &nfsd_programs[0];
+ if (!nfsd_proc_stat_init(net)) {
+ retval = -ENOMEM;
+ goto out_proc_error;
+ }
+
for (i = 0; i < sizeof(nn->nfsd_versions); i++)
nn->nfsd_versions[i] = nfsd_support_version(i);
for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++)
@@ -2215,13 +2211,14 @@ static __net_init int nfsd_net_init(struct net *net)
nfsd4_init_leases_net(nn);
get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
seqlock_init(&nn->writeverf_lock);
- nfsd_proc_stat_init(net);
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
spin_lock_init(&nn->local_clients_lock);
INIT_LIST_HEAD(&nn->local_clients);
#endif
return 0;
+out_proc_error:
+ percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM);
out_repcache_error:
nfsd_idmap_shutdown(net);
out_idmap_error:
@@ -2276,6 +2273,8 @@ static int __init init_nfsd(void)
{
int retval;
+ nfsd_debugfs_init();
+
retval = nfsd4_init_slabs();
if (retval)
return retval;
@@ -2286,12 +2285,9 @@ static int __init init_nfsd(void)
if (retval)
goto out_free_pnfs;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
- retval = create_proc_exports_entry();
- if (retval)
- goto out_free_lockd;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_exports;
+ goto out_free_lockd;
retval = register_cld_notifier();
if (retval)
goto out_free_subsys;
@@ -2300,22 +2296,26 @@ static int __init init_nfsd(void)
goto out_free_cld;
retval = register_filesystem(&nfsd_fs_type);
if (retval)
- goto out_free_all;
+ goto out_free_nfsd4;
retval = genl_register_family(&nfsd_nl_family);
if (retval)
+ goto out_free_filesystem;
+ retval = create_proc_exports_entry();
+ if (retval)
goto out_free_all;
nfsd_localio_ops_init();
return 0;
out_free_all:
+ genl_unregister_family(&nfsd_nl_family);
+out_free_filesystem:
+ unregister_filesystem(&nfsd_fs_type);
+out_free_nfsd4:
nfsd4_destroy_laundry_wq();
out_free_cld:
unregister_cld_notifier();
out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_exports:
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
out_free_lockd:
nfsd_lockd_shutdown();
nfsd_drc_slab_free();
@@ -2323,22 +2323,24 @@ out_free_pnfs:
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
+ nfsd_debugfs_exit();
return retval;
}
static void __exit exit_nfsd(void)
{
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
genl_unregister_family(&nfsd_nl_family);
unregister_filesystem(&nfsd_fs_type);
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
+ nfsd_debugfs_exit();
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e2997f0ffbc5..1cd0bed57bc2 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -44,32 +44,19 @@ bool nfsd_support_version(int vers);
#include "stats.h"
/*
- * Maximum blocksizes supported by daemon under various circumstances.
+ * Default and maximum payload size (NFS READ or WRITE), in bytes.
+ * The default is historical, and the maximum is an implementation
+ * limit.
*/
-#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
-/* NFSv2 is limited by the protocol specification, see RFC 1094 */
-#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
-
-
-/*
- * Largest number of bytes we need to allocate for an NFS
- * call or reply. Used to control buffer sizes. We use
- * the length of v3 WRITE, READDIR and READDIR replies
- * which are an RPC header, up to 26 XDR units of reply
- * data, and some page data.
- *
- * Note that accuracy here doesn't matter too much as the
- * size is rounded up to a page size when allocating space.
- */
-#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
+enum {
+ NFSSVC_DEFBLKSIZE = 1 * 1024 * 1024,
+ NFSSVC_MAXBLKSIZE = RPCSVC_MAXPAYLOAD,
+};
struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
};
-/* Maximum number of operations per session compound */
-#define NFSD_MAX_OPS_PER_COMPOUND 50
-
struct nfsd_genl_rqstp {
struct sockaddr rq_daddr;
struct sockaddr rq_saddr;
@@ -82,7 +69,7 @@ struct nfsd_genl_rqstp {
/* NFSv4 compound */
u32 rq_opcnt;
- u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND];
+ u32 rq_opnum[16];
};
extern struct svc_program nfsd_programs[];
@@ -156,6 +143,16 @@ void nfsd_reset_versions(struct nfsd_net *nn);
int nfsd_create_serv(struct net *net);
void nfsd_destroy_serv(struct net *net);
+#ifdef CONFIG_DEBUG_FS
+void nfsd_debugfs_init(void);
+void nfsd_debugfs_exit(void);
+#else
+static inline void nfsd_debugfs_init(void) {}
+static inline void nfsd_debugfs_exit(void) {}
+#endif
+
+extern bool nfsd_disable_splice_read __read_mostly;
+
extern int nfsd_max_blksize;
static inline int nfsd_v4client(struct svc_rqst *rq)
@@ -283,6 +280,7 @@ void nfsd_lockd_shutdown(void);
#define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN)
#define nfserr_locked cpu_to_be32(NFSERR_LOCKED)
#define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC)
+#define nfserr_delay cpu_to_be32(NFS4ERR_DELAY)
#define nfserr_badiomode cpu_to_be32(NFS4ERR_BADIOMODE)
#define nfserr_badlayout cpu_to_be32(NFS4ERR_BADLAYOUT)
#define nfserr_bad_session_digest cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 32019751a41e..74cf1f4de174 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -172,6 +172,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
if (len == 0)
return error;
if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+ u32 *fsid = fh_fsid(fh);
+
/* deprecated, convert to type 3 */
len = key_len(FSID_ENCODE_DEV)/4;
fh->fh_fsid_type = FSID_ENCODE_DEV;
@@ -181,17 +183,17 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct net *net,
* confuses sparse, so we must use __force here to
* keep it from complaining.
*/
- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
- ntohl((__force __be32)fh->fh_fsid[1])));
- fh->fh_fsid[1] = fh->fh_fsid[2];
+ fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fsid[0]),
+ ntohl((__force __be32)fsid[1])));
+ fsid[1] = fsid[2];
}
data_left -= len;
if (data_left < 0)
return error;
exp = rqst_exp_find(rqstp ? &rqstp->rq_chandle : NULL,
net, client, gssclient,
- fh->fh_fsid_type, fh->fh_fsid);
- fid = (struct fid *)(fh->fh_fsid + len);
+ fh->fh_fsid_type, fh_fsid(fh));
+ fid = (struct fid *)(fh_fsid(fh) + len);
error = nfserr_stale;
if (IS_ERR(exp)) {
@@ -380,8 +382,9 @@ __fh_verify(struct svc_rqst *rqstp,
error = check_nfsd_access(exp, rqstp, may_bypass_gss);
if (error)
goto out;
-
- svc_xprt_set_valid(rqstp->rq_xprt);
+ /* During LOCALIO call to fh_verify will be called with a NULL rqstp */
+ if (rqstp)
+ svc_xprt_set_valid(rqstp->rq_xprt);
/* Finally, check access permissions. */
error = nfsd_permission(cred, exp, dentry, access);
@@ -462,7 +465,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
{
if (dentry != exp->ex_path.dentry) {
struct fid *fid = (struct fid *)
- (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1);
+ (fh_fsid(&fhp->fh_handle) + fhp->fh_handle.fh_size/4 - 1);
int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
int fh_flags = (exp->ex_flags & NFSEXP_NOSUBTREECHECK) ? 0 :
EXPORT_FH_CONNECTABLE;
@@ -613,7 +616,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
fhp->fh_handle.fh_auth_type = 0;
mk_fsid(fhp->fh_handle.fh_fsid_type,
- fhp->fh_handle.fh_fsid,
+ fh_fsid(&fhp->fh_handle),
ex_dev,
d_inode(exp->ex_path.dentry)->i_ino,
exp->ex_fsid, exp->ex_uuid);
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 876152a91f12..1cf979722521 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -49,18 +49,19 @@ struct knfsd_fh {
* Points to the current size while
* building a new file handle.
*/
- union {
- char fh_raw[NFS4_FHSIZE];
- struct {
- u8 fh_version; /* == 1 */
- u8 fh_auth_type; /* deprecated */
- u8 fh_fsid_type;
- u8 fh_fileid_type;
- u32 fh_fsid[]; /* flexible-array member */
- };
- };
+ u8 fh_raw[NFS4_FHSIZE];
};
+#define fh_version fh_raw[0]
+#define fh_auth_type fh_raw[1]
+#define fh_fsid_type fh_raw[2]
+#define fh_fileid_type fh_raw[3]
+
+static inline u32 *fh_fsid(const struct knfsd_fh *fh)
+{
+ return (u32 *)&fh->fh_raw[4];
+}
+
static inline __u32 ino_t_to_u32(ino_t ino)
{
return (__u32) ino;
@@ -260,14 +261,16 @@ static inline bool fh_match(const struct knfsd_fh *fh1,
static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
const struct knfsd_fh *fh2)
{
+ u32 *fsid1 = fh_fsid(fh1);
+ u32 *fsid2 = fh_fsid(fh2);
+
if (fh1->fh_fsid_type != fh2->fh_fsid_type)
return false;
- if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
+ if (memcmp(fsid1, fsid2, key_len(fh1->fh_fsid_type)) != 0)
return false;
return true;
}
-#ifdef CONFIG_CRC32
/**
* knfsd_fh_hash - calculate the crc32 hash for the filehandle
* @fh - pointer to filehandle
@@ -279,12 +282,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
{
return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
}
-#else
-static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
-{
- return 0;
-}
-#endif
/**
* fh_clear_pre_post_attrs - Reset pre/post attributes
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6dda081eb24c..8f71f5748c75 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -10,6 +10,7 @@
#include "cache.h"
#include "xdr.h"
#include "vfs.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -54,7 +55,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
+ trace_nfsd_vfs_getattr(rqstp, &argp->fh);
fh_copy(&resp->fh, &argp->fh);
resp->status = fh_verify(rqstp, &resp->fh, 0,
@@ -211,7 +212,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->count, argp->offset);
- argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+ argp->count = min_t(u32, argp->count, NFS_MAXDATA);
argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
resp->pages = rqstp->rq_next_page;
@@ -250,17 +251,14 @@ nfsd_proc_write(struct svc_rqst *rqstp)
struct nfsd_writeargs *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
unsigned long cnt = argp->len;
- unsigned int nvecs;
dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
- nvecs = svc_fill_write_vector(rqstp, &argp->payload);
-
- resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
- argp->offset, rqstp->rq_vec, nvecs,
- &cnt, NFS_DATA_SYNC, NULL);
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
+ &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
@@ -292,9 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp)
int hosterr;
dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size);
- dprintk("nfsd: CREATE %s %.*s\n",
- SVCFH_fmt(dirfhp), argp->len, argp->name);
-
/* First verify the parent file handle */
resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
if (resp->status != nfs_ok)
@@ -312,7 +307,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
}
inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
- dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
+ dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len),
+ dirfhp->fh_dentry);
if (IS_ERR(dchild)) {
resp->status = nfserrno(PTR_ERR(dchild));
goto out_unlock;
@@ -331,7 +327,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
*/
resp->status = nfserr_acces;
if (!newfhp->fh_dentry) {
- printk(KERN_WARNING
+ printk(KERN_WARNING
"nfsd_proc_create: file handle not verified\n");
goto out_unlock;
}
@@ -445,9 +441,6 @@ nfsd_proc_remove(struct svc_rqst *rqstp)
struct nfsd_diropargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh),
- argp->len, argp->name);
-
/* Unlink. -SIFDIR means file must not be a directory */
resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR,
argp->name, argp->len);
@@ -462,11 +455,6 @@ nfsd_proc_rename(struct svc_rqst *rqstp)
struct nfsd_renameargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: RENAME %s %.*s -> \n",
- SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
- dprintk("nfsd: -> %s %.*s\n",
- SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname);
-
resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen,
&argp->tfh, argp->tname, argp->tlen);
fh_put(&argp->ffh);
@@ -481,13 +469,6 @@ nfsd_proc_link(struct svc_rqst *rqstp)
struct nfsd_linkargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: LINK %s ->\n",
- SVCFH_fmt(&argp->ffh));
- dprintk("nfsd: %s %.*s\n",
- SVCFH_fmt(&argp->tfh),
- argp->tlen,
- argp->tname);
-
resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen,
&argp->ffh);
fh_put(&argp->ffh);
@@ -519,10 +500,6 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
goto out;
}
- dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
- SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
- argp->tlen, argp->tname);
-
fh_init(&newfh, NFS_FHSIZE);
resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
argp->tname, &attrs, &newfh);
@@ -548,8 +525,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
.na_iattr = &argp->attrs,
};
- dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
-
if (resp->fh.fh_dentry) {
printk(KERN_WARNING
"nfsd_proc_mkdir: response already verified??\n");
@@ -578,8 +553,6 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp)
struct nfsd_diropargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
-
resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR,
argp->name, argp->len);
fh_put(&argp->fh);
@@ -602,7 +575,7 @@ static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
buf->pages = rqstp->rq_next_page;
rqstp->rq_next_page++;
- xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
+ xdr_init_encode_pages(xdr, buf);
}
/*
@@ -615,9 +588,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
struct nfsd_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- dprintk("nfsd: READDIR %s %d bytes at %d\n",
- SVCFH_fmt(&argp->fh),
- argp->count, argp->cookie);
+ trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie);
nfsd_init_dirlist_pages(rqstp, resp, argp->count);
@@ -642,8 +613,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_statfsres *resp = rqstp->rq_resp;
- dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
-
resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
@@ -739,7 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_argzero = sizeof(struct nfsd_readargs),
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+ .pc_xdrressize = ST+AT+1+NFS_MAXDATA/4,
.pc_name = "READ",
},
[NFSPROC_WRITECACHE] = {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9b3d6cff0e1e..82b0111ac469 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
if (ret)
goto out_filecache;
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ nfsd4_ssc_init_umount_work(nn);
+#endif
ret = nfs4_state_start_net(net);
if (ret)
goto out_reply_cache;
-#ifdef CONFIG_NFSD_V4_2_INTER_SSC
- nfsd4_ssc_init_umount_work(nn);
-#endif
nn->nfsd_net_up = true;
return 0;
@@ -582,7 +582,7 @@ static int nfsd_get_default_max_blksize(void)
*/
target >>= 12;
- ret = NFSSVC_MAXBLKSIZE;
+ ret = NFSSVC_DEFBLKSIZE;
while (ret > target && ret >= 8*1024*2)
ret /= 2;
return ret;
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 5777f40c7353..fc262ceafca9 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -336,7 +336,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
/* opaque data */
if (xdr_stream_decode_u32(xdr, &args->len) < 0)
return false;
- if (args->len > NFSSVC_MAXBLKSIZE_V2)
+ if (args->len > NFS_MAXDATA)
return false;
return xdr_stream_subsegment(xdr, &args->payload, args->len);
@@ -540,7 +540,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
p = xdr_reserve_space(xdr, XDR_UNIT * 5);
if (!p)
return false;
- *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2);
+ *p++ = cpu_to_be32(NFS_MAXDATA);
*p++ = cpu_to_be32(stat->f_bsize);
*p++ = cpu_to_be32(stat->f_blocks);
*p++ = cpu_to_be32(stat->f_bfree);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 74d2d7b42676..8adc2550129e 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -64,15 +64,36 @@ typedef struct {
refcount_t cs_count;
} copy_stateid_t;
+struct nfsd4_referring_call {
+ struct list_head __list;
+
+ u32 rc_sequenceid;
+ u32 rc_slotid;
+};
+
+struct nfsd4_referring_call_list {
+ struct list_head __list;
+
+ struct nfs4_sessionid rcl_sessionid;
+ int __nr_referring_calls;
+ struct list_head rcl_referring_calls;
+};
+
struct nfsd4_callback {
struct nfs4_client *cb_clp;
struct rpc_message cb_msg;
+#define NFSD4_CALLBACK_RUNNING (0)
+#define NFSD4_CALLBACK_WAKE (1)
+#define NFSD4_CALLBACK_REQUEUE (2)
+ unsigned long cb_flags;
const struct nfsd4_callback_ops *cb_ops;
struct work_struct cb_work;
int cb_seq_status;
int cb_status;
int cb_held_slot;
- bool cb_need_restart;
+
+ int cb_nr_referring_call_list;
+ struct list_head cb_referring_call_list;
};
struct nfsd4_callback_ops {
@@ -162,15 +183,11 @@ struct nfs4_cb_fattr {
struct timespec64 ncf_cb_mtime;
struct timespec64 ncf_cb_atime;
- unsigned long ncf_cb_flags;
bool ncf_file_modified;
u64 ncf_initial_cinfo;
u64 ncf_cur_fsize;
};
-/* bits for ncf_cb_flags */
-#define CB_GETATTR_BUSY 0
-
/*
* Represents a delegation stateid. The nfs4_client holds references to these
* and they are put when it is being destroyed or when the delegation is
@@ -198,8 +215,8 @@ struct nfs4_delegation {
struct list_head dl_perclnt;
struct list_head dl_recall_lru; /* delegation recalled */
struct nfs4_clnt_odstate *dl_clnt_odstate;
- u32 dl_type;
time64_t dl_time;
+ u32 dl_type;
/* For recall: */
int dl_retries;
struct nfsd4_callback dl_recall;
@@ -261,6 +278,7 @@ struct nfsd4_slot {
u32 sl_seqid;
__be32 sl_status;
struct svc_cred sl_cred;
+ u32 sl_index;
u32 sl_datalen;
u16 sl_opcnt;
u16 sl_generation;
@@ -452,7 +470,6 @@ struct nfs4_client {
#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
1 << NFSD4_CLIENT_CB_KILL)
-#define NFSD4_CLIENT_CB_RECALL_ANY (6)
unsigned long cl_flags;
struct workqueue_struct *cl_callback_wq;
@@ -498,7 +515,6 @@ struct nfs4_client {
struct nfsd4_cb_recall_any *cl_ra;
time64_t cl_ra_time;
- struct list_head cl_ra_cblist;
};
/* struct nfs4_client_reset
@@ -649,6 +665,7 @@ struct nfs4_file {
atomic_t fi_access[2];
u32 fi_share_deny;
struct nfsd_file *fi_deleg_file;
+ struct nfsd_file *fi_rdeleg_file;
int fi_delegees;
struct knfsd_fh fi_fhandle;
bool fi_had_conflict;
@@ -777,9 +794,20 @@ extern __be32 nfs4_check_open_reclaim(struct nfs4_client *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd41_cb_referring_call(struct nfsd4_callback *cb,
+ struct nfs4_sessionid *sessionid,
+ u32 slotid, u32 seqno);
+extern void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
+
+static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb)
+{
+ if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags))
+ WARN_ON_ONCE(!nfsd4_run_cb(cb));
+}
+
extern void nfsd4_shutdown_callback(struct nfs4_client *);
extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
void nfsd4_async_copy_reaper(struct nfsd_net *nn);
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index bb22893f1157..f7eaf95e20fc 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -73,11 +73,11 @@ static int nfsd_show(struct seq_file *seq, void *v)
DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
-void nfsd_proc_stat_init(struct net *net)
+struct proc_dir_entry *nfsd_proc_stat_init(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
+ return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
}
void nfsd_proc_stat_shutdown(struct net *net)
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 04aacb6c36e2..e4efb0e4e56d 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,7 +10,7 @@
#include <uapi/linux/nfsd/stats.h>
#include <linux/percpu_counter.h>
-void nfsd_proc_stat_init(struct net *net);
+struct proc_dir_entry *nfsd_proc_stat_init(struct net *net);
void nfsd_proc_stat_shutdown(struct net *net);
static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index ad2c0c432d08..a664fdf1161e 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -11,6 +11,7 @@
#include <linux/tracepoint.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>
+#include <trace/misc/fs.h>
#include <trace/misc/nfs.h>
#include <trace/misc/sunrpc.h>
@@ -18,22 +19,40 @@
#include "nfsfh.h"
#include "xdr4.h"
-#define NFSD_TRACE_PROC_RES_FIELDS \
+#define NFSD_TRACE_PROC_CALL_FIELDS(r) \
+ __field(unsigned int, netns_ino) \
+ __field(u32, xid) \
+ __sockaddr(server, (r)->rq_xprt->xpt_locallen) \
+ __sockaddr(client, (r)->rq_xprt->xpt_remotelen)
+
+#define NFSD_TRACE_PROC_CALL_ASSIGNMENTS(r) \
+ do { \
+ struct svc_xprt *xprt = (r)->rq_xprt; \
+ __entry->netns_ino = SVC_NET(r)->ns.inum; \
+ __entry->xid = be32_to_cpu((r)->rq_xid); \
+ __assign_sockaddr(server, &xprt->xpt_local, \
+ xprt->xpt_locallen); \
+ __assign_sockaddr(client, &xprt->xpt_remote, \
+ xprt->xpt_remotelen); \
+ } while (0)
+
+#define NFSD_TRACE_PROC_RES_FIELDS(r) \
__field(unsigned int, netns_ino) \
__field(u32, xid) \
__field(unsigned long, status) \
- __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
- __array(unsigned char, client, sizeof(struct sockaddr_in6))
+ __sockaddr(server, (r)->rq_xprt->xpt_locallen) \
+ __sockaddr(client, (r)->rq_xprt->xpt_remotelen)
-#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \
+#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(r, error) \
do { \
- __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
- __entry->xid = be32_to_cpu(rqstp->rq_xid); \
+ struct svc_xprt *xprt = (r)->rq_xprt; \
+ __entry->netns_ino = SVC_NET(r)->ns.inum; \
+ __entry->xid = be32_to_cpu((r)->rq_xid); \
__entry->status = be32_to_cpu(error); \
- memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
- rqstp->rq_xprt->xpt_locallen); \
- memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
- rqstp->rq_xprt->xpt_remotelen); \
+ __assign_sockaddr(server, &xprt->xpt_local, \
+ xprt->xpt_locallen); \
+ __assign_sockaddr(client, &xprt->xpt_remote, \
+ xprt->xpt_remotelen); \
} while (0);
DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
@@ -145,14 +164,14 @@ TRACE_EVENT(nfsd_compound_decode_err,
),
TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_RES_FIELDS
+ NFSD_TRACE_PROC_RES_FIELDS(rqstp)
__field(u32, args_opcnt)
__field(u32, resp_opcnt)
__field(u32, opnum)
),
TP_fast_assign(
- NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
+ NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status)
__entry->args_opcnt = args_opcnt;
__entry->resp_opcnt = resp_opcnt;
@@ -171,12 +190,12 @@ DECLARE_EVENT_CLASS(nfsd_compound_err_class,
),
TP_ARGS(rqstp, opnum, status),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_RES_FIELDS
+ NFSD_TRACE_PROC_RES_FIELDS(rqstp)
__field(u32, opnum)
),
TP_fast_assign(
- NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
+ NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status)
__entry->opnum = opnum;
),
@@ -325,7 +344,7 @@ TRACE_EVENT(nfsd_exp_find_key,
int status),
TP_ARGS(key, status),
TP_STRUCT__entry(
- __field(int, fsidtype)
+ __field(u8, fsidtype)
__array(u32, fsid, 6)
__string(auth_domain, key->ek_client->name)
__field(int, status)
@@ -348,7 +367,7 @@ TRACE_EVENT(nfsd_expkey_update,
TP_PROTO(const struct svc_expkey *key, const char *exp_path),
TP_ARGS(key, exp_path),
TP_STRUCT__entry(
- __field(int, fsidtype)
+ __field(u8, fsidtype)
__array(u32, fsid, 6)
__string(auth_domain, key->ek_client->name)
__string(path, exp_path)
@@ -451,6 +470,8 @@ DEFINE_NFSD_IO_EVENT(write_start);
DEFINE_NFSD_IO_EVENT(write_opened);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
+DEFINE_NFSD_IO_EVENT(commit_start);
+DEFINE_NFSD_IO_EVENT(commit_done);
DECLARE_EVENT_CLASS(nfsd_err_class,
TP_PROTO(struct svc_rqst *rqstp,
@@ -803,6 +824,14 @@ DEFINE_EVENT(nfsd_cs_slot_class, nfsd_##name, \
DEFINE_CS_SLOT_EVENT(slot_seqid_conf);
DEFINE_CS_SLOT_EVENT(slot_seqid_unconf);
+#define show_nfs_slot_flags(val) \
+ __print_flags(val, "|", \
+ { NFSD4_SLOT_INUSE, "INUSE" }, \
+ { NFSD4_SLOT_CACHETHIS, "CACHETHIS" }, \
+ { NFSD4_SLOT_INITIALIZED, "INITIALIZED" }, \
+ { NFSD4_SLOT_CACHED, "CACHED" }, \
+ { NFSD4_SLOT_REUSED, "REUSED" })
+
TRACE_EVENT(nfsd_slot_seqid_sequence,
TP_PROTO(
const struct nfs4_client *clp,
@@ -813,10 +842,11 @@ TRACE_EVENT(nfsd_slot_seqid_sequence,
TP_STRUCT__entry(
__field(u32, seqid)
__field(u32, slot_seqid)
+ __field(u32, slot_index)
+ __field(unsigned long, slot_flags)
__field(u32, cl_boot)
__field(u32, cl_id)
__sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
- __field(bool, in_use)
),
TP_fast_assign(
__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -825,11 +855,13 @@ TRACE_EVENT(nfsd_slot_seqid_sequence,
clp->cl_cb_conn.cb_addrlen);
__entry->seqid = seq->seqid;
__entry->slot_seqid = slot->sl_seqid;
+ __entry->slot_index = seq->slotid;
+ __entry->slot_flags = slot->sl_flags;
),
- TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u (%sin use)",
+ TP_printk("addr=%pISpc client %08x:%08x idx=%u seqid=%u slot_seqid=%u flags=%s",
__get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
- __entry->seqid, __entry->slot_seqid,
- __entry->in_use ? "" : "not "
+ __entry->slot_index, __entry->seqid, __entry->slot_seqid,
+ show_nfs_slot_flags(__entry->slot_flags)
)
);
@@ -1039,6 +1071,7 @@ DEFINE_CLID_EVENT(confirmed_r);
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
{ 1 << NFSD_FILE_PENDING, "PENDING" }, \
{ 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \
+ { 1 << NFSD_FILE_RECENT, "RECENT" }, \
{ 1 << NFSD_FILE_GC, "GC" })
DECLARE_EVENT_CLASS(nfsd_file_class,
@@ -1075,7 +1108,6 @@ DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
TRACE_EVENT(nfsd_file_alloc,
TP_PROTO(
@@ -1311,12 +1343,11 @@ DEFINE_EVENT(nfsd_file_gc_class, name, \
TP_ARGS(nf))
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del);
-DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
+DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_aged);
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
@@ -1602,7 +1633,7 @@ DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
__entry->cl_id = clp->cl_clientid.cl_id;
__entry->cb = cb;
__entry->opcode = cb->cb_ops ? cb->cb_ops->opcode : _CB_NULL;
- __entry->need_restart = cb->cb_need_restart;
+ __entry->need_restart = test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags);
__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
clp->cl_cb_conn.cb_addrlen)
),
@@ -2068,25 +2099,6 @@ TRACE_EVENT(nfsd_ctl_maxblksize,
)
);
-TRACE_EVENT(nfsd_ctl_maxconn,
- TP_PROTO(
- const struct net *net,
- int maxconn
- ),
- TP_ARGS(net, maxconn),
- TP_STRUCT__entry(
- __field(unsigned int, netns_ino)
- __field(int, maxconn)
- ),
- TP_fast_assign(
- __entry->netns_ino = net->ns.inum;
- __entry->maxconn = maxconn;
- ),
- TP_printk("maxconn=%d",
- __entry->maxconn
- )
-);
-
TRACE_EVENT(nfsd_ctl_time,
TP_PROTO(
const struct net *net,
@@ -2321,6 +2333,259 @@ DEFINE_EVENT(nfsd_copy_async_done_class, \
DEFINE_COPY_ASYNC_DONE_EVENT(done);
DEFINE_COPY_ASYNC_DONE_EVENT(cancel);
+TRACE_EVENT(nfsd_vfs_setattr,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const struct iattr *iap,
+ const struct timespec64 *guardtime
+ ),
+ TP_ARGS(rqstp, fhp, iap, guardtime),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __field(s64, gtime_tv_sec)
+ __field(u32, gtime_tv_nsec)
+ __field(unsigned int, ia_valid)
+ __field(loff_t, ia_size)
+ __field(uid_t, ia_uid)
+ __field(gid_t, ia_gid)
+ __field(umode_t, ia_mode)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->gtime_tv_sec = guardtime ? guardtime->tv_sec : 0;
+ __entry->gtime_tv_nsec = guardtime ? guardtime->tv_nsec : 0;
+ __entry->ia_valid = iap->ia_valid;
+ __entry->ia_size = iap->ia_size;
+ __entry->ia_uid = __kuid_val(iap->ia_uid);
+ __entry->ia_gid = __kgid_val(iap->ia_gid);
+ __entry->ia_mode = iap->ia_mode;
+ ),
+ TP_printk(
+ "xid=0x%08x fh_hash=0x%08x ia_valid=%s ia_size=%llu ia_mode=0%o ia_uid=%u ia_gid=%u guard_time=%lld.%u",
+ __entry->xid, __entry->fh_hash, show_ia_valid_flags(__entry->ia_valid),
+ __entry->ia_size, __entry->ia_mode, __entry->ia_uid, __entry->ia_gid,
+ __entry->gtime_tv_sec, __entry->gtime_tv_nsec
+ )
+)
+
+TRACE_EVENT(nfsd_vfs_lookup,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const char *name,
+ unsigned int len
+ ),
+ TP_ARGS(rqstp, fhp, name, len),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __string_len(name, name, len)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x name=%s",
+ __entry->xid, __entry->fh_hash, __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_create,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ umode_t type,
+ const char *name,
+ unsigned int len
+ ),
+ TP_ARGS(rqstp, fhp, type, name, len),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __field(umode_t, type)
+ __string_len(name, name, len)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->type = type;
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x type=%s name=%s",
+ __entry->xid, __entry->fh_hash,
+ show_fs_file_type(__entry->type), __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_symlink,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const char *name,
+ unsigned int namelen,
+ const char *target
+ ),
+ TP_ARGS(rqstp, fhp, name, namelen, target),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __string_len(name, name, namelen)
+ __string(target, target)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __assign_str(name);
+ __assign_str(target);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x name=%s target=%s",
+ __entry->xid, __entry->fh_hash,
+ __get_str(name), __get_str(target)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_link,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *sfhp,
+ const struct svc_fh *tfhp,
+ const char *name,
+ unsigned int namelen
+ ),
+ TP_ARGS(rqstp, sfhp, tfhp, name, namelen),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, sfh_hash)
+ __field(u32, tfh_hash)
+ __string_len(name, name, namelen)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle);
+ __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle);
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x src_fh=0x%08x tgt_fh=0x%08x name=%s",
+ __entry->xid, __entry->sfh_hash, __entry->tfh_hash,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_unlink,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const char *name,
+ unsigned int len
+ ),
+ TP_ARGS(rqstp, fhp, name, len),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __string_len(name, name, len)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x name=%s",
+ __entry->xid, __entry->fh_hash,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_rename,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *sfhp,
+ const struct svc_fh *tfhp,
+ const char *source,
+ unsigned int sourcelen,
+ const char *target,
+ unsigned int targetlen
+ ),
+ TP_ARGS(rqstp, sfhp, tfhp, source, sourcelen, target, targetlen),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, sfh_hash)
+ __field(u32, tfh_hash)
+ __string_len(source, source, sourcelen)
+ __string_len(target, target, targetlen)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle);
+ __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle);
+ __assign_str(source);
+ __assign_str(target);
+ ),
+ TP_printk("xid=0x%08x sfh_hash=0x%08x tfh_hash=0x%08x source=%s target=%s",
+ __entry->xid, __entry->sfh_hash, __entry->tfh_hash,
+ __get_str(source), __get_str(target)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_readdir,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ u32 count,
+ u64 offset
+ ),
+ TP_ARGS(rqstp, fhp, count, offset),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __field(u32, count)
+ __field(u64, offset)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->count = count;
+ __entry->offset = offset;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu count=%u",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->count
+ )
+);
+
+DECLARE_EVENT_CLASS(nfsd_vfs_getattr_class,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp
+ ),
+ TP_ARGS(rqstp, fhp),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x",
+ __entry->xid, __entry->fh_hash
+ )
+);
+
+#define DEFINE_NFSD_VFS_GETATTR_EVENT(__name) \
+DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \
+ TP_PROTO( \
+ const struct svc_rqst *rqstp, \
+ const struct svc_fh *fhp \
+ ), \
+ TP_ARGS(rqstp, fhp))
+
+DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr);
+DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 29cb7b812d71..edf050766e57 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -31,6 +31,7 @@
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>
+#include <linux/sunrpc/xdr.h>
#include "xdr3.h"
@@ -47,6 +48,8 @@
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
+bool nfsd_disable_splice_read __read_mostly;
+
/**
* nfserrno - Map Linux errnos to NFS errnos
* @errno: POSIX(-ish) error code to be mapped
@@ -71,7 +74,6 @@ nfserrno (int errno)
{ nfserr_acces, -EACCES },
{ nfserr_exist, -EEXIST },
{ nfserr_xdev, -EXDEV },
- { nfserr_mlink, -EMLINK },
{ nfserr_nodev, -ENODEV },
{ nfserr_notdir, -ENOTDIR },
{ nfserr_isdir, -EISDIR },
@@ -245,7 +247,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct dentry *dentry;
int host_err;
- dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
+ trace_nfsd_vfs_lookup(rqstp, fhp, name, len);
dparent = fhp->fh_dentry;
exp = exp_get(fhp->fh_export);
@@ -265,7 +267,8 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
}
} else {
- dentry = lookup_one_len_unlocked(name, dparent, len);
+ dentry = lookup_one_unlocked(&nop_mnt_idmap,
+ &QSTR_LEN(name, len), dparent);
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_nfserr;
@@ -467,7 +470,15 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
if (!iap->ia_valid)
return 0;
- iap->ia_valid |= ATTR_CTIME;
+ /*
+ * If ATTR_DELEG is set, then this is an update from a client that
+ * holds a delegation. If this is an update for only the atime, the
+ * ctime should not be changed. If the update contains the mtime
+ * too, then ATTR_CTIME should already be set.
+ */
+ if (!(iap->ia_valid & ATTR_DELEG))
+ iap->ia_valid |= ATTR_CTIME;
+
return notify_change(&nop_mnt_idmap, dentry, iap, NULL);
}
@@ -500,6 +511,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
bool size_change = (iap->ia_valid & ATTR_SIZE);
int retries;
+ trace_nfsd_vfs_setattr(rqstp, fhp, iap, guardtime);
+
if (iap->ia_valid & ATTR_SIZE) {
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
ftype = S_IFREG;
@@ -923,7 +936,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
* directories, but we never have and it doesn't seem to have
* caused anyone a problem. If we were to change this, note
* also that our filldir callbacks would need a variant of
- * lookup_one_len that doesn't check permissions.
+ * lookup_one_positive_unlocked() that doesn't check permissions.
*/
if (type == S_IFREG)
may_flags |= NFSD_MAY_OWNER_OVERRIDE;
@@ -1081,25 +1094,28 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
{
unsigned long v, total;
struct iov_iter iter;
- loff_t ppos = offset;
- struct page *page;
+ struct kiocb kiocb;
ssize_t host_err;
+ size_t len;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = offset;
v = 0;
total = *count;
while (total) {
- page = *(rqstp->rq_next_page++);
- rqstp->rq_vec[v].iov_base = page_address(page) + base;
- rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base);
- total -= rqstp->rq_vec[v].iov_len;
+ len = min_t(size_t, total, PAGE_SIZE - base);
+ bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
+ len, base);
+ total -= len;
++v;
base = 0;
}
- WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec));
+ WARN_ON_ONCE(v > rqstp->rq_maxpages);
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count);
- host_err = vfs_iter_read(file, &iter, &ppos, 0);
+ iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
+ host_err = vfs_iocb_iter_read(file, &kiocb, &iter);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1140,25 +1156,41 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+/**
+ * nfsd_vfs_write - write data to an already-open file
+ * @rqstp: RPC execution context
+ * @fhp: File handle of file to write into
+ * @nf: An open file matching @fhp
+ * @offset: Byte offset of start
+ * @payload: xdr_buf containing the write payload
+ * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
+ * @stable: An NFS stable_how value
+ * @verf: NFS WRITE verifier
+ *
+ * Upon return, caller must invoke fh_put on @fhp.
+ *
+ * Return values:
+ * An nfsstat value in network byte order.
+ */
__be32
-nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
- loff_t offset, struct kvec *vec, int vlen,
- unsigned long *cnt, int stable,
- __be32 *verf)
+nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ const struct xdr_buf *payload, unsigned long *cnt,
+ int stable, __be32 *verf)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file *file = nf->nf_file;
struct super_block *sb = file_inode(file)->i_sb;
+ struct kiocb kiocb;
struct svc_export *exp;
struct iov_iter iter;
errseq_t since;
__be32 nfserr;
int host_err;
- loff_t pos = offset;
unsigned long exp_op_flags = 0;
unsigned int pflags = current->flags;
- rwf_t flags = 0;
bool restore_flags = false;
+ unsigned int nvecs;
trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
@@ -1182,15 +1214,17 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (!EX_ISSYNC(exp))
stable = NFS_UNSTABLE;
-
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = offset;
if (stable && !fhp->fh_use_wgather)
- flags |= RWF_SYNC;
+ kiocb.ki_flags |= IOCB_DSYNC;
- iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt);
+ nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
+ iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
- host_err = vfs_iter_write(file, &iter, &pos, flags);
+ host_err = vfs_iocb_iter_write(file, &kiocb, &iter);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
@@ -1237,6 +1271,8 @@ out_nfserr:
*/
bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
{
+ if (nfsd_disable_splice_read)
+ return false;
switch (svc_auth_flavor(rqstp)) {
case RPC_AUTH_GSS_KRB5I:
case RPC_AUTH_GSS_KRB5P:
@@ -1284,14 +1320,24 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
}
-/*
- * Write data to a file.
- * The stable flag requests synchronous writes.
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_write - open a file and write data to it
+ * @rqstp: RPC execution context
+ * @fhp: File handle of file to write into; nfsd_write() may modify it
+ * @offset: Byte offset of start
+ * @payload: xdr_buf containing the write payload
+ * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
+ * @stable: An NFS stable_how value
+ * @verf: NFS WRITE verifier
+ *
+ * Upon return, caller must invoke fh_put on @fhp.
+ *
+ * Return values:
+ * An nfsstat value in network byte order.
*/
__be32
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
- struct kvec *vec, int vlen, unsigned long *cnt, int stable,
+ const struct xdr_buf *payload, unsigned long *cnt, int stable,
__be32 *verf)
{
struct nfsd_file *nf;
@@ -1303,8 +1349,8 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
if (err)
goto out;
- err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec,
- vlen, cnt, stable, verf);
+ err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
+ stable, verf);
nfsd_file_put(nf);
out:
trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
@@ -1340,6 +1386,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
loff_t start, end;
struct nfsd_net *nn;
+ trace_nfsd_commit_start(rqstp, fhp, offset, count);
+
/*
* Convert the client-provided (offset, count) range to a
* (start, end) range. If the client-provided range falls
@@ -1378,6 +1426,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
} else
nfsd_copy_write_verifier(verf, nn);
+ trace_nfsd_commit_done(rqstp, fhp, offset, count);
return err;
}
@@ -1461,7 +1510,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct inode *dirp;
struct iattr *iap = attrs->na_iattr;
__be32 err;
- int host_err;
+ int host_err = 0;
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
@@ -1488,28 +1537,15 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
nfsd_check_ignore_resizing(iap);
break;
case S_IFDIR:
- host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
- if (!host_err && unlikely(d_unhashed(dchild))) {
- struct dentry *d;
- d = lookup_one_len(dchild->d_name.name,
- dchild->d_parent,
- dchild->d_name.len);
- if (IS_ERR(d)) {
- host_err = PTR_ERR(d);
- break;
- }
- if (unlikely(d_is_negative(d))) {
- dput(d);
- err = nfserr_serverfault;
- goto out;
- }
+ dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode);
+ if (IS_ERR(dchild)) {
+ host_err = PTR_ERR(dchild);
+ } else if (d_is_negative(dchild)) {
+ err = nfserr_serverfault;
+ goto out;
+ } else if (unlikely(dchild != resfhp->fh_dentry)) {
dput(resfhp->fh_dentry);
- resfhp->fh_dentry = dget(d);
- err = fh_update(resfhp);
- dput(dchild);
- dchild = d;
- if (err)
- goto out;
+ resfhp->fh_dentry = dget(dchild);
}
break;
case S_IFCHR:
@@ -1530,7 +1566,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
out:
- dput(dchild);
+ if (!IS_ERR(dchild))
+ dput(dchild);
return err;
out_nfserr:
@@ -1553,6 +1590,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err;
int host_err;
+ trace_nfsd_vfs_create(rqstp, fhp, type, fname, flen);
+
if (isdotent(fname, flen))
return nfserr_exist;
@@ -1567,7 +1606,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
return nfserrno(host_err);
inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
- dchild = lookup_one_len(fname, dentry, flen);
+ dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
host_err = PTR_ERR(dchild);
if (IS_ERR(dchild)) {
err = nfserrno(host_err);
@@ -1653,6 +1692,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err, cerr;
int host_err;
+ trace_nfsd_vfs_symlink(rqstp, fhp, fname, flen, path);
+
err = nfserr_noent;
if (!flen || path[0] == '\0')
goto out;
@@ -1672,7 +1713,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
- dnew = lookup_one_len(fname, dentry, flen);
+ dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
if (IS_ERR(dnew)) {
err = nfserrno(PTR_ERR(dnew));
inode_unlock(dentry->d_inode);
@@ -1699,9 +1740,17 @@ out:
return err;
}
-/*
- * Create a hardlink
- * N.B. After this call _both_ ffhp and tfhp need an fh_put
+/**
+ * nfsd_link - create a link
+ * @rqstp: RPC transaction context
+ * @ffhp: the file handle of the directory where the new link is to be created
+ * @name: the filename of the new link
+ * @len: the length of @name in octets
+ * @tfhp: the file handle of an existing file object
+ *
+ * After this call _both_ ffhp and tfhp need an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
@@ -1709,9 +1758,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
{
struct dentry *ddir, *dnew, *dold;
struct inode *dirp;
+ int type;
__be32 err;
int host_err;
+ trace_nfsd_vfs_link(rqstp, ffhp, tfhp, name, len);
+
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1728,19 +1780,19 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
if (isdotent(name, len))
goto out;
+ err = nfs_ok;
+ type = d_inode(tfhp->fh_dentry)->i_mode & S_IFMT;
host_err = fh_want_write(tfhp);
- if (host_err) {
- err = nfserrno(host_err);
+ if (host_err)
goto out;
- }
ddir = ffhp->fh_dentry;
dirp = d_inode(ddir);
inode_lock_nested(dirp, I_MUTEX_PARENT);
- dnew = lookup_one_len(name, ddir, len);
+ dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir);
if (IS_ERR(dnew)) {
- err = nfserrno(PTR_ERR(dnew));
+ host_err = PTR_ERR(dnew);
goto out_unlock;
}
@@ -1756,17 +1808,26 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
fh_fill_post_attrs(ffhp);
inode_unlock(dirp);
if (!host_err) {
- err = nfserrno(commit_metadata(ffhp));
- if (!err)
- err = nfserrno(commit_metadata(tfhp));
- } else {
- err = nfserrno(host_err);
+ host_err = commit_metadata(ffhp);
+ if (!host_err)
+ host_err = commit_metadata(tfhp);
}
+
dput(dnew);
out_drop_write:
fh_drop_write(tfhp);
+ if (host_err == -EBUSY) {
+ /*
+ * See RFC 8881 Section 18.9.4 para 1-2: NFSv4 LINK
+ * wants a status unique to the object type.
+ */
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
+ }
out:
- return err;
+ return err != nfs_ok ? err : nfserrno(host_err);
out_dput:
dput(dnew);
@@ -1795,20 +1856,32 @@ nfsd_has_cached_files(struct dentry *dentry)
return ret;
}
-/*
- * Rename a file
- * N.B. After this call _both_ ffhp and tfhp need an fh_put
+/**
+ * nfsd_rename - rename a directory entry
+ * @rqstp: RPC transaction context
+ * @ffhp: the file handle of parent directory containing the entry to be renamed
+ * @fname: the filename of directory entry to be renamed
+ * @flen: the length of @fname in octets
+ * @tfhp: the file handle of parent directory to contain the renamed entry
+ * @tname: the filename of the new entry
+ * @tlen: the length of @tlen in octets
+ *
+ * After this call _both_ ffhp and tfhp need an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
struct svc_fh *tfhp, char *tname, int tlen)
{
struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap;
- struct inode *fdir, *tdir;
+ int type = S_IFDIR;
__be32 err;
int host_err;
bool close_cached = false;
+ trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen);
+
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
goto out;
@@ -1817,10 +1890,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
goto out;
fdentry = ffhp->fh_dentry;
- fdir = d_inode(fdentry);
tdentry = tfhp->fh_dentry;
- tdir = d_inode(tdentry);
err = nfserr_perm;
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
@@ -1851,7 +1922,7 @@ retry:
if (err != nfs_ok)
goto out_unlock;
- odentry = lookup_one_len(fname, fdentry, flen);
+ odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry);
host_err = PTR_ERR(odentry);
if (IS_ERR(odentry))
goto out_nfserr;
@@ -1862,11 +1933,14 @@ retry:
host_err = -EINVAL;
if (odentry == trap)
goto out_dput_old;
+ type = d_inode(odentry)->i_mode & S_IFMT;
- ndentry = lookup_one_len(tname, tdentry, tlen);
+ ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry);
host_err = PTR_ERR(ndentry);
if (IS_ERR(ndentry))
goto out_dput_old;
+ if (d_inode(ndentry))
+ type = d_inode(ndentry)->i_mode & S_IFMT;
host_err = -ENOTEMPTY;
if (ndentry == trap)
goto out_dput_new;
@@ -1878,10 +1952,10 @@ retry:
} else {
struct renamedata rd = {
.old_mnt_idmap = &nop_mnt_idmap,
- .old_dir = fdir,
+ .old_parent = fdentry,
.old_dentry = odentry,
.new_mnt_idmap = &nop_mnt_idmap,
- .new_dir = tdir,
+ .new_parent = tdentry,
.new_dentry = ndentry,
};
int retries;
@@ -1904,7 +1978,18 @@ retry:
out_dput_old:
dput(odentry);
out_nfserr:
- err = nfserrno(host_err);
+ if (host_err == -EBUSY) {
+ /*
+ * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME
+ * wants a status unique to the object type.
+ */
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
+ } else {
+ err = nfserrno(host_err);
+ }
if (!close_cached) {
fh_fill_post_attrs(ffhp);
@@ -1931,9 +2016,17 @@ out:
return err;
}
-/*
- * Unlink a file or directory
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_unlink - remove a directory entry
+ * @rqstp: RPC transaction context
+ * @fhp: the file handle of the parent directory to be modified
+ * @type: enforced file type of the object to be removed
+ * @fname: the name of directory entry to be removed
+ * @flen: length of @fname in octets
+ *
+ * After this call fhp needs an fh_put.
+ *
+ * Returns a generic NFS status code in network byte-order.
*/
__be32
nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
@@ -1945,6 +2038,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
__be32 err;
int host_err;
+ trace_nfsd_vfs_unlink(rqstp, fhp, fname, flen);
+
err = nfserr_acces;
if (!flen || isdotent(fname, flen))
goto out;
@@ -1960,7 +2055,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
dirp = d_inode(dentry);
inode_lock_nested(dirp, I_MUTEX_PARENT);
- rdentry = lookup_one_len(fname, dentry, flen);
+ rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry);
host_err = PTR_ERR(rdentry);
if (IS_ERR(rdentry))
goto out_unlock;
@@ -2007,15 +2102,17 @@ out_drop_write:
fh_drop_write(fhp);
out_nfserr:
if (host_err == -EBUSY) {
- /* name is mounted-on. There is no perfect
- * error status.
+ /*
+ * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE
+ * wants a status unique to the object type.
*/
- err = nfserr_file_open;
- } else {
- err = nfserrno(host_err);
+ if (type != S_IFDIR)
+ err = nfserr_file_open;
+ else
+ err = nfserr_acces;
}
out:
- return err;
+ return err != nfs_ok ? err : nfserrno(host_err);
out_unlock:
inode_unlock(dirp);
goto out_drop_write;
@@ -2231,6 +2328,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in
{
__be32 err;
+ trace_nfsd_vfs_statfs(rqstp, fhp);
+
err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
if (!err) {
struct path path = {
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index f9b09b842856..eff04959606f 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -128,13 +128,13 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp);
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long *count,
u32 *eof);
-__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
- struct kvec *, int, unsigned long *,
- int stable, __be32 *verf);
+__be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ loff_t offset, const struct xdr_buf *payload,
+ unsigned long *cnt, int stable, __be32 *verf);
__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_file *nf, loff_t offset,
- struct kvec *vec, int vlen, unsigned long *cnt,
- int stable, __be32 *verf);
+ const struct xdr_buf *payload,
+ unsigned long *cnt, int stable, __be32 *verf);
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index c26ba86dbdfd..a23bc56051ca 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -676,6 +676,10 @@ struct nfsd4_cb_offload {
__be32 co_nfserr;
unsigned int co_retries;
struct knfsd_fh co_fh;
+
+ struct nfs4_sessionid co_referring_sessionid;
+ u32 co_referring_slotid;
+ u32 co_referring_seqno;
};
struct nfsd4_copy {
@@ -866,7 +870,6 @@ struct nfsd4_compoundargs {
char * tag;
u32 taglen;
u32 minorversion;
- u32 client_opcnt;
u32 opcnt;
bool splice_ok;
struct nfsd4_op *ops;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index f1a315cd31b7..f4e29c0c701c 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -6,8 +6,11 @@
#define cb_compound_enc_hdr_sz 4
#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
+#define enc_referring_call4_sz (1 + 1)
+#define enc_referring_call_list4_sz (sessionid_sz + 1 + \
+ enc_referring_call4_sz)
#define cb_sequence_enc_sz (sessionid_sz + 4 + \
- 1 /* no referring calls list yet */)
+ enc_referring_call_list4_sz)
#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
#define op_enc_sz 1
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 0d8f7fb15c2e..dd0c8e560ef6 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -2102,11 +2102,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
- if (unlikely(ret == -ENOENT))
+ if (unlikely(ret == -ENOENT)) {
nilfs_crit(btree->b_inode->i_sb,
"writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
btree->b_inode->i_ino,
(unsigned long long)key, level);
+ ret = -EINVAL;
+ }
goto out;
}
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9b7f8e9655a2..6ca3d74be1e1 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -96,7 +96,7 @@ static void nilfs_commit_chunk(struct folio *folio,
int err;
nr_dirty = nilfs_page_count_clean_buffers(folio, from, to);
- copied = block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ copied = block_write_end(pos, len, len, folio);
if (pos + copied > dir->i_size)
i_size_write(dir, pos + copied);
if (IS_DIRSYNC(dir))
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 893ab36824cc..2d8dc6b35b54 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap,
dat = nilfs_bmap_get_dat(bmap);
key = nilfs_bmap_data_get_key(bmap, bh);
ptr = nilfs_direct_get_ptr(bmap, key);
+ if (ptr == NILFS_BMAP_INVALID_PTR)
+ return -EINVAL;
+
if (!buffer_nilfs_volatile(bh)) {
oldreq.pr_entry_nr = ptr;
newreq.pr_entry_nr = ptr;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 0e3fc5ba33c7..1b8d754db44d 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -125,10 +125,10 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
.page_mkwrite = nilfs_page_mkwrite,
};
-static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int nilfs_file_mmap_prepare(struct vm_area_desc *desc)
{
- file_accessed(file);
- vma->vm_ops = &nilfs_file_vm_ops;
+ file_accessed(desc->file);
+ desc->vm_ops = &nilfs_file_vm_ops;
return 0;
}
@@ -144,7 +144,7 @@ const struct file_operations nilfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
#endif /* CONFIG_COMPAT */
- .mmap = nilfs_file_mmap,
+ .mmap_prepare = nilfs_file_mmap_prepare,
.open = generic_file_open,
/* .release = nilfs_release_file, */
.fsync = nilfs_sync_file,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index e8015d24a82c..87ddde159f0c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -218,7 +218,8 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int nilfs_write_begin(struct file *file, struct address_space *mapping,
+static int nilfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
@@ -237,7 +238,8 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
return err;
}
-static int nilfs_write_end(struct file *file, struct address_space *mapping,
+static int nilfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
@@ -248,7 +250,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
nr_dirty = nilfs_page_count_clean_buffers(folio, start,
start + copied);
- copied = generic_write_end(file, mapping, pos, len, copied, folio,
+ copied = generic_write_end(iocb, mapping, pos, len, copied, folio,
fsdata);
nilfs_set_file_dirty(inode, nr_dirty);
err = nilfs_transaction_commit(inode->i_sb);
@@ -472,11 +474,18 @@ static int __nilfs_read_inode(struct super_block *sb,
inode->i_op = &nilfs_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &nilfs_aops;
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &nilfs_special_inode_operations;
init_special_inode(
inode, inode->i_mode,
huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
+ } else {
+ nilfs_error(sb,
+ "invalid file type bits in mode 0%o for inode %lu",
+ inode->i_mode, ino);
+ err = -EIO;
+ goto failed_unmap;
}
nilfs_ifile_unmap_inode(raw_inode);
brelse(bh);
@@ -1186,7 +1195,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (size) {
if (phys && blkphy << blkbits == phys + size) {
/* The current extent goes on */
- size += n << blkbits;
+ size += (u64)n << blkbits;
} else {
/* Terminate the current extent */
ret = fiemap_fill_next_extent(
@@ -1199,14 +1208,14 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags = FIEMAP_EXTENT_MERGED;
logical = blkoff << blkbits;
phys = blkphy << blkbits;
- size = n << blkbits;
+ size = (u64)n << blkbits;
}
} else {
/* Start a new extent */
flags = FIEMAP_EXTENT_MERGED;
logical = blkoff << blkbits;
phys = blkphy << blkbits;
- size = n << blkbits;
+ size = (u64)n << blkbits;
}
blkoff += n;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index a66d62a51f77..3288c3b4be9e 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -118,7 +118,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
*
* Return: always 0 as success.
*/
-int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
@@ -136,7 +136,7 @@ int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
* Return: 0 on success, or a negative error code on failure.
*/
int nilfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct nilfs_transaction_info ti;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 2f850a18d6e7..946b0d3534a5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -422,8 +422,6 @@ static int nilfs_mdt_write_folio(struct folio *folio,
if (wbc->sync_mode == WB_SYNC_ALL)
err = nilfs_construct_segment(sb);
- else if (wbc->for_reclaim)
- nilfs_flush_segment(sb, inode->i_ino);
return err;
}
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 953fbd5f0851..40f4b1a28705 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -218,8 +218,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
return err;
}
-static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct nilfs_transaction_info ti;
@@ -227,7 +227,7 @@ static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
- return err;
+ return ERR_PTR(err);
inc_nlink(dir);
@@ -258,7 +258,7 @@ out:
else
nilfs_transaction_abort(dir->i_sb);
- return err;
+ return ERR_PTR(err);
out_fail:
drop_nlink(inode);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index cb6ed54accd7..f466daa39440 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -268,9 +268,9 @@ int nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
/* ioctl.c */
-int nilfs_fileattr_get(struct dentry *dentry, struct fileattr *m);
+int nilfs_fileattr_get(struct dentry *dentry, struct file_kattr *m);
int nilfs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long nilfs_ioctl(struct file *, unsigned int, unsigned long);
long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 22aecf6e2344..a9c61d0492cb 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -560,8 +560,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
if (unlikely(err))
goto failed_folio;
- block_write_end(NULL, inode->i_mapping, pos, blocksize,
- blocksize, folio, NULL);
+ block_write_end(pos, blocksize, blocksize, folio);
folio_unlock(folio);
folio_put(folio);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 3a202e51b360..f15ca6fc400d 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2221,22 +2221,6 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
spin_unlock(&sci->sc_state_lock);
}
-/**
- * nilfs_flush_segment - trigger a segment construction for resource control
- * @sb: super block
- * @ino: inode number of the file to be flushed out.
- */
-void nilfs_flush_segment(struct super_block *sb, ino_t ino)
-{
- struct the_nilfs *nilfs = sb->s_fs_info;
- struct nilfs_sc_info *sci = nilfs->ns_writer;
-
- if (!sci || nilfs_doing_construction())
- return;
- nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
- /* assign bit 0 to data files */
-}
-
struct nilfs_segctor_wait_request {
wait_queue_entry_t wq;
__u32 seq;
@@ -2424,7 +2408,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci)
* the area protected by sc_state_lock.
*/
if (thread_is_alive)
- del_timer_sync(&sci->sc_timer);
+ timer_delete_sync(&sci->sc_timer);
}
/**
@@ -2501,7 +2485,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_construction_timeout(struct timer_list *t)
{
- struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer);
+ struct nilfs_sc_info *sci = timer_container_of(sci, t, sc_timer);
wake_up_process(sci->sc_task);
}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index f723f47ddc4e..4b39ed43ae72 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -226,7 +226,6 @@ extern void nilfs_relax_pressure_in_lock(struct super_block *);
extern int nilfs_construct_segment(struct super_block *);
extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
loff_t, loff_t);
-extern void nilfs_flush_segment(struct super_block *, ino_t);
extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
void **);
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index cb01ea81724d..d0bcf744c553 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -705,8 +705,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int blocksize;
int err;
- down_write(&nilfs->ns_sem);
-
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
nilfs_err(sb, "unable to set blocksize");
@@ -779,7 +777,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
set_nilfs_init(nilfs);
err = 0;
out:
- up_write(&nilfs->ns_sem);
return err;
failed_sbh:
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index c4cdaf5fa7ed..9fb73bafd41d 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -308,6 +308,10 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
+ error = file_f_owner_allocate(filp);
+ if (error)
+ goto out_err;
+
/* new fsnotify mark, we expect most fcntl calls to add a new mark */
new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
if (!new_dn_mark) {
@@ -315,10 +319,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
goto out_err;
}
- error = file_f_owner_allocate(filp);
- if (error)
- goto out_err;
-
/* set up the new_fsn_mark and new_dn_mark */
new_fsn_mark = &new_dn_mark->fsn_mark;
fsnotify_init_mark(new_fsn_mark, dnotify_group);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 95646f7c46ca..bfe884d624e7 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old,
case FANOTIFY_EVENT_TYPE_FS_ERROR:
return fanotify_error_event_equal(FANOTIFY_EE(old),
FANOTIFY_EE(new));
+ case FANOTIFY_EVENT_TYPE_MNT:
+ return false;
default:
WARN_ON_ONCE(1);
}
@@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
__func__, iter_info->report_mask, event_mask, data, data_type);
- if (!fid_mode) {
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
+ if (data_type != FSNOTIFY_EVENT_MNT)
+ return 0;
+ } else if (!fid_mode) {
/* Do we have path to open a file descriptor? */
if (!path)
return 0;
@@ -410,7 +415,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
{
int dwords, type = 0;
char *ext_buf = NULL;
- void *buf = fh->buf;
+ void *buf = fh + 1;
int err;
fh->type = FILEID_ROOT;
@@ -449,7 +454,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
dwords = fh_len >> 2;
type = exportfs_encode_fid(inode, buf, &dwords);
err = -EINVAL;
- if (type <= 0 || type == FILEID_INVALID || fh_len != dwords << 2)
+ /*
+ * Unlike file_handle, type and len of struct fanotify_fh are u8.
+ * Traditionally, filesystem return handle_type < 0xff, but there
+ * is no enforecement for that in vfs.
+ */
+ BUILD_BUG_ON(MAX_HANDLE_SZ > 0xff || FILEID_INVALID > 0xff);
+ if (type <= 0 || type >= FILEID_INVALID || fh_len != dwords << 2)
goto out_err;
fh->type = type;
@@ -557,6 +568,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
return &pevent->fae;
}
+static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp)
+{
+ struct fanotify_mnt_event *pevent;
+
+ pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp);
+ if (!pevent)
+ return NULL;
+
+ pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT;
+ pevent->mnt_id = mnt_id;
+
+ return &pevent->fae;
+}
+
static struct fanotify_event *fanotify_alloc_perm_event(const void *data,
int data_type,
gfp_t gfp)
@@ -731,6 +756,7 @@ static struct fanotify_event *fanotify_alloc_event(
fid_mode);
struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
const struct path *path = fsnotify_data_path(data, data_type);
+ u64 mnt_id = fsnotify_data_mnt_id(data, data_type);
struct mem_cgroup *old_memcg;
struct dentry *moved = NULL;
struct inode *child = NULL;
@@ -826,8 +852,12 @@ static struct fanotify_event *fanotify_alloc_event(
moved, &hash, gfp);
} else if (fid_mode) {
event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
- } else {
+ } else if (path) {
event = fanotify_alloc_path_event(path, &hash, gfp);
+ } else if (mnt_id) {
+ event = fanotify_alloc_mnt_event(mnt_id, gfp);
+ } else {
+ WARN_ON_ONCE(1);
}
if (!event)
@@ -927,7 +957,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS);
- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24);
mask = fanotify_group_event_mask(group, iter_info, &match_mask,
mask, data, data_type, dir);
@@ -985,6 +1015,7 @@ finish:
static void fanotify_free_group_priv(struct fsnotify_group *group)
{
+ put_user_ns(group->user_ns);
kfree(group->fanotify_data.merge_hash);
if (group->fanotify_data.ucounts)
dec_ucount(group->fanotify_data.ucounts,
@@ -1028,6 +1059,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group,
mempool_free(fee, &group->fanotify_data.error_events_pool);
}
+static void fanotify_free_mnt_event(struct fanotify_event *event)
+{
+ kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event));
+}
+
static void fanotify_free_event(struct fsnotify_group *group,
struct fsnotify_event *fsn_event)
{
@@ -1054,6 +1090,9 @@ static void fanotify_free_event(struct fsnotify_group *group,
case FANOTIFY_EVENT_TYPE_FS_ERROR:
fanotify_free_error_event(group, event);
break;
+ case FANOTIFY_EVENT_TYPE_MNT:
+ fanotify_free_mnt_event(event);
+ break;
default:
WARN_ON_ONCE(1);
}
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index c12cbc270539..b78308975082 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache;
extern struct kmem_cache *fanotify_fid_event_cachep;
extern struct kmem_cache *fanotify_path_event_cachep;
extern struct kmem_cache *fanotify_perm_event_cachep;
+extern struct kmem_cache *fanotify_mnt_event_cachep;
/* Possible states of the permission event */
enum {
@@ -24,7 +25,7 @@ enum {
* stored in either the first or last 2 dwords.
*/
#define FANOTIFY_INLINE_FH_LEN (3 << 2)
-#define FANOTIFY_FH_HDR_LEN offsetof(struct fanotify_fh, buf)
+#define FANOTIFY_FH_HDR_LEN sizeof(struct fanotify_fh)
/* Fixed size struct for file handle */
struct fanotify_fh {
@@ -33,7 +34,6 @@ struct fanotify_fh {
#define FANOTIFY_FH_FLAG_EXT_BUF 1
u8 flags;
u8 pad;
- unsigned char buf[];
} __aligned(4);
/* Variable size struct for dir file handle + child file handle + name */
@@ -91,7 +91,7 @@ static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh)
BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN % 4);
BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) >
FANOTIFY_INLINE_FH_LEN);
- return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *));
+ return (char **)ALIGN((unsigned long)(fh + 1), __alignof__(char *));
}
static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
@@ -101,7 +101,7 @@ static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
static inline void *fanotify_fh_buf(struct fanotify_fh *fh)
{
- return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf;
+ return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh + 1;
}
static inline int fanotify_info_dir_fh_len(struct fanotify_info *info)
@@ -244,6 +244,7 @@ enum fanotify_event_type {
FANOTIFY_EVENT_TYPE_PATH_PERM,
FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
+ FANOTIFY_EVENT_TYPE_MNT,
__FANOTIFY_EVENT_TYPE_NUM
};
@@ -276,7 +277,7 @@ static inline void fanotify_init_event(struct fanotify_event *event,
#define FANOTIFY_INLINE_FH(name, size) \
struct { \
struct fanotify_fh name; \
- /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+ /* Space for filehandle - access with fanotify_fh_buf() */ \
unsigned char _inline_fh_buf[size]; \
}
@@ -409,12 +410,23 @@ struct fanotify_path_event {
struct path path;
};
+struct fanotify_mnt_event {
+ struct fanotify_event fae;
+ u64 mnt_id;
+};
+
static inline struct fanotify_path_event *
FANOTIFY_PE(struct fanotify_event *event)
{
return container_of(event, struct fanotify_path_event, fae);
}
+static inline struct fanotify_mnt_event *
+FANOTIFY_ME(struct fanotify_event *event)
+{
+ return container_of(event, struct fanotify_mnt_event, fae);
+}
+
/*
* Structure for permission fanotify events. It gets allocated and freed in
* fanotify_handle_event() since we wait there for user response. When the
@@ -466,6 +478,11 @@ static inline bool fanotify_is_error_event(u32 mask)
return mask & FAN_FS_ERROR;
}
+static inline bool fanotify_is_mnt_event(u32 mask)
+{
+ return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH);
+}
+
static inline const struct path *fanotify_event_path(struct fanotify_event *event)
{
if (event->type == FANOTIFY_EVENT_TYPE_PATH)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ba3e2d09eb44..b192ee068a7a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
+struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init;
#define FANOTIFY_EVENT_ALIGN 4
#define FANOTIFY_FID_INFO_HDR_LEN \
@@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
(sizeof(struct fanotify_event_info_error))
#define FANOTIFY_RANGE_INFO_LEN \
(sizeof(struct fanotify_event_info_range))
+#define FANOTIFY_MNT_INFO_LEN \
+ (sizeof(struct fanotify_event_info_mnt))
static int fanotify_fid_info_len(int fh_len, int name_len)
{
@@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode,
fh_len = fanotify_event_object_fh_len(event);
event_len += fanotify_fid_info_len(fh_len, dot_len);
}
+ if (fanotify_is_mnt_event(event->mask))
+ event_len += FANOTIFY_MNT_INFO_LEN;
if (info_mode & FAN_REPORT_PIDFD)
event_len += FANOTIFY_PIDFD_INFO_LEN;
@@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group,
return -ENOENT;
}
+static size_t copy_mnt_info_to_user(struct fanotify_event *event,
+ char __user *buf, int count)
+{
+ struct fanotify_event_info_mnt info = { };
+
+ info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT;
+ info.hdr.len = FANOTIFY_MNT_INFO_LEN;
+
+ if (WARN_ON(count < info.hdr.len))
+ return -EFAULT;
+
+ info.mnt_id = FANOTIFY_ME(event)->mnt_id;
+
+ if (copy_to_user(buf, &info, sizeof(info)))
+ return -EFAULT;
+
+ return info.hdr.len;
+}
+
static size_t copy_error_info_to_user(struct fanotify_event *event,
char __user *buf, int count)
{
@@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event,
total_bytes += ret;
}
+ if (fanotify_is_mnt_event(event->mask)) {
+ ret = copy_mnt_info_to_user(event, buf, count);
+ if (ret < 0)
+ return ret;
+ buf += ret;
+ count -= ret;
+ total_bytes += ret;
+ }
+
return total_bytes;
}
@@ -1301,6 +1334,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
* A group with FAN_UNLIMITED_MARKS does not contribute to mark count
* in the limited groups account.
*/
+ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS));
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
return ERR_PTR(-ENOSPC);
@@ -1465,6 +1499,7 @@ static struct hlist_head *fanotify_alloc_merge_hash(void)
/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
+ struct user_namespace *user_ns = current_user_ns();
struct fsnotify_group *group;
int f_flags, fd;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
@@ -1479,10 +1514,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
/*
* An unprivileged user can setup an fanotify group with
* limited functionality - an unprivileged group is limited to
- * notification events with file handles and it cannot use
- * unlimited queue/marks.
+ * notification events with file handles or mount ids and it
+ * cannot use unlimited queue/marks.
*/
- if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
+ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
+ !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT)))
return -EPERM;
/*
@@ -1508,6 +1544,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
return -EINVAL;
+ /* Don't allow mixing mnt events with inode events for now */
+ if (flags & FAN_REPORT_MNT) {
+ if (class != FAN_CLASS_NOTIF)
+ return -EINVAL;
+ if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR))
+ return -EINVAL;
+ }
+
if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
return -EINVAL;
@@ -1553,8 +1597,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
}
/* Enforce groups limits per user in all containing user ns */
- group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
- current_euid(),
+ group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
UCOUNT_FANOTIFY_GROUPS);
if (!group->fanotify_data.ucounts) {
fd = -EMFILE;
@@ -1563,6 +1606,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.flags = flags | internal_flags;
group->memcg = get_mem_cgroup_from_mm(current->mm);
+ group->user_ns = get_user_ns(user_ns);
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
if (!group->fanotify_data.merge_hash) {
@@ -1596,21 +1640,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
}
+ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE));
if (flags & FAN_UNLIMITED_QUEUE) {
- fd = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out_destroy_group;
group->max_events = UINT_MAX;
} else {
group->max_events = fanotify_max_queued_events;
}
- if (flags & FAN_UNLIMITED_MARKS) {
- fd = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out_destroy_group;
- }
-
if (flags & FAN_ENABLE_AUDIT) {
fd = -EPERM;
if (!capable(CAP_AUDIT_WRITE))
@@ -1767,16 +1803,17 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
int dfd, const char __user *pathname)
{
struct inode *inode = NULL;
- struct vfsmount *mnt = NULL;
struct fsnotify_group *group;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
+ struct user_namespace *user_ns = NULL;
+ struct mnt_namespace *mntns;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
unsigned int obj_type, fid_mode;
- void *obj;
+ void *obj = NULL;
u32 umask = 0;
int ret;
@@ -1800,6 +1837,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
case FAN_MARK_FILESYSTEM:
obj_type = FSNOTIFY_OBJ_TYPE_SB;
break;
+ case FAN_MARK_MNTNS:
+ obj_type = FSNOTIFY_OBJ_TYPE_MNTNS;
+ break;
default:
return -EINVAL;
}
@@ -1847,13 +1887,24 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
group = fd_file(f)->private_data;
+ /* Only report mount events on mnt namespace */
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) {
+ if (mask & ~FANOTIFY_MOUNT_EVENTS)
+ return -EINVAL;
+ if (mark_type != FAN_MARK_MNTNS)
+ return -EINVAL;
+ } else {
+ if (mask & FANOTIFY_MOUNT_EVENTS)
+ return -EINVAL;
+ if (mark_type == FAN_MARK_MNTNS)
+ return -EINVAL;
+ }
+
/*
- * An unprivileged user is not allowed to setup mount nor filesystem
- * marks. This also includes setting up such marks by a group that
- * was initialized by an unprivileged user.
+ * A user is allowed to setup sb/mount/mntns marks only if it is
+ * capable in the user ns where the group was created.
*/
- if ((!capable(CAP_SYS_ADMIN) ||
- FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
+ if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) &&
mark_type != FAN_MARK_INODE)
return -EPERM;
@@ -1888,7 +1939,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
- if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
+ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
return -EINVAL;
@@ -1905,12 +1956,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
- if (mark_type == FAN_MARK_MOUNT)
- fsnotify_clear_vfsmount_marks_by_group(group);
- else if (mark_type == FAN_MARK_FILESYSTEM)
- fsnotify_clear_sb_marks_by_group(group);
- else
- fsnotify_clear_inode_marks_by_group(group);
+ fsnotify_clear_marks_by_group(group, obj_type);
return 0;
}
@@ -1937,18 +1983,35 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
fsid = &__fsid;
}
- /* inode held in place by reference to path; group by fget on fd */
- if (mark_type == FAN_MARK_INODE) {
+ /*
+ * In addition to being capable in the user ns where group was created,
+ * the user also needs to be capable in the user ns associated with
+ * the filesystem or in the user ns associated with the mntns
+ * (when marking mntns).
+ */
+ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = path.dentry->d_inode;
obj = inode;
- } else {
- mnt = path.mnt;
- if (mark_type == FAN_MARK_MOUNT)
- obj = mnt;
- else
- obj = mnt->mnt_sb;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ user_ns = path.mnt->mnt_sb->s_user_ns;
+ obj = path.mnt;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
+ user_ns = path.mnt->mnt_sb->s_user_ns;
+ obj = path.mnt->mnt_sb;
+ } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ mntns = mnt_ns_from_dentry(path.dentry);
+ user_ns = mntns->user_ns;
+ obj = mntns;
}
+ ret = -EPERM;
+ if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ goto path_put_and_out;
+
+ ret = -EINVAL;
+ if (!obj)
+ goto path_put_and_out;
+
/*
* If some other task has this inode open for write we should not add
* an ignore mask, unless that ignore mask is supposed to survive
@@ -1956,10 +2019,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
*/
if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
- ret = mnt ? -EINVAL : -EISDIR;
+ ret = !inode ? -EINVAL : -EISDIR;
/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
if (ignore == FAN_MARK_IGNORE &&
- (mnt || S_ISDIR(inode->i_mode)))
+ (!inode || S_ISDIR(inode->i_mode)))
goto path_put_and_out;
ret = 0;
@@ -1968,7 +2031,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
- if (mnt || !S_ISDIR(inode->i_mode)) {
+ if (!inode || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
umask = FAN_EVENT_ON_CHILD;
/*
@@ -2042,7 +2105,7 @@ static int __init fanotify_user_setup(void)
FANOTIFY_DEFAULT_MAX_USER_MARKS);
BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
+ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
@@ -2055,6 +2118,7 @@ static int __init fanotify_user_setup(void)
fanotify_perm_event_cachep =
KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
}
+ fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC);
fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index e933f9c65d90..1161eabf11ee 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
sb->s_dev, mflags, mark->mask, mark->ignore_mask);
+ } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector);
+
+ seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n",
+ mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask);
}
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8ee495a58d0a..079b868552c2 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
fsnotify_clear_marks_by_mount(mnt);
}
+void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
+{
+ fsnotify_clear_marks_by_mntns(mntns);
+}
+
/**
* fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
* @sb: superblock being unmounted.
@@ -194,8 +199,8 @@ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
}
/* Are there any inode/mount/sb objects that watch for these events? */
-static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
- __u32 mask)
+static inline __u32 fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+ __u32 mask)
{
__u32 marks_mask = READ_ONCE(inode->i_fsnotify_mask) | mnt_mask |
READ_ONCE(inode->i_sb->s_fsnotify_mask);
@@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
file_name, cookie, iter_info);
}
-static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp)
+static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp)
{
struct fsnotify_mark_connector *conn;
struct hlist_node *node = NULL;
@@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
{
const struct path *path = fsnotify_data_path(data, data_type);
struct super_block *sb = fsnotify_data_sb(data, data_type);
- struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
+ const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);
+ struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL;
struct fsnotify_iter_info iter_info = {};
struct mount *mnt = NULL;
struct inode *inode2 = NULL;
struct dentry *moved;
int inode2_type;
int ret = 0;
- __u32 test_mask, marks_mask;
+ __u32 test_mask, marks_mask = 0;
if (path)
mnt = real_mount(path->mnt);
@@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
if ((!sbinfo || !sbinfo->sb_marks) &&
(!mnt || !mnt->mnt_fsnotify_marks) &&
(!inode || !inode->i_fsnotify_marks) &&
- (!inode2 || !inode2->i_fsnotify_marks))
+ (!inode2 || !inode2->i_fsnotify_marks) &&
+ (!mnt_data || !mnt_data->ns->n_fsnotify_marks))
return 0;
- marks_mask = READ_ONCE(sb->s_fsnotify_mask);
+ if (sb)
+ marks_mask |= READ_ONCE(sb->s_fsnotify_mask);
if (mnt)
marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask);
if (inode)
marks_mask |= READ_ONCE(inode->i_fsnotify_mask);
if (inode2)
marks_mask |= READ_ONCE(inode2->i_fsnotify_mask);
-
+ if (mnt_data)
+ marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask);
/*
* If this is a modify event we may need to clear some ignore masks.
@@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
iter_info.marks[inode2_type] =
fsnotify_first_mark(&inode2->i_fsnotify_marks);
}
+ if (mnt_data) {
+ iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] =
+ fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks);
+ }
/*
* We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
@@ -643,20 +656,20 @@ EXPORT_SYMBOL_GPL(fsnotify);
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
- * At open time we check fsnotify_sb_has_priority_watchers() and set the
- * FMODE_NONOTIFY_ mode bits accordignly.
+ * At open time we check fsnotify_sb_has_priority_watchers(), call the open perm
+ * hook and set the FMODE_NONOTIFY_ mode bits accordignly.
* Later, fsnotify permission hooks do not check if there are permission event
* watches, but that there were permission event watches at open time.
*/
-void file_set_fsnotify_mode(struct file *file)
+int fsnotify_open_perm_and_set_mode(struct file *file)
{
struct dentry *dentry = file->f_path.dentry, *parent;
struct super_block *sb = dentry->d_sb;
- __u32 mnt_mask, p_mask;
+ __u32 mnt_mask, p_mask = 0;
/* Is it a file opened by fanotify? */
if (FMODE_FSNOTIFY_NONE(file->f_mode))
- return;
+ return 0;
/*
* Permission events is a super set of pre-content events, so if there
@@ -665,48 +678,93 @@ void file_set_fsnotify_mode(struct file *file)
*/
if (likely(!fsnotify_sb_has_priority_watchers(sb,
FSNOTIFY_PRIO_CONTENT))) {
- file->f_mode |= FMODE_NONOTIFY_PERM;
- return;
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
+ return 0;
+ }
+
+ /*
+ * OK, there are some permission event watchers. Check if anybody is
+ * watching for permission events on *this* file.
+ */
+ mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
+ p_mask = fsnotify_object_watched(d_inode(dentry), mnt_mask,
+ ALL_FSNOTIFY_PERM_EVENTS);
+ if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
+ parent = dget_parent(dentry);
+ p_mask |= fsnotify_inode_watches_children(d_inode(parent));
+ dput(parent);
+ }
+
+ /*
+ * Legacy FAN_ACCESS_PERM events have very high performance overhead,
+ * so unlikely to be used in the wild. If they are used there will be
+ * no optimizations at all.
+ */
+ if (unlikely(p_mask & FS_ACCESS_PERM)) {
+ /* Enable all permission and pre-content events */
+ file_set_fsnotify_mode(file, 0);
+ goto open_perm;
}
/*
- * If there are permission event watchers but no pre-content event
+ * Pre-content events are only supported on regular files.
+ * If there are pre-content event watchers and no permission access
* watchers, set FMODE_NONOTIFY | FMODE_NONOTIFY_PERM to indicate that.
+ * That is the common case with HSM service.
*/
- if ((!d_is_dir(dentry) && !d_is_reg(dentry)) ||
- likely(!fsnotify_sb_has_priority_watchers(sb,
- FSNOTIFY_PRIO_PRE_CONTENT))) {
- file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
- return;
+ if (d_is_reg(dentry) && (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)) {
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY |
+ FMODE_NONOTIFY_PERM);
+ goto open_perm;
}
+ /* Nobody watching permission and pre-content events on this file */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
+
+open_perm:
/*
- * OK, there are some pre-content watchers. Check if anybody is
- * watching for pre-content events on *this* file.
+ * Send open perm events depending on object masks and regardless of
+ * FMODE_NONOTIFY_PERM.
*/
- mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
- if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
- FSNOTIFY_PRE_CONTENT_EVENTS)))
- return;
+ if (file->f_flags & __FMODE_EXEC && p_mask & FS_OPEN_EXEC_PERM) {
+ int ret = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);
- /* Is parent watching for pre-content events on this file? */
- if (dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED) {
- parent = dget_parent(dentry);
- p_mask = fsnotify_inode_watches_children(d_inode(parent));
- dput(parent);
- if (p_mask & FSNOTIFY_PRE_CONTENT_EVENTS)
- return;
+ if (ret)
+ return ret;
}
- /* Nobody watching for pre-content events from this file */
- file->f_mode |= FMODE_NONOTIFY | FMODE_NONOTIFY_PERM;
+
+ if (p_mask & FS_OPEN_PERM)
+ return fsnotify_path(&file->f_path, FS_OPEN_PERM);
+
+ return 0;
}
#endif
+void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
+{
+ struct fsnotify_mnt data = {
+ .ns = ns,
+ .mnt_id = real_mount(mnt)->mnt_id_unique,
+ };
+
+ if (WARN_ON_ONCE(!ns))
+ return;
+
+ /*
+ * This is an optimization as well as making sure fsnotify_init() has
+ * been called.
+ */
+ if (!ns->n_fsnotify_marks)
+ return;
+
+ fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0);
+}
+
static __init int fsnotify_init(void)
{
int ret;
- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24);
+ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26);
ret = init_srcu_struct(&fsnotify_mark_srcu);
if (ret)
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 663759ed6fbc..5950c7a67f41 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb(
return conn->obj;
}
+static inline struct mnt_namespace *fsnotify_conn_mntns(
+ struct fsnotify_mark_connector *conn)
+{
+ return conn->obj;
+}
+
static inline struct super_block *fsnotify_object_sb(void *obj,
enum fsnotify_obj_type obj_type)
{
@@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}
+static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns)
+{
+ fsnotify_destroy_marks(&mntns->n_fsnotify_marks);
+}
+
/*
* update the dentry->d_flags of all of inode's children to indicate if inode cares
* about events that happen to its children.
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 4981439e6209..798340db69d7 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj,
return &real_mount(obj)->mnt_fsnotify_marks;
case FSNOTIFY_OBJ_TYPE_SB:
return fsnotify_sb_marks(obj);
+ case FSNOTIFY_OBJ_TYPE_MNTNS:
+ return &((struct mnt_namespace *)obj)->n_fsnotify_marks;
default:
return NULL;
}
@@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
else if (conn->type == FSNOTIFY_OBJ_TYPE_SB)
return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
+ else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS)
+ return &fsnotify_conn_mntns(conn)->n_fsnotify_mask;
return NULL;
}
@@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object(
fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
} else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
+ fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0;
}
rcu_assign_pointer(*connp, NULL);
conn->obj = NULL;
conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
- fsnotify_update_sb_watchers(sb, conn);
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
return inode;
}
@@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
* Attach the sb info before attaching a connector to any object on sb.
* The sb info will remain attached as long as sb lives.
*/
- if (!fsnotify_sb_info(sb)) {
+ if (sb && !fsnotify_sb_info(sb)) {
err = fsnotify_attach_info_to_sb(sb);
if (err)
return err;
@@ -770,7 +777,8 @@ restart:
/* mark should be the last entry. last is the current last entry */
hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
- fsnotify_update_sb_watchers(sb, conn);
+ if (sb)
+ fsnotify_update_sb_watchers(sb, conn);
/*
* Since connector is attached to object using cmpxchg() we are
* guaranteed that connector initialization is fully visible by anyone
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 663f8656158d..59aa801347a7 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -37,7 +37,6 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
}
const struct dentry_operations ns_dentry_operations = {
- .d_delete = always_delete_dentry,
.d_dname = ns_dname,
.d_prune = stashed_dentry_prune,
};
@@ -152,19 +151,49 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
return 0;
}
+static bool nsfs_ioctl_valid(unsigned int cmd)
+{
+ switch (cmd) {
+ case NS_GET_USERNS:
+ case NS_GET_PARENT:
+ case NS_GET_NSTYPE:
+ case NS_GET_OWNER_UID:
+ case NS_GET_MNTNS_ID:
+ case NS_GET_PID_FROM_PIDNS:
+ case NS_GET_TGID_FROM_PIDNS:
+ case NS_GET_PID_IN_PIDNS:
+ case NS_GET_TGID_IN_PIDNS:
+ return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ }
+
+ /* Extensible ioctls require some extra handling. */
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(NS_MNT_GET_INFO):
+ case _IOC_NR(NS_MNT_GET_NEXT):
+ case _IOC_NR(NS_MNT_GET_PREV):
+ return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd));
+ }
+
+ return false;
+}
+
static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
struct pid_namespace *pid_ns;
struct task_struct *tsk;
- struct ns_common *ns = get_proc_ns(file_inode(filp));
+ struct ns_common *ns;
struct mnt_namespace *mnt_ns;
bool previous = false;
uid_t __user *argp;
uid_t uid;
int ret;
+ if (!nsfs_ioctl_valid(ioctl))
+ return -ENOIOCTLCMD;
+
+ ns = get_proc_ns(file_inode(filp));
switch (ioctl) {
case NS_GET_USERNS:
return open_related_ns(ns, ns_get_owner);
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index af94e3737470..eced9013a881 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -2605,74 +2605,3 @@ int attr_force_nonresident(struct ntfs_inode *ni)
return err;
}
-
-/*
- * Change the compression of data attribute
- */
-int attr_set_compress(struct ntfs_inode *ni, bool compr)
-{
- struct ATTRIB *attr;
- struct mft_inode *mi;
-
- attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
- if (!attr)
- return -ENOENT;
-
- if (is_attr_compressed(attr) == !!compr) {
- /* Already required compressed state. */
- return 0;
- }
-
- if (attr->non_res) {
- u16 run_off;
- u32 run_size;
- char *run;
-
- if (attr->nres.data_size) {
- /*
- * There are rare cases when it possible to change
- * compress state without big changes.
- * TODO: Process these cases.
- */
- return -EOPNOTSUPP;
- }
-
- run_off = le16_to_cpu(attr->nres.run_off);
- run_size = le32_to_cpu(attr->size) - run_off;
- run = Add2Ptr(attr, run_off);
-
- if (!compr) {
- /* remove field 'attr->nres.total_size'. */
- memmove(run - 8, run, run_size);
- run_off -= 8;
- }
-
- if (!mi_resize_attr(mi, attr, compr ? +8 : -8)) {
- /*
- * Ignore rare case when there are no 8 bytes in record with attr.
- * TODO: split attribute.
- */
- return -EOPNOTSUPP;
- }
-
- if (compr) {
- /* Make a gap for 'attr->nres.total_size'. */
- memmove(run + 8, run, run_size);
- run_off += 8;
- attr->nres.total_size = attr->nres.alloc_size;
- }
- attr->nres.run_off = cpu_to_le16(run_off);
- }
-
- /* Update data attribute flags. */
- if (compr) {
- attr->flags |= ATTR_FLAG_COMPRESSED;
- attr->nres.c_unit = NTFS_LZNT_CUNIT;
- } else {
- attr->flags &= ~ATTR_FLAG_COMPRESSED;
- attr->nres.c_unit = 0;
- }
- mi->dirty = true;
-
- return 0;
-}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index b6da80c69ca6..1b5c865a0339 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -304,6 +304,9 @@ static inline bool ntfs_dir_emit(struct ntfs_sb_info *sbi,
if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
return true;
+ if (fname->name_len + sizeof(struct NTFS_DE) > le16_to_cpu(e->size))
+ return true;
+
name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name,
PATH_MAX);
if (name_len <= 0) {
@@ -329,8 +332,7 @@ static inline bool ntfs_dir_emit(struct ntfs_sb_info *sbi,
* It does additional locks/reads just to get the type of name.
* Should we use additional mount option to enable branch below?
*/
- if (((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) ||
- fname->dup.ea_size) &&
+ if (fname->dup.extend_data &&
ino != ni->mi.rno) {
struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
if (!IS_ERR_OR_NULL(inode)) {
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 3f96a11804c9..c1ece707b195 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -50,72 +50,6 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
}
/*
- * ntfs_fileattr_get - inode_operations::fileattr_get
- */
-int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- struct ntfs_inode *ni = ntfs_i(inode);
- u32 flags = 0;
-
- if (inode->i_flags & S_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
-
- if (inode->i_flags & S_APPEND)
- flags |= FS_APPEND_FL;
-
- if (is_compressed(ni))
- flags |= FS_COMPR_FL;
-
- if (is_encrypted(ni))
- flags |= FS_ENCRYPT_FL;
-
- fileattr_fill_flags(fa, flags);
-
- return 0;
-}
-
-/*
- * ntfs_fileattr_set - inode_operations::fileattr_set
- */
-int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
- struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- struct ntfs_inode *ni = ntfs_i(inode);
- u32 flags = fa->flags;
- unsigned int new_fl = 0;
-
- if (fileattr_has_fsx(fa))
- return -EOPNOTSUPP;
-
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_COMPR_FL))
- return -EOPNOTSUPP;
-
- if (flags & FS_IMMUTABLE_FL)
- new_fl |= S_IMMUTABLE;
-
- if (flags & FS_APPEND_FL)
- new_fl |= S_APPEND;
-
- /* Allowed to change compression for empty files and for directories only. */
- if (!is_dedup(ni) && !is_encrypted(ni) &&
- (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- /* Change compress state. */
- int err = ni_set_compress(inode, flags & FS_COMPR_FL);
- if (err)
- return err;
- }
-
- inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
-
- inode_set_ctime_current(inode);
- mark_inode_dirty(inode);
-
- return 0;
-}
-
-/*
* ntfs_ioctl - file_operations::unlocked_ioctl
*/
long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
@@ -123,6 +57,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
struct inode *inode = file_inode(filp);
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(inode))))
+ return -EINVAL;
+
switch (cmd) {
case FITRIM:
return ntfs_ioctl_fitrim(sbi, arg);
@@ -147,6 +85,10 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
stat->result_mask |= STATX_BTIME;
stat->btime = ni->i_crtime;
stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */
@@ -220,13 +162,13 @@ static int ntfs_extend_initialized_size(struct file *file,
if (pos + len > new_valid)
len = new_valid - pos;
- err = ntfs_write_begin(file, mapping, pos, len, &folio, NULL);
+ err = ntfs_write_begin(NULL, mapping, pos, len, &folio, NULL);
if (err)
goto out;
folio_zero_range(folio, zerofrom, folio_size(folio) - zerofrom);
- err = ntfs_write_end(file, mapping, pos, len, len, folio, NULL);
+ err = ntfs_write_end(NULL, mapping, pos, len, len, folio, NULL);
if (err < 0)
goto out;
pos += len;
@@ -327,16 +269,21 @@ out:
}
/*
- * ntfs_file_mmap - file_operations::mmap
+ * ntfs_file_mmap_prepare - file_operations::mmap_prepare
*/
-static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ntfs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
struct ntfs_inode *ni = ntfs_i(inode);
- u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT);
- bool rw = vma->vm_flags & VM_WRITE;
+ u64 from = ((u64)desc->pgoff << PAGE_SHIFT);
+ bool rw = desc->vm_flags & VM_WRITE;
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -357,7 +304,7 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (rw) {
u64 to = min_t(loff_t, i_size_read(inode),
- from + vma->vm_end - vma->vm_start);
+ from + desc->end - desc->start);
if (is_sparsed(ni)) {
/* Allocate clusters for rw map. */
@@ -376,7 +323,10 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
}
if (ni->i_valid < to) {
- inode_lock(inode);
+ if (!inode_trylock(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
err = ntfs_extend_initialized_size(file, ni,
ni->i_valid, to);
inode_unlock(inode);
@@ -385,7 +335,7 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
}
}
- err = generic_file_mmap(file, vma);
+ err = generic_file_mmap_prepare(desc);
out:
return err;
}
@@ -801,6 +751,10 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
umode_t mode = inode->i_mode;
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -861,6 +815,10 @@ static int check_read_restriction(struct inode *inode)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -979,7 +937,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
struct ntfs_inode *ni = ntfs_i(inode);
u64 valid = ni->i_valid;
struct ntfs_sb_info *sbi = ni->mi.sbi;
- struct page *page, **pages = NULL;
+ struct page **pages = NULL;
+ struct folio *folio;
size_t written = 0;
u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
u32 frame_size = 1u << frame_bits;
@@ -989,7 +948,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
u64 frame_vbo;
pgoff_t index;
bool frame_uptodate;
- struct folio *folio;
if (frame_size < PAGE_SIZE) {
/*
@@ -1043,8 +1001,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
pages_per_frame);
if (err) {
for (ip = 0; ip < pages_per_frame; ip++) {
- page = pages[ip];
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
folio_unlock(folio);
folio_put(folio);
}
@@ -1055,10 +1012,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ip = off >> PAGE_SHIFT;
off = offset_in_page(valid);
for (; ip < pages_per_frame; ip++, off = 0) {
- page = pages[ip];
- folio = page_folio(page);
- zero_user_segment(page, off, PAGE_SIZE);
- flush_dcache_page(page);
+ folio = page_folio(pages[ip]);
+ folio_zero_segment(folio, off, PAGE_SIZE);
+ flush_dcache_folio(folio);
folio_mark_uptodate(folio);
}
@@ -1067,8 +1023,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ni_unlock(ni);
for (ip = 0; ip < pages_per_frame; ip++) {
- page = pages[ip];
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
folio_mark_uptodate(folio);
folio_unlock(folio);
folio_put(folio);
@@ -1112,8 +1067,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
if (err) {
for (ip = 0; ip < pages_per_frame;
ip++) {
- page = pages[ip];
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
folio_unlock(folio);
folio_put(folio);
}
@@ -1131,10 +1085,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
size_t cp, tail = PAGE_SIZE - off;
- page = pages[ip];
- cp = copy_page_from_iter_atomic(page, off,
+ folio = page_folio(pages[ip]);
+ cp = copy_folio_from_iter_atomic(folio, off,
min(tail, bytes), from);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
copied += cp;
bytes -= cp;
@@ -1154,9 +1108,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
ni_unlock(ni);
for (ip = 0; ip < pages_per_frame; ip++) {
- page = pages[ip];
- ClearPageDirty(page);
- folio = page_folio(page);
+ folio = page_folio(pages[ip]);
+ folio_clear_dirty(folio);
folio_mark_uptodate(folio);
folio_unlock(folio);
folio_put(folio);
@@ -1201,6 +1154,10 @@ static int check_write_restriction(struct inode *inode)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -1228,21 +1185,22 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ssize_t ret;
int err;
- err = check_write_restriction(inode);
- if (err)
- return err;
-
- if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
- ntfs_inode_warn(inode, "direct i/o + compressed not supported");
- return -EOPNOTSUPP;
- }
-
if (!inode_trylock(inode)) {
if (iocb->ki_flags & IOCB_NOWAIT)
return -EAGAIN;
inode_lock(inode);
}
+ ret = check_write_restriction(inode);
+ if (ret)
+ goto out;
+
+ if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
+ ntfs_inode_warn(inode, "direct i/o + compressed not supported");
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
ret = generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -1282,6 +1240,10 @@ int ntfs_file_open(struct inode *inode, struct file *file)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -1351,6 +1313,10 @@ int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
err = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR);
if (err)
return err;
@@ -1389,8 +1355,6 @@ const struct inode_operations ntfs_file_inode_operations = {
.get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.fiemap = ntfs_fiemap,
- .fileattr_get = ntfs_fileattr_get,
- .fileattr_set = ntfs_fileattr_set,
};
const struct file_operations ntfs_file_operations = {
@@ -1403,7 +1367,7 @@ const struct file_operations ntfs_file_operations = {
#endif
.splice_read = ntfs_file_splice_read,
.splice_write = ntfs_file_splice_write,
- .mmap = ntfs_file_mmap,
+ .mmap_prepare = ntfs_file_mmap_prepare,
.open = ntfs_file_open,
.fsync = generic_file_fsync,
.fallocate = ntfs_fallocate,
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 5df6a0b5add9..8f9fe1d7a690 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -281,63 +281,6 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
}
/*
- * ni_load_attr - Load attribute that contains given VCN.
- */
-struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
- const __le16 *name, u8 name_len, CLST vcn,
- struct mft_inode **pmi)
-{
- struct ATTR_LIST_ENTRY *le;
- struct ATTRIB *attr;
- struct mft_inode *mi;
- struct ATTR_LIST_ENTRY *next;
-
- if (!ni->attr_list.size) {
- if (pmi)
- *pmi = &ni->mi;
- return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len,
- NULL);
- }
-
- le = al_find_ex(ni, NULL, type, name, name_len, NULL);
- if (!le)
- return NULL;
-
- /*
- * Unfortunately ATTR_LIST_ENTRY contains only start VCN.
- * So to find the ATTRIB segment that contains 'vcn' we should
- * enumerate some entries.
- */
- if (vcn) {
- for (;; le = next) {
- next = al_find_ex(ni, le, type, name, name_len, NULL);
- if (!next || le64_to_cpu(next->vcn) > vcn)
- break;
- }
- }
-
- if (ni_load_mi(ni, le, &mi))
- return NULL;
-
- if (pmi)
- *pmi = mi;
-
- attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id);
- if (!attr)
- return NULL;
-
- if (!attr->non_res)
- return attr;
-
- if (le64_to_cpu(attr->nres.svcn) <= vcn &&
- vcn <= le64_to_cpu(attr->nres.evcn))
- return attr;
-
- _ntfs_bad_inode(&ni->vfs_inode);
- return NULL;
-}
-
-/*
* ni_load_all_mi - Load all subrecords.
*/
int ni_load_all_mi(struct ntfs_inode *ni)
@@ -3060,8 +3003,7 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
* ni_rename - Remove one name and insert new name.
*/
int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
- struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
- bool *is_bad)
+ struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de)
{
int err;
struct NTFS_DE *de2 = NULL;
@@ -3084,8 +3026,8 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
err = ni_add_name(new_dir_ni, ni, new_de);
if (!err) {
err = ni_remove_name(dir_ni, ni, de, &de2, &undo);
- if (err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo))
- *is_bad = true;
+ WARN_ON(err && ni_remove_name(new_dir_ni, ni, new_de, &de2,
+ &undo));
}
/*
@@ -3176,11 +3118,21 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
}
}
- /* TODO: Fill reparse info. */
- dup->reparse = 0;
- dup->ea_size = 0;
+ dup->extend_data = 0;
+
+ if (dup->fa & FILE_ATTRIBUTE_REPARSE_POINT) {
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL,
+ NULL);
+
+ if (attr) {
+ const struct REPARSE_POINT *rp;
- if (ni->ni_flags & NI_FLAG_EA) {
+ rp = resident_data_ex(attr, sizeof(struct REPARSE_POINT));
+ /* If ATTR_REPARSE exists 'rp' can't be NULL. */
+ if (rp)
+ dup->extend_data = rp->ReparseTag;
+ }
+ } else if (ni->ni_flags & NI_FLAG_EA) {
attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL,
NULL);
if (attr) {
@@ -3189,7 +3141,7 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
info = resident_data_ex(attr, sizeof(struct EA_INFO));
/* If ATTR_EA_INFO exists 'info' can't be NULL. */
if (info)
- dup->ea_size = info->size_pack;
+ dup->extend_data = info->size;
}
}
@@ -3256,6 +3208,10 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_bad_inode(inode) || sb_rdonly(sb))
return 0;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(sb)))
return -EIO;
@@ -3384,75 +3340,3 @@ out:
return 0;
}
-
-/*
- * ni_set_compress
- *
- * Helper for 'ntfs_fileattr_set'.
- * Changes compression for empty files and directories only.
- */
-int ni_set_compress(struct inode *inode, bool compr)
-{
- int err;
- struct ntfs_inode *ni = ntfs_i(inode);
- struct ATTR_STD_INFO *std;
- const char *bad_inode;
-
- if (is_compressed(ni) == !!compr)
- return 0;
-
- if (is_sparsed(ni)) {
- /* sparse and compress not compatible. */
- return -EOPNOTSUPP;
- }
-
- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) {
- /*Skip other inodes. (symlink,fifo,...) */
- return -EOPNOTSUPP;
- }
-
- bad_inode = NULL;
-
- ni_lock(ni);
-
- std = ni_std(ni);
- if (!std) {
- bad_inode = "no std";
- goto out;
- }
-
- if (S_ISREG(inode->i_mode)) {
- err = attr_set_compress(ni, compr);
- if (err) {
- if (err == -ENOENT) {
- /* Fix on the fly? */
- /* Each file must contain data attribute. */
- bad_inode = "no data attribute";
- }
- goto out;
- }
- }
-
- ni->std_fa = std->fa;
- if (compr)
- std->fa |= FILE_ATTRIBUTE_COMPRESSED;
- else
- std->fa &= ~FILE_ATTRIBUTE_COMPRESSED;
-
- if (ni->std_fa != std->fa) {
- ni->std_fa = std->fa;
- ni->mi.dirty = true;
- }
- /* update duplicate information and directory entries in ni_write_inode.*/
- ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
- err = 0;
-
-out:
- ni_unlock(ni);
- if (bad_inode) {
- ntfs_bad_inode(inode, bad_inode);
- err = -EINVAL;
- }
-
- return err;
-}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index d0d530f4e2b9..38934e6978ec 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -3091,16 +3091,16 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
inode = ilookup(sbi->sb, rno);
if (inode) {
mi = &ntfs_i(inode)->mi;
- } else if (op == InitializeFileRecordSegment) {
- mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
- if (!mi)
- return -ENOMEM;
- err = mi_format_new(mi, sbi, rno, 0, false);
- if (err)
- goto out;
} else {
/* Read from disk. */
err = mi_get(sbi, rno, &mi);
+ if (err && op == InitializeFileRecordSegment) {
+ mi = kzalloc(sizeof(struct mft_inode),
+ GFP_NOFS);
+ if (!mi)
+ return -ENOMEM;
+ err = mi_format_new(mi, sbi, rno, 0, false);
+ }
if (err)
return err;
}
@@ -3109,15 +3109,13 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
if (op == DeallocateFileRecordSegment)
goto skip_load_parent;
- if (InitializeFileRecordSegment != op) {
- if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE)
- goto dirty_vol;
- if (!check_lsn(&rec->rhdr, rlsn))
- goto out;
- if (!check_file_record(rec, NULL, sbi))
- goto dirty_vol;
- attr = Add2Ptr(rec, roff);
- }
+ if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE)
+ goto dirty_vol;
+ if (!check_lsn(&rec->rhdr, rlsn))
+ goto out;
+ if (!check_file_record(rec, NULL, sbi))
+ goto dirty_vol;
+ attr = Add2Ptr(rec, roff);
if (is_rec_base(rec) || InitializeFileRecordSegment == op) {
rno_base = rno;
@@ -3143,7 +3141,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
if (inode)
iput(inode);
- else if (mi)
+ else
mi_put(mi);
inode = inode_parent;
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index 938d351ebac7..c7a2f191254d 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -905,9 +905,13 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
void ntfs_bad_inode(struct inode *inode, const char *hint)
{
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct ntfs_inode *ni = ntfs_i(inode);
ntfs_inode_err(inode, "%s", hint);
- make_bad_inode(inode);
+
+ /* Do not call make_bad_inode()! */
+ ni->ni_bad = true;
+
/* Avoid recursion if bad inode is $Volume. */
if (inode->i_ino != MFT_REC_VOL &&
!(sbi->flags & NTFS_FLAGS_LOG_REPLAYING)) {
@@ -1035,34 +1039,6 @@ struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block)
return NULL;
}
-int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
-{
- struct block_device *bdev = sb->s_bdev;
- u32 blocksize = sb->s_blocksize;
- u64 block = lbo >> sb->s_blocksize_bits;
- u32 off = lbo & (blocksize - 1);
- u32 op = blocksize - off;
-
- for (; bytes; block += 1, off = 0, op = blocksize) {
- struct buffer_head *bh = __bread(bdev, block, blocksize);
-
- if (!bh)
- return -EIO;
-
- if (op > bytes)
- op = bytes;
-
- memcpy(buffer, bh->b_data + off, op);
-
- put_bh(bh);
-
- bytes -= op;
- buffer = Add2Ptr(buffer, op);
- }
-
- return 0;
-}
-
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
const void *buf, int wait)
{
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 7eb9fae22f8d..1bf2a6593dec 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -618,7 +618,7 @@ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
u32 off = le32_to_cpu(hdr->de_off);
if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
- off + sizeof(struct NTFS_DE) > end) {
+ size_add(off, sizeof(struct NTFS_DE)) > end) {
/* incorrect index buffer. */
return false;
}
@@ -736,7 +736,7 @@ fill_table:
if (end > total)
return NULL;
- if (off + sizeof(struct NTFS_DE) > end)
+ if (size_add(off, sizeof(struct NTFS_DE)) > end)
return NULL;
e = Add2Ptr(hdr, off);
@@ -2182,6 +2182,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
e = hdr_first_de(&n->index->ihdr);
fnd_push(fnd, n, e);
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
if (!de_is_last(e)) {
/*
@@ -2203,6 +2207,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
n = fnd->nodes[level];
te = hdr_first_de(&n->index->ihdr);
+ if (!te) {
+ err = -EINVAL;
+ goto out;
+ }
/* Copy the candidate entry into the replacement entry buffer. */
re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS);
if (!re) {
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index a1e11228dafd..37cbbee7fa58 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -805,6 +805,10 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ret = 0;
goto out;
}
+ if (is_compressed(ni)) {
+ ret = 0;
+ goto out;
+ }
ret = blockdev_direct_IO(iocb, inode, iter,
wr ? ntfs_get_block_direct_IO_W :
@@ -874,6 +878,10 @@ static int ntfs_resident_writepage(struct folio *folio,
struct ntfs_inode *ni = ntfs_i(inode);
int ret;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -892,6 +900,10 @@ static int ntfs_writepages(struct address_space *mapping,
{
struct inode *inode = mapping->host;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(inode))))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -908,13 +920,17 @@ static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
bh_result, create, GET_BLOCK_WRITE_BEGIN);
}
-int ntfs_write_begin(struct file *file, struct address_space *mapping,
+int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
loff_t pos, u32 len, struct folio **foliop, void **fsdata)
{
int err;
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -953,7 +969,8 @@ out:
/*
* ntfs_write_end - Address_space_operations::write_end.
*/
-int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
+int ntfs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
u32 len, u32 copied, struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -985,7 +1002,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
folio_unlock(folio);
folio_put(folio);
} else {
- err = generic_write_end(file, mapping, pos, len, copied, folio,
+ err = generic_write_end(iocb, mapping, pos, len, copied, folio,
fsdata);
}
@@ -1025,46 +1042,6 @@ int ntfs_sync_inode(struct inode *inode)
}
/*
- * writeback_inode - Helper function for ntfs_flush_inodes().
- *
- * This writes both the inode and the file data blocks, waiting
- * for in flight data blocks before the start of the call. It
- * does not wait for any io started during the call.
- */
-static int writeback_inode(struct inode *inode)
-{
- int ret = sync_inode_metadata(inode, 0);
-
- if (!ret)
- ret = filemap_fdatawrite(inode->i_mapping);
- return ret;
-}
-
-/*
- * ntfs_flush_inodes
- *
- * Write data and metadata corresponding to i1 and i2. The io is
- * started but we do not wait for any of it to finish.
- *
- * filemap_flush() is used for the block device, so if there is a dirty
- * page for a block already in flight, we will not wait and start the
- * io over again.
- */
-int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
- struct inode *i2)
-{
- int ret = 0;
-
- if (i1)
- ret = writeback_inode(i1);
- if (!ret && i2)
- ret = writeback_inode(i2);
- if (!ret)
- ret = filemap_flush(sb->s_bdev_file->f_mapping);
- return ret;
-}
-
-/*
* Helper function to read file.
*/
int inode_read_data(struct inode *inode, void *data, size_t bytes)
@@ -1098,10 +1075,10 @@ int inode_read_data(struct inode *inode, void *data, size_t bytes)
* Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK)
* for unicode string of @uni_len length.
*/
-static inline u32 ntfs_reparse_bytes(u32 uni_len)
+static inline u32 ntfs_reparse_bytes(u32 uni_len, bool is_absolute)
{
/* Header + unicode string + decorated unicode string. */
- return sizeof(short) * (2 * uni_len + 4) +
+ return sizeof(short) * (2 * uni_len + (is_absolute ? 4 : 0)) +
offsetof(struct REPARSE_DATA_BUFFER,
SymbolicLinkReparseBuffer.PathBuffer);
}
@@ -1114,8 +1091,11 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
struct REPARSE_DATA_BUFFER *rp;
__le16 *rp_name;
typeof(rp->SymbolicLinkReparseBuffer) *rs;
+ bool is_absolute;
+
+ is_absolute = (strlen(symname) > 1 && symname[1] == ':');
- rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS);
+ rp = kzalloc(ntfs_reparse_bytes(2 * size + 2, is_absolute), GFP_NOFS);
if (!rp)
return ERR_PTR(-ENOMEM);
@@ -1130,7 +1110,7 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
goto out;
/* err = the length of unicode name of symlink. */
- *nsize = ntfs_reparse_bytes(err);
+ *nsize = ntfs_reparse_bytes(err, is_absolute);
if (*nsize > sbi->reparse.max_size) {
err = -EFBIG;
@@ -1150,7 +1130,7 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
/* PrintName + SubstituteName. */
rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err);
- rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8);
+ rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + (is_absolute ? 8 : 0));
rs->PrintNameLength = rs->SubstituteNameOffset;
/*
@@ -1158,16 +1138,18 @@ ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
* parse this path.
* 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE).
*/
- rs->Flags = 0;
+ rs->Flags = cpu_to_le32(is_absolute ? 0 : SYMLINK_FLAG_RELATIVE);
- memmove(rp_name + err + 4, rp_name, sizeof(short) * err);
+ memmove(rp_name + err + (is_absolute ? 4 : 0), rp_name, sizeof(short) * err);
- /* Decorate SubstituteName. */
- rp_name += err;
- rp_name[0] = cpu_to_le16('\\');
- rp_name[1] = cpu_to_le16('?');
- rp_name[2] = cpu_to_le16('?');
- rp_name[3] = cpu_to_le16('\\');
+ if (is_absolute) {
+ /* Decorate SubstituteName. */
+ rp_name += err;
+ rp_name[0] = cpu_to_le16('\\');
+ rp_name[1] = cpu_to_le16('?');
+ rp_name[2] = cpu_to_le16('?');
+ rp_name[3] = cpu_to_le16('\\');
+ }
return rp;
out:
@@ -1296,6 +1278,12 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
goto out1;
}
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(dir_ni))) {
+ err = -EINVAL;
+ goto out2;
+ }
+
if (unlikely(ntfs3_forced_shutdown(sb))) {
err = -EIO;
goto out2;
@@ -1386,7 +1374,7 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
fname->dup.a_time = std5->cr_time;
fname->dup.alloc_size = fname->dup.data_size = 0;
fname->dup.fa = std5->fa;
- fname->dup.ea_size = fname->dup.reparse = 0;
+ fname->dup.extend_data = S_ISLNK(mode) ? IO_REPARSE_TAG_SYMLINK : 0;
dsize = le16_to_cpu(new_de->key_size);
asize = ALIGN(SIZEOF_RESIDENT + dsize, 8);
@@ -1626,27 +1614,29 @@ int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
inode->i_flags |= S_NOSEC;
}
- /*
- * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute.
- * The packed size of extended attribute is stored in direntry too.
- * 'fname' here points to inside new_de.
- */
- err = ntfs_save_wsl_perm(inode, &fname->dup.ea_size);
- if (err)
- goto out6;
+ if (!S_ISLNK(mode)) {
+ /*
+ * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute.
+ * The packed size of extended attribute is stored in direntry too.
+ * 'fname' here points to inside new_de.
+ */
+ err = ntfs_save_wsl_perm(inode, &fname->dup.extend_data);
+ if (err)
+ goto out6;
- /*
- * update ea_size in file_name attribute too.
- * Use ni_find_attr cause layout of MFT record may be changed
- * in ntfs_init_acl and ntfs_save_wsl_perm.
- */
- attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
- if (attr) {
- struct ATTR_FILE_NAME *fn;
+ /*
+ * update ea_size in file_name attribute too.
+ * Use ni_find_attr cause layout of MFT record may be changed
+ * in ntfs_init_acl and ntfs_save_wsl_perm.
+ */
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL);
+ if (attr) {
+ struct ATTR_FILE_NAME *fn;
- fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
- if (fn)
- fn->dup.ea_size = fname->dup.ea_size;
+ fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (fn)
+ fn->dup.extend_data = fname->dup.extend_data;
+ }
}
/* We do not need to update parent directory later */
@@ -2108,5 +2098,6 @@ const struct address_space_operations ntfs_aops_cmpr = {
.read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
.dirty_folio = block_dirty_folio,
+ .direct_IO = ntfs_direct_IO,
};
// clang-format on
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index abf7e81584a9..82c8ae56beee 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -171,6 +171,10 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
@@ -191,6 +195,10 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
{
u32 size = strlen(symname);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(dir))))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
@@ -201,11 +209,11 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
/*
* ntfs_mkdir- inode_operations::mkdir
*/
-static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
- NULL, 0, NULL);
+ return ERR_PTR(ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0,
+ NULL, 0, NULL));
}
/*
@@ -216,6 +224,10 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
return -EIO;
@@ -244,7 +256,7 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
struct ntfs_inode *ni = ntfs_i(inode);
struct inode *new_inode = d_inode(new_dentry);
struct NTFS_DE *de, *new_de;
- bool is_same, is_bad;
+ bool is_same;
/*
* de - memory of PATH_MAX bytes:
* [0-1024) - original name (dentry->d_name)
@@ -256,6 +268,10 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
1024);
static_assert(PATH_MAX >= 4 * 1024);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(sb)))
return -EIO;
@@ -313,12 +329,8 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
if (dir_ni != new_dir_ni)
ni_lock_dir2(new_dir_ni);
- is_bad = false;
- err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
- if (is_bad) {
- /* Restore after failed rename failed too. */
- _ntfs_bad_inode(inode);
- } else if (!err) {
+ err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de);
+ if (!err) {
simple_rename_timestamp(dir, dentry, new_dir, new_dentry);
mark_inode_dirty(inode);
mark_inode_dirty(dir);
@@ -507,8 +519,6 @@ const struct inode_operations ntfs_dir_inode_operations = {
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
.fiemap = ntfs_fiemap,
- .fileattr_get = ntfs_fileattr_get,
- .fileattr_set = ntfs_fileattr_set,
};
const struct inode_operations ntfs_special_inode_operations = {
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 241f2ffdd920..552b97905813 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -561,8 +561,7 @@ struct NTFS_DUP_INFO {
__le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size.
__le64 data_size; // 0x28: Data attribute size <= Dataalloc_size.
enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more.
- __le16 ea_size; // 0x34: Packed EAs.
- __le16 reparse; // 0x36: Used by Reparse.
+ __le32 extend_data; // 0x34: Extended data.
}; // 0x38
@@ -717,7 +716,7 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
struct NTFS_DE *e;
u16 esize;
- if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used )
+ if (de_off >= used || size_add(de_off, sizeof(struct NTFS_DE)) > used)
return NULL;
e = Add2Ptr(hdr, de_off);
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index 382820464dee..1296e6fcc779 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -377,6 +377,13 @@ struct ntfs_inode {
*/
u8 mi_loaded;
+ /*
+ * Use this field to avoid any write(s).
+ * If inode is bad during initialization - use make_bad_inode
+ * If inode is bad during operations - use this field
+ */
+ u8 ni_bad;
+
union {
struct ntfs_index dir;
struct {
@@ -454,7 +461,6 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
int attr_force_nonresident(struct ntfs_inode *ni);
-int attr_set_compress(struct ntfs_inode *ni, bool compr);
/* Functions from attrlist.c */
void al_destroy(struct ntfs_inode *ni);
@@ -497,9 +503,6 @@ extern const struct file_operations ntfs_dir_operations;
extern const struct file_operations ntfs_legacy_dir_operations;
/* Globals from file.c */
-int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
- struct fileattr *fa);
int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -530,9 +533,6 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
struct ATTR_LIST_ENTRY **le,
struct mft_inode **mi);
-struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
- const __le16 *name, u8 name_len, CLST vcn,
- struct mft_inode **pmi);
int ni_load_all_mi(struct ntfs_inode *ni);
bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi);
int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
@@ -584,11 +584,9 @@ int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
struct NTFS_DE *de);
int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
- struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
- bool *is_bad);
+ struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de);
bool ni_is_dirty(struct inode *inode);
-int ni_set_compress(struct inode *inode, bool compr);
/* Globals from fslog.c */
bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
@@ -619,7 +617,6 @@ enum NTFS_DIRTY_FLAGS {
NTFS_DIRTY_ERROR = 2,
};
int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty);
-int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer);
int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
const void *buffer, int wait);
int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
@@ -711,14 +708,14 @@ struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
int ntfs_set_size(struct inode *inode, u64 new_size);
int ntfs_get_block(struct inode *inode, sector_t vbn,
struct buffer_head *bh_result, int create);
-int ntfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, u32 len, struct folio **foliop, void **fsdata);
-int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
- u32 len, u32 copied, struct folio *folio, void *fsdata);
+int ntfs_write_begin(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, u32 len, struct folio **foliop,
+ void **fsdata);
+int ntfs_write_end(const struct kiocb *iocb, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct folio *folio,
+ void *fsdata);
int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
int ntfs_sync_inode(struct inode *inode);
-int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
- struct inode *i2);
int inode_read_data(struct inode *inode, void *data, size_t bytes);
int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const struct cpu_str *uni,
@@ -885,7 +882,7 @@ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
extern const struct xattr_handler *const ntfs_xattr_handlers[];
-int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
+int ntfs_save_wsl_perm(struct inode *inode, __le32 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
/* globals from lznt.c */
@@ -1036,6 +1033,11 @@ static inline bool is_compressed(const struct ntfs_inode *ni)
(ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
}
+static inline bool is_bad_ni(const struct ntfs_inode *ni)
+{
+ return ni->ni_bad;
+}
+
static inline int ni_ext_compress_bits(const struct ntfs_inode *ni)
{
return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 6a0f6b0a3ab2..ddff94c091b8 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -555,6 +555,55 @@ static const struct proc_ops ntfs3_label_fops = {
.proc_write = ntfs3_label_write,
};
+static void ntfs_create_procdir(struct super_block *sb)
+{
+ struct proc_dir_entry *e;
+
+ if (!proc_info_root)
+ return;
+
+ e = proc_mkdir(sb->s_id, proc_info_root);
+ if (e) {
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ proc_create_data("volinfo", 0444, e,
+ &ntfs3_volinfo_fops, sb);
+ proc_create_data("label", 0644, e,
+ &ntfs3_label_fops, sb);
+ sbi->procdir = e;
+ }
+}
+
+static void ntfs_remove_procdir(struct super_block *sb)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ if (!sbi->procdir)
+ return;
+
+ remove_proc_entry("label", sbi->procdir);
+ remove_proc_entry("volinfo", sbi->procdir);
+ remove_proc_entry(sb->s_id, proc_info_root);
+ sbi->procdir = NULL;
+}
+
+static void ntfs_create_proc_root(void)
+{
+ proc_info_root = proc_mkdir("fs/ntfs3", NULL);
+}
+
+static void ntfs_remove_proc_root(void)
+{
+ if (proc_info_root) {
+ remove_proc_entry("fs/ntfs3", NULL);
+ proc_info_root = NULL;
+ }
+}
+#else
+static void ntfs_create_procdir(struct super_block *sb) {}
+static void ntfs_remove_procdir(struct super_block *sb) {}
+static void ntfs_create_proc_root(void) {}
+static void ntfs_remove_proc_root(void) {}
#endif
static struct kmem_cache *ntfs_inode_cachep;
@@ -644,15 +693,7 @@ static void ntfs_put_super(struct super_block *sb)
{
struct ntfs_sb_info *sbi = sb->s_fs_info;
-#ifdef CONFIG_PROC_FS
- // Remove /proc/fs/ntfs3/..
- if (sbi->procdir) {
- remove_proc_entry("label", sbi->procdir);
- remove_proc_entry("volinfo", sbi->procdir);
- remove_proc_entry(sb->s_id, proc_info_root);
- sbi->procdir = NULL;
- }
-#endif
+ ntfs_remove_procdir(sb);
/* Mark rw ntfs as clear, if possible. */
ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
@@ -1182,7 +1223,8 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_export_op = &ntfs_export_ops;
sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
sb->s_xattr = ntfs_xattr_handlers;
- sb->s_d_op = options->nocase ? &ntfs_dentry_ops : NULL;
+ if (options->nocase)
+ set_default_d_op(sb, &ntfs_dentry_ops);
options->nls = ntfs_load_nls(options->nls_name);
if (IS_ERR(options->nls)) {
@@ -1590,20 +1632,7 @@ load_root:
kfree(boot2);
}
-#ifdef CONFIG_PROC_FS
- /* Create /proc/fs/ntfs3/.. */
- if (proc_info_root) {
- struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root);
- static_assert((S_IRUGO | S_IWUSR) == 0644);
- if (e) {
- proc_create_data("volinfo", S_IRUGO, e,
- &ntfs3_volinfo_fops, sb);
- proc_create_data("label", S_IRUGO | S_IWUSR, e,
- &ntfs3_label_fops, sb);
- sbi->procdir = e;
- }
- }
-#endif
+ ntfs_create_procdir(sb);
if (is_legacy_ntfs(sb))
sb->s_flags |= SB_RDONLY;
@@ -1853,14 +1882,11 @@ static int __init init_ntfs_fs(void)
if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
-#ifdef CONFIG_PROC_FS
- /* Create "/proc/fs/ntfs3" */
- proc_info_root = proc_mkdir("fs/ntfs3", NULL);
-#endif
+ ntfs_create_proc_root();
err = ntfs3_init_bitmap();
if (err)
- return err;
+ goto out2;
ntfs_inode_cachep = kmem_cache_create(
"ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
@@ -1880,6 +1906,8 @@ out:
kmem_cache_destroy(ntfs_inode_cachep);
out1:
ntfs3_exit_bitmap();
+out2:
+ ntfs_remove_proc_root();
return err;
}
@@ -1890,11 +1918,7 @@ static void __exit exit_ntfs_fs(void)
unregister_filesystem(&ntfs_fs_type);
unregister_as_ntfs_legacy();
ntfs3_exit_bitmap();
-
-#ifdef CONFIG_PROC_FS
- if (proc_info_root)
- remove_proc_entry("fs/ntfs3", NULL);
-#endif
+ ntfs_remove_proc_root();
}
MODULE_LICENSE("GPL");
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index e0055dcf8fe3..e519e21596a7 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -313,7 +313,7 @@ out:
static noinline int ntfs_set_ea(struct inode *inode, const char *name,
size_t name_len, const void *value,
size_t val_size, int flags, bool locked,
- __le16 *ea_size)
+ __le32 *ea_size)
{
struct ntfs_inode *ni = ntfs_i(inode);
struct ntfs_sb_info *sbi = ni->mi.sbi;
@@ -522,7 +522,7 @@ update_ea:
if (ea_info.size_pack != size_pack)
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
if (ea_size)
- *ea_size = ea_info.size_pack;
+ *ea_size = ea_info.size;
mark_inode_dirty(&ni->vfs_inode);
out:
@@ -552,6 +552,10 @@ struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
int err;
void *buf;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return ERR_PTR(-EINVAL);
+
/* Allocate PATH_MAX bytes. */
buf = __getname();
if (!buf)
@@ -600,6 +604,10 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap,
int flags;
umode_t mode;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ntfs_i(inode))))
+ return -EINVAL;
+
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -730,6 +738,10 @@ ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct ntfs_inode *ni = ntfs_i(inode);
ssize_t ret;
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (!(ni->ni_flags & NI_FLAG_EA)) {
/* no xattr in file */
return 0;
@@ -751,6 +763,10 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ /* Avoid any operation if inode is bad. */
+ if (unlikely(is_bad_ni(ni)))
+ return -EINVAL;
+
if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
return -EIO;
@@ -950,7 +966,7 @@ out:
*
* save uid/gid/mode in xattr
*/
-int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size)
+int ntfs_save_wsl_perm(struct inode *inode, __le32 *ea_size)
{
int err;
__le32 value;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4414743b638e..821cb7874685 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1803,6 +1803,14 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
el = root_el;
while (el->l_tree_depth) {
+ if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) {
+ ocfs2_error(ocfs2_metadata_cache_get_super(ci),
+ "Owner %llu has invalid tree depth %u in extent list\n",
+ (unsigned long long)ocfs2_metadata_cache_owner(ci),
+ le16_to_cpu(el->l_tree_depth));
+ ret = -EROFS;
+ goto out;
+ }
if (le16_to_cpu(el->l_next_free_rec) == 0) {
ocfs2_error(ocfs2_metadata_cache_get_super(ci),
"Owner %llu has empty extent list at depth %u\n",
@@ -6910,6 +6918,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end,
if (IS_ERR(folios[numfolios])) {
ret = PTR_ERR(folios[numfolios]);
mlog_errno(ret);
+ folios[numfolios] = NULL;
goto out;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5bbeb6fbb1ac..76c86f1c2b1c 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -46,7 +46,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh = NULL;
struct buffer_head *buffer_cache_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- void *kaddr;
trace_ocfs2_symlink_get_block(
(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -91,17 +90,11 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
* could've happened. Since we've got a reference on
* the bh, even if it commits while we're doing the
* copy, the data is still good. */
- if (buffer_jbd(buffer_cache_bh)
- && ocfs2_inode_is_new(inode)) {
- kaddr = kmap_atomic(bh_result->b_page);
- if (!kaddr) {
- mlog(ML_ERROR, "couldn't kmap!\n");
- goto bail;
- }
- memcpy(kaddr + (bh_result->b_size * iblock),
- buffer_cache_bh->b_data,
- bh_result->b_size);
- kunmap_atomic(kaddr);
+ if (buffer_jbd(buffer_cache_bh) && ocfs2_inode_is_new(inode)) {
+ memcpy_to_folio(bh_result->b_folio,
+ bh_result->b_size * iblock,
+ buffer_cache_bh->b_data,
+ bh_result->b_size);
set_buffer_uptodate(bh_result);
}
brelse(buffer_cache_bh);
@@ -920,7 +913,7 @@ static void ocfs2_write_failure(struct inode *inode,
ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
user_pos, user_len);
- block_commit_write(&folio->page, from, to);
+ block_commit_write(folio, from, to);
}
}
}
@@ -1078,6 +1071,7 @@ static int ocfs2_grab_folios_for_write(struct address_space *mapping,
if (IS_ERR(wc->w_folios[i])) {
ret = PTR_ERR(wc->w_folios[i]);
mlog_errno(ret);
+ wc->w_folios[i] = NULL;
goto out;
}
}
@@ -1863,7 +1857,8 @@ out:
return ret;
}
-static int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+static int ocfs2_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -2012,7 +2007,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos,
ocfs2_jbd2_inode_add_write(handle, inode,
start_byte, length);
}
- block_commit_write(&folio->page, from, to);
+ block_commit_write(folio, from, to);
}
}
@@ -2054,7 +2049,8 @@ out:
return copied;
}
-static int ocfs2_write_end(struct file *file, struct address_space *mapping,
+static int ocfs2_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0f46b22561d6..b05d4e9d13b2 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -724,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work)
if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) {
/* we shouldn't flush as we're in the thread, the
* races with pending sc work structs are harmless */
- del_timer_sync(&sc->sc_idle_timeout);
+ timer_delete_sync(&sc->sc_idle_timeout);
o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
sc_put(sc);
kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR);
@@ -1483,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
sc_put(sc);
}
-/* socket shutdown does a del_timer_sync against this as it tears down.
+/* socket shutdown does a timer_delete_sync against this as it tears down.
* we can't start this timer until we've got to the point in sc buildup
* where shutdown is going to be involved */
static void o2net_idle_timer(struct timer_list *t)
{
- struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout);
+ struct o2net_sock_container *sc = timer_container_of(sc, t,
+ sc_idle_timeout);
struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
#ifdef CONFIG_DEBUG_FS
unsigned long msecs = ktime_to_ms(ktime_get()) -
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 7799f4d16ce9..8c9c4825f984 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -798,6 +798,14 @@ static int ocfs2_dx_dir_lookup_rec(struct inode *inode,
}
}
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ret = ocfs2_error(inode->i_sb,
+ "Inode %lu has empty extent list at depth %u\n",
+ inode->i_ino,
+ le16_to_cpu(el->l_tree_depth));
+ goto out;
+ }
+
found = 0;
for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
rec = &el->l_recs[i];
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 67fc62a49a76..00f52812dbb0 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2632,7 +2632,7 @@ again:
dlm_reco_master_ready(dlm),
msecs_to_jiffies(1000));
if (!dlm_reco_master_ready(dlm)) {
- mlog(0, "%s: reco master taking awhile\n",
+ mlog(0, "%s: reco master taking a while\n",
dlm->name);
goto again;
}
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 2a7f36643895..5130ec44e5e1 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
* File creation. Allocate an inode, and we're done..
*/
/* SMP-safe */
-static int dlmfs_mkdir(struct mnt_idmap * idmap,
- struct inode * dir,
- struct dentry * dentry,
- umode_t mode)
+static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap,
+ struct inode * dir,
+ struct dentry * dentry,
+ umode_t mode)
{
int status;
struct inode *inode = NULL;
@@ -448,7 +448,7 @@ static int dlmfs_mkdir(struct mnt_idmap * idmap,
bail:
if (status < 0)
iput(inode);
- return status;
+ return ERR_PTR(status);
}
static int dlmfs_create(struct mnt_idmap *idmap,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e54f2c4b5a90..21d797ccccd0 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -813,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/* must not update i_size! */
- block_commit_write(&folio->page, block_start + 1, block_start + 1);
+ block_commit_write(folio, block_start + 1, block_start + 1);
}
/*
@@ -2800,7 +2800,7 @@ const struct inode_operations ocfs2_special_file_iops = {
*/
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
- .mmap = ocfs2_mmap,
+ .mmap_prepare = ocfs2_mmap_prepare,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
@@ -2850,7 +2850,7 @@ const struct file_operations ocfs2_dops = {
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = ocfs2_file_llseek,
- .mmap = ocfs2_mmap,
+ .mmap_prepare = ocfs2_mmap_prepare,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 1ad7106741f8..3ad7baf67658 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
ocfs2_filecheck_handle_entry(ent, entry);
exit:
- return (!ret ? count : ret);
+ return ret ?: count;
}
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 12e5d1f73325..14bf440ea4df 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -50,8 +50,6 @@ struct ocfs2_find_inode_args
unsigned int fi_sysfile_type;
};
-static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
-
static int ocfs2_read_locked_inode(struct inode *inode,
struct ocfs2_find_inode_args *args);
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque);
@@ -250,14 +248,77 @@ bail:
static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
{
struct ocfs2_find_inode_args *args = opaque;
+#ifdef CONFIG_LOCKDEP
+ static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES];
static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
ocfs2_file_ip_alloc_sem_key;
+#endif
inode->i_ino = args->fi_ino;
OCFS2_I(inode)->ip_blkno = args->fi_blkno;
- if (args->fi_sysfile_type != 0)
+#ifdef CONFIG_LOCKDEP
+ switch (args->fi_sysfile_type) {
+ case BAD_BLOCK_SYSTEM_INODE:
+ break;
+ case GLOBAL_INODE_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]);
+ break;
+ case SLOT_MAP_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]);
+ break;
+ case HEARTBEAT_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]);
+ break;
+ case GLOBAL_BITMAP_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]);
+ break;
+ case USER_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]);
+ break;
+ case GROUP_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]);
+ break;
+ case ORPHAN_DIR_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]);
+ break;
+ case EXTENT_ALLOC_SYSTEM_INODE:
lockdep_set_class(&inode->i_rwsem,
- &ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
+ &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]);
+ break;
+ case INODE_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]);
+ break;
+ case JOURNAL_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]);
+ break;
+ case LOCAL_ALLOC_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]);
+ break;
+ case TRUNCATE_LOG_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]);
+ break;
+ case LOCAL_USER_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]);
+ break;
+ case LOCAL_GROUP_QUOTA_SYSTEM_INODE:
+ lockdep_set_class(&inode->i_rwsem,
+ &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]);
+ break;
+ default:
+ WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type);
+ }
if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE ||
@@ -267,6 +328,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
else
lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
&ocfs2_file_ip_alloc_sem_key);
+#endif
return 0;
}
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7ae96fb8807a..db14c92302a1 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -62,7 +62,7 @@ static inline int o2info_coherent(struct ocfs2_info_request *req)
return (!(req->ir_flags & OCFS2_INFO_FL_NON_COHERENT));
}
-int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
unsigned int flags;
@@ -83,7 +83,7 @@ int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ocfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
unsigned int flags = fa->flags;
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 48a5fdfe87a1..4a1c2313b429 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -11,9 +11,9 @@
#ifndef OCFS2_IOCTL_PROTO_H
#define OCFS2_IOCTL_PROTO_H
-int ocfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ocfs2_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int ocfs2_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f1b4b3e611cb..e5f58ff2175f 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -174,7 +174,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
struct ocfs2_recovery_map *rm;
mutex_init(&osb->recovery_lock);
- osb->disable_recovery = 0;
+ osb->recovery_state = OCFS2_REC_ENABLED;
osb->recovery_thread_task = NULL;
init_waitqueue_head(&osb->recovery_event);
@@ -190,31 +190,53 @@ int ocfs2_recovery_init(struct ocfs2_super *osb)
return 0;
}
-/* we can't grab the goofy sem lock from inside wait_event, so we use
- * memory barriers to make sure that we'll see the null task before
- * being woken up */
static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
{
- mb();
return osb->recovery_thread_task != NULL;
}
-void ocfs2_recovery_exit(struct ocfs2_super *osb)
+static void ocfs2_recovery_disable(struct ocfs2_super *osb,
+ enum ocfs2_recovery_state state)
{
- struct ocfs2_recovery_map *rm;
-
- /* disable any new recovery threads and wait for any currently
- * running ones to exit. Do this before setting the vol_state. */
mutex_lock(&osb->recovery_lock);
- osb->disable_recovery = 1;
+ /*
+ * If recovery thread is not running, we can directly transition to
+ * final state.
+ */
+ if (!ocfs2_recovery_thread_running(osb)) {
+ osb->recovery_state = state + 1;
+ goto out_lock;
+ }
+ osb->recovery_state = state;
+ /* Wait for recovery thread to acknowledge state transition */
+ wait_event_cmd(osb->recovery_event,
+ !ocfs2_recovery_thread_running(osb) ||
+ osb->recovery_state >= state + 1,
+ mutex_unlock(&osb->recovery_lock),
+ mutex_lock(&osb->recovery_lock));
+out_lock:
mutex_unlock(&osb->recovery_lock);
- wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
- /* At this point, we know that no more recovery threads can be
- * launched, so wait for any recovery completion work to
- * complete. */
+ /*
+ * At this point we know that no more recovery work can be queued so
+ * wait for any recovery completion work to complete.
+ */
if (osb->ocfs2_wq)
flush_workqueue(osb->ocfs2_wq);
+}
+
+void ocfs2_recovery_disable_quota(struct ocfs2_super *osb)
+{
+ ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE);
+}
+
+void ocfs2_recovery_exit(struct ocfs2_super *osb)
+{
+ struct ocfs2_recovery_map *rm;
+
+ /* disable any new recovery threads and wait for any currently
+ * running ones to exit. Do this before setting the vol_state. */
+ ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE);
/*
* Now that recovery is shut down, and the osb is about to be
@@ -1249,7 +1271,7 @@ static int ocfs2_force_read_journal(struct inode *inode)
}
for (i = 0; i < p_blocks; i++, p_blkno++) {
- bh = __find_get_block(osb->sb->s_bdev, p_blkno,
+ bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno,
osb->sb->s_blocksize);
/* block not cached. */
if (!bh)
@@ -1472,6 +1494,18 @@ static int __ocfs2_recovery_thread(void *arg)
}
}
restart:
+ if (quota_enabled) {
+ mutex_lock(&osb->recovery_lock);
+ /* Confirm that recovery thread will no longer recover quotas */
+ if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) {
+ osb->recovery_state = OCFS2_REC_QUOTA_DISABLED;
+ wake_up(&osb->recovery_event);
+ }
+ if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED)
+ quota_enabled = 0;
+ mutex_unlock(&osb->recovery_lock);
+ }
+
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
@@ -1569,27 +1603,29 @@ bail:
ocfs2_free_replay_slots(osb);
osb->recovery_thread_task = NULL;
- mb(); /* sync with ocfs2_recovery_thread_running */
+ if (osb->recovery_state == OCFS2_REC_WANT_DISABLE)
+ osb->recovery_state = OCFS2_REC_DISABLED;
wake_up(&osb->recovery_event);
mutex_unlock(&osb->recovery_lock);
- if (quota_enabled)
- kfree(rm_quota);
+ kfree(rm_quota);
return status;
}
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
{
+ int was_set = -1;
+
mutex_lock(&osb->recovery_lock);
+ if (osb->recovery_state < OCFS2_REC_WANT_DISABLE)
+ was_set = ocfs2_recovery_map_set(osb, node_num);
trace_ocfs2_recovery_thread(node_num, osb->node_num,
- osb->disable_recovery, osb->recovery_thread_task,
- osb->disable_recovery ?
- -1 : ocfs2_recovery_map_set(osb, node_num));
+ osb->recovery_state, osb->recovery_thread_task, was_set);
- if (osb->disable_recovery)
+ if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE)
goto out;
if (osb->recovery_thread_task)
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index e3c3a35dc5e0..6397170f302f 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
+void ocfs2_recovery_disable_quota(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
void ocfs2_free_replay_slots(struct ocfs2_super *osb);
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 6a314e9f2b49..50e2faf64c19 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -159,8 +159,9 @@ static const struct vm_operations_struct ocfs2_file_vm_ops = {
.page_mkwrite = ocfs2_page_mkwrite,
};
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+int ocfs2_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
int ret = 0, lock_level = 0;
ret = ocfs2_inode_lock_atime(file_inode(file),
@@ -171,7 +172,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
}
ocfs2_inode_unlock(file_inode(file), lock_level);
out:
- vma->vm_ops = &ocfs2_file_vm_ops;
+ desc->vm_ops = &ocfs2_file_vm_ops;
return 0;
}
diff --git a/fs/ocfs2/mmap.h b/fs/ocfs2/mmap.h
index 1051507cc684..d21c30de6b8c 100644
--- a/fs/ocfs2/mmap.h
+++ b/fs/ocfs2/mmap.h
@@ -2,6 +2,6 @@
#ifndef OCFS2_MMAP_H
#define OCFS2_MMAP_H
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma);
+int ocfs2_mmap_prepare(struct vm_area_desc *desc);
#endif /* OCFS2_MMAP_H */
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 369c7d27befd..cbe2f8ed8897 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -617,6 +617,8 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
*/
credits += OCFS2_INODE_UPDATE_CREDITS + 1;
+ inode_lock(tl_inode);
+
/*
* ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
* logic, while we still need to lock the global_bitmap.
@@ -626,7 +628,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
if (!gb_inode) {
mlog(ML_ERROR, "unable to get global_bitmap inode\n");
ret = -EIO;
- goto out;
+ goto out_unlock_tl_inode;
}
inode_lock(gb_inode);
@@ -634,16 +636,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
if (ret) {
mlog_errno(ret);
- goto out_unlock_gb_mutex;
+ goto out_unlock_gb_inode;
}
- inode_lock(tl_inode);
-
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_unlock_tl_inode;
+ goto out_unlock;
}
new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
@@ -703,15 +703,14 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
out_commit:
ocfs2_commit_trans(osb, handle);
brelse(gd_bh);
-
-out_unlock_tl_inode:
- inode_unlock(tl_inode);
-
+out_unlock:
ocfs2_inode_unlock(gb_inode, 1);
-out_unlock_gb_mutex:
+out_unlock_gb_inode:
inode_unlock(gb_inode);
brelse(gb_bh);
iput(gb_inode);
+out_unlock_tl_inode:
+ inode_unlock(tl_inode);
out:
if (context->meta_ac) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 0ec63a1a94b8..c90b254da75e 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -142,6 +142,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
bail_add:
ret = d_splice_alias(inode, dentry);
+ if (IS_ERR(ret))
+ goto bail_unlock;
if (inode) {
/*
@@ -154,15 +156,16 @@ bail_add:
* NOTE: This dentry already has ->d_op set from
* ocfs2_get_parent() and ocfs2_get_dentry()
*/
- if (!IS_ERR_OR_NULL(ret))
+ if (ret)
dentry = ret;
status = ocfs2_dentry_attach_lock(dentry, inode,
OCFS2_I(dir)->ip_blkno);
if (status) {
mlog_errno(status);
+ if (ret)
+ dput(ret);
ret = ERR_PTR(status);
- goto bail_unlock;
}
} else
ocfs2_dentry_attach_gen(dentry);
@@ -644,10 +647,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
suballoc_loc, suballoc_bit);
}
-static int ocfs2_mkdir(struct mnt_idmap *idmap,
- struct inode *dir,
- struct dentry *dentry,
- umode_t mode)
+static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
int ret;
@@ -657,7 +660,7 @@ static int ocfs2_mkdir(struct mnt_idmap *idmap,
if (ret)
mlog_errno(ret);
- return ret;
+ return ERR_PTR(ret);
}
static int ocfs2_create(struct mnt_idmap *idmap,
@@ -1452,8 +1455,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
trace_ocfs2_rename_over_existing(
- (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
- (unsigned long long)newfe_bh->b_blocknr : 0ULL);
+ (unsigned long long)newfe_blkno, newfe_bh,
+ (unsigned long long)newfe_bh->b_blocknr);
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 51c52768132d..6aaa94c554c1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -308,6 +308,21 @@ enum ocfs2_journal_trigger_type {
void ocfs2_initialize_journal_triggers(struct super_block *sb,
struct ocfs2_triggers triggers[]);
+enum ocfs2_recovery_state {
+ OCFS2_REC_ENABLED = 0,
+ OCFS2_REC_QUOTA_WANT_DISABLE,
+ /*
+ * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for
+ * ocfs2_recovery_disable_quota() to work.
+ */
+ OCFS2_REC_QUOTA_DISABLED,
+ OCFS2_REC_WANT_DISABLE,
+ /*
+ * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work
+ */
+ OCFS2_REC_DISABLED,
+};
+
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
@@ -370,7 +385,7 @@ struct ocfs2_super
struct ocfs2_recovery_map *recovery_map;
struct ocfs2_replay_map *replay_map;
struct task_struct *recovery_thread_task;
- int disable_recovery;
+ enum ocfs2_recovery_state recovery_state;
wait_queue_head_t checkpoint_event;
struct ocfs2_journal *journal;
unsigned long osb_commit_interval;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 15d9acd456ec..e85b1ccf81be 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -273,7 +273,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
if (new)
memset(bh->b_data, 0, sb->s_blocksize);
memcpy(bh->b_data + offset, data, len);
- flush_dcache_page(bh->b_page);
+ flush_dcache_folio(bh->b_folio);
set_buffer_uptodate(bh);
unlock_buffer(bh);
ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2956d888c131..de7f12858729 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -453,8 +453,7 @@ out:
/* Sync changes in local quota file into global quota file and
* reinitialize local quota file.
- * The function expects local quota file to be already locked and
- * s_umount locked in shared mode. */
+ * The function expects local quota file to be already locked. */
static int ocfs2_recover_local_quota_file(struct inode *lqinode,
int type,
struct ocfs2_quota_recovery *rec)
@@ -588,7 +587,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
{
unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
LOCAL_GROUP_QUOTA_SYSTEM_INODE };
- struct super_block *sb = osb->sb;
struct ocfs2_local_disk_dqinfo *ldinfo;
struct buffer_head *bh;
handle_t *handle;
@@ -600,7 +598,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
"slot %u\n", osb->dev_str, slot_num);
- down_read(&sb->s_umount);
for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (list_empty(&(rec->r_list[type])))
continue;
@@ -677,8 +674,7 @@ out_put:
break;
}
out:
- up_read(&sb->s_umount);
- kfree(rec);
+ ocfs2_free_quota_recovery(rec);
return status;
}
@@ -843,8 +839,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
/*
- * s_umount held in exclusive mode protects us against racing with
- * recovery thread...
+ * ocfs2_dismount_volume() has already aborted quota recovery...
*/
if (oinfo->dqi_rec) {
ocfs2_free_quota_recovery(oinfo->dqi_rec);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 77edcd70f72c..0f045e45fa0c 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -360,7 +360,6 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
struct ocfs2_control_message_setn *msg)
{
long nodenum;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
if (ocfs2_control_get_handshake_state(file) !=
@@ -375,8 +374,7 @@ static int ocfs2_control_do_setnode_msg(struct file *file,
return -EINVAL;
msg->space = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->nodestr, 16, &nodenum))
return -EINVAL;
if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
@@ -391,7 +389,6 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
struct ocfs2_control_message_setv *msg)
{
long major, minor;
- char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
&ocfs2_user_plugin.sp_max_proto;
@@ -409,11 +406,9 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- major = simple_strtol(msg->major, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->major, 16, &major))
return -EINVAL;
- minor = simple_strtol(msg->minor, &ptr, 16);
- if (!ptr || *ptr)
+ if (kstrtol(msg->minor, 16, &minor))
return -EINVAL;
/*
@@ -441,7 +436,6 @@ static int ocfs2_control_do_down_msg(struct file *file,
struct ocfs2_control_message_down *msg)
{
long nodenum;
- char *p = NULL;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_VALID)
@@ -456,8 +450,7 @@ static int ocfs2_control_do_down_msg(struct file *file,
return -EINVAL;
msg->space1 = msg->space2 = msg->newline = '\0';
- nodenum = simple_strtol(msg->nodestr, &p, 16);
- if (!p || *p)
+ if (kstrtol(msg->nodestr, 16, &nodenum))
return -EINVAL;
if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index ddd761cf44c8..a28c127b9934 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -691,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void)
memset(&locking_max_version, 0,
sizeof(struct ocfs2_protocol_version));
ocfs2_sysfs_exit();
- if (ocfs2_table_header)
- unregister_sysctl_table(ocfs2_table_header);
+ unregister_sysctl_table(ocfs2_table_header);
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f7b483f0de2a..6ac4dcd54588 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -698,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode,
ac, cl);
- if (PTR_ERR(bg_bh) == -ENOSPC)
+ if (PTR_ERR(bg_bh) == -ENOSPC) {
+ ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
bg_bh = ocfs2_block_group_alloc_discontig(handle,
alloc_inode,
ac, cl);
+ }
if (IS_ERR(bg_bh)) {
status = PTR_ERR(bg_bh);
bg_bh = NULL;
@@ -1794,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
{
int status;
u16 chain;
+ u32 contig_bits;
u64 next_group;
struct inode *alloc_inode = ac->ac_inode;
struct buffer_head *group_bh = NULL;
@@ -1819,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
status = -ENOSPC;
/* for now, the chain search is a bit simplistic. We just use
* the 1st group with any empty bits. */
- while ((status = ac->ac_group_search(alloc_inode, group_bh,
- bits_wanted, min_bits,
- ac->ac_max_block,
- res)) == -ENOSPC) {
+ while (1) {
+ if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) {
+ contig_bits = le16_to_cpu(bg->bg_contig_free_bits);
+ if (!contig_bits)
+ contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap,
+ le16_to_cpu(bg->bg_bits), 0);
+ if (bits_wanted > contig_bits && contig_bits >= min_bits)
+ bits_wanted = contig_bits;
+ }
+
+ status = ac->ac_group_search(alloc_inode, group_bh,
+ bits_wanted, min_bits,
+ ac->ac_max_block, res);
+ if (status != -ENOSPC)
+ break;
if (!bg->bg_next_group)
break;
@@ -1982,6 +1996,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
victim = ocfs2_find_victim_chain(cl);
ac->ac_chain = victim;
+search:
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
if (!status) {
@@ -2022,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
}
}
+ /* Chains can't supply the bits_wanted contiguous space.
+ * We should switch to using every single bit when allocating
+ * from the global bitmap. */
+ if (i == le16_to_cpu(cl->cl_next_free_rec) &&
+ status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) {
+ ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG;
+ ac->ac_chain = victim;
+ goto search;
+ }
+
set_hint:
if (status != -ENOSPC) {
/* If the next search of this group is not likely to
@@ -2365,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle,
BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
- && ac->ac_which != OCFS2_AC_USE_MAIN);
+ && ac->ac_which != OCFS2_AC_USE_MAIN
+ && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG);
if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
WARN_ON(min_clusters > 1);
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index b481b834857d..bcf2ed4a8631 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -29,6 +29,7 @@ struct ocfs2_alloc_context {
#define OCFS2_AC_USE_MAIN 2
#define OCFS2_AC_USE_INODE 3
#define OCFS2_AC_USE_META 4
+#define OCFS2_AC_USE_MAIN_DISCONTIG 5
u32 ac_which;
/* these are used by the chain search */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index e0b91dbaa0ac..53daa4482406 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1812,6 +1812,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
+ /* Stop quota recovery so that we can disable quotas */
+ ocfs2_recovery_disable_quota(osb);
+
ocfs2_disable_quotas(osb);
/* All dquots should be freed by now */
@@ -1959,7 +1962,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
- sb->s_d_op = &ocfs2_dentry_ops;
+ set_default_d_op(sb, &ocfs2_dentry_ops);
sb->s_export_op = &ocfs2_export_ops;
sb->s_qcop = &dquot_quotactl_sysfile_ops;
sb->dq_op = &ocfs2_quota_operations;
@@ -2285,7 +2288,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
mlog(ML_ERROR, "found superblock with incorrect block "
"size bits: found %u, should be 9, 10, 11, or 12\n",
blksz_bits);
- } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) {
+ } else if ((1 << blksz_bits) != blksz) {
mlog(ML_ERROR, "found superblock with incorrect block "
"size: found %u, should be %u\n", 1 << blksz_bits, blksz);
} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 6bda275826d6..2ed541fccf33 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -279,10 +279,10 @@ out_free_inode:
return err;
}
-static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return omfs_add_node(dir, dentry, mode | S_IFDIR);
+ return ERR_PTR(omfs_add_node(dir, dentry, mode | S_IFDIR));
}
static int omfs_create(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 98358d405b6a..49a1de5a827f 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -310,9 +310,10 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int omfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int omfs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -332,7 +333,7 @@ const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
};
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d6cd81163030..135c49c5d848 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -9,12 +9,13 @@
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/cred.h>
-#include <linux/parser.h>
#include <linux/buffer_head.h>
#include <linux/vmalloc.h>
#include <linux/writeback.h>
#include <linux/seq_file.h>
#include <linux/crc-itu-t.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include "omfs.h"
MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>");
@@ -384,79 +385,83 @@ nomem:
return -ENOMEM;
}
+struct omfs_mount_options {
+ kuid_t s_uid;
+ kgid_t s_gid;
+ int s_dmask;
+ int s_fmask;
+};
+
enum {
- Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err
+ Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask,
};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_dmask, "dmask=%o"},
- {Opt_fmask, "fmask=%o"},
- {Opt_err, NULL},
+static const struct fs_parameter_spec omfs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_u32oct ("dmask", Opt_dmask),
+ fsparam_u32oct ("fmask", Opt_fmask),
+ {}
};
-static int parse_options(char *options, struct omfs_sb_info *sbi)
+static int
+omfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(sbi->s_uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- sbi->s_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(sbi->s_gid))
- return 0;
- break;
- case Opt_umask:
- if (match_octal(&args[0], &option))
- return 0;
- sbi->s_fmask = sbi->s_dmask = option;
- break;
- case Opt_dmask:
- if (match_octal(&args[0], &option))
- return 0;
- sbi->s_dmask = option;
- break;
- case Opt_fmask:
- if (match_octal(&args[0], &option))
- return 0;
- sbi->s_fmask = option;
- break;
- default:
- return 0;
- }
+ struct omfs_mount_options *opts = fc->fs_private;
+ int token;
+ struct fs_parse_result result;
+
+ /* All options are ignored on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
+
+ token = fs_parse(fc, omfs_param_spec, param, &result);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_uid:
+ opts->s_uid = result.uid;
+ break;
+ case Opt_gid:
+ opts->s_gid = result.gid;
+ break;
+ case Opt_umask:
+ opts->s_fmask = opts->s_dmask = result.uint_32;
+ break;
+ case Opt_dmask:
+ opts->s_dmask = result.uint_32;
+ break;
+ case Opt_fmask:
+ opts->s_fmask = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+
+ return 0;
}
-static int omfs_fill_super(struct super_block *sb, void *data, int silent)
+static void
+omfs_set_options(struct omfs_sb_info *sbi, struct omfs_mount_options *opts)
+{
+ sbi->s_uid = opts->s_uid;
+ sbi->s_gid = opts->s_gid;
+ sbi->s_dmask = opts->s_dmask;
+ sbi->s_fmask = opts->s_fmask;
+}
+
+static int omfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct buffer_head *bh, *bh2;
struct omfs_super_block *omfs_sb;
struct omfs_root_block *omfs_rb;
struct omfs_sb_info *sbi;
struct inode *root;
+ struct omfs_mount_options *parsed_opts = fc->fs_private;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -464,12 +469,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = sbi;
- sbi->s_uid = current_uid();
- sbi->s_gid = current_gid();
- sbi->s_dmask = sbi->s_fmask = current_umask();
-
- if (!parse_options((char *) data, sbi))
- goto end;
+ omfs_set_options(sbi, parsed_opts);
sb->s_maxbytes = 0xffffffff;
@@ -594,18 +594,50 @@ end:
return ret;
}
-static struct dentry *omfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int omfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, omfs_fill_super);
+}
+
+static void omfs_free_fc(struct fs_context *fc);
+
+static const struct fs_context_operations omfs_context_ops = {
+ .parse_param = omfs_parse_param,
+ .get_tree = omfs_get_tree,
+ .free = omfs_free_fc,
+};
+
+static int omfs_init_fs_context(struct fs_context *fc)
+{
+ struct omfs_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Set mount options defaults */
+ opts->s_uid = current_uid();
+ opts->s_gid = current_gid();
+ opts->s_dmask = opts->s_fmask = current_umask();
+
+ fc->fs_private = opts;
+ fc->ops = &omfs_context_ops;
+
+ return 0;
+}
+
+static void omfs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super);
+ kfree(fc->fs_private);
}
static struct file_system_type omfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "omfs",
- .mount = omfs_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
+ .owner = THIS_MODULE,
+ .name = "omfs",
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = omfs_init_fs_context,
+ .parameters = omfs_param_spec,
};
MODULE_ALIAS_FS("omfs");
diff --git a/fs/open.c b/fs/open.c
index 932e5a6de63b..9655158c3885 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -60,18 +60,21 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
if (ret)
newattrs.ia_valid |= ret | ATTR_FORCE;
- inode_lock(dentry->d_inode);
+ ret = inode_lock_killable(dentry->d_inode);
+ if (ret)
+ return ret;
+
/* Note any delegations or leases have already been broken: */
ret = notify_change(idmap, dentry, &newattrs, NULL);
inode_unlock(dentry->d_inode);
return ret;
}
-long vfs_truncate(const struct path *path, loff_t length)
+int vfs_truncate(const struct path *path, loff_t length)
{
struct mnt_idmap *idmap;
struct inode *inode;
- long error;
+ int error;
inode = path->dentry->d_inode;
@@ -123,7 +126,7 @@ mnt_drop_write_and_out:
}
EXPORT_SYMBOL_GPL(vfs_truncate);
-long do_sys_truncate(const char __user *pathname, loff_t length)
+int do_sys_truncate(const char __user *pathname, loff_t length)
{
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
@@ -157,7 +160,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-long do_ftruncate(struct file *file, loff_t length, int small)
+int do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
@@ -196,7 +199,7 @@ long do_ftruncate(struct file *file, loff_t length, int small)
return error;
}
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
if (length < 0)
return -EINVAL;
@@ -251,7 +254,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- long ret;
+ int ret;
loff_t sum;
if (offset < 0 || len <= 0)
@@ -278,6 +281,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
break;
case FALLOC_FL_COLLAPSE_RANGE:
case FALLOC_FL_INSERT_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
if (mode & FALLOC_FL_KEEP_SIZE)
return -EOPNOTSUPP;
break;
@@ -460,7 +464,7 @@ static const struct cred *access_override_creds(void)
return override_creds(override_cred);
}
-static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
+static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
struct path path;
struct inode *inode;
@@ -635,7 +639,9 @@ int chmod_common(const struct path *path, umode_t mode)
if (error)
return error;
retry_deleg:
- inode_lock(inode);
+ error = inode_lock_killable(inode);
+ if (error)
+ goto out_mnt_unlock;
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
@@ -650,6 +656,7 @@ out_unlock:
if (!error)
goto retry_deleg;
}
+out_mnt_unlock:
mnt_drop_write(path->mnt);
return error;
}
@@ -769,7 +776,9 @@ retry_deleg:
return -EINVAL;
if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
return -EINVAL;
- inode_lock(inode);
+ error = inode_lock_killable(inode);
+ if (error)
+ return error;
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
setattr_should_drop_sgid(idmap, inode);
@@ -905,7 +914,8 @@ static int do_dentry_open(struct file *f,
f->f_sb_err = file_sample_sb_err(f);
if (unlikely(f->f_flags & O_PATH)) {
- f->f_mode = FMODE_PATH | FMODE_OPENED | FMODE_NONOTIFY;
+ f->f_mode = FMODE_PATH | FMODE_OPENED;
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY);
f->f_op = &empty_fops;
return 0;
}
@@ -934,12 +944,12 @@ static int do_dentry_open(struct file *f,
goto cleanup_all;
/*
- * Set FMODE_NONOTIFY_* bits according to existing permission watches.
- * If FMODE_NONOTIFY was already set for an fanotify fd, this doesn't
- * change anything.
+ * Call fsnotify open permission hook and set FMODE_NONOTIFY_* bits
+ * according to existing permission watches.
+ * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
+ * pseudo file, this call will not change the mode.
*/
- file_set_fsnotify_mode(f);
- error = fsnotify_open_perm(f);
+ error = fsnotify_open_perm_and_set_mode(f);
if (error)
goto cleanup_all;
@@ -1122,7 +1132,7 @@ struct file *dentry_open_nonotify(const struct path *path, int flags,
if (!IS_ERR(f)) {
int error;
- f->f_mode |= FMODE_NONOTIFY;
+ file_set_fsnotify_mode(f, FMODE_NONOTIFY);
error = vfs_open(path, f);
if (error) {
fput(f);
@@ -1195,14 +1205,11 @@ struct file *kernel_file_open(const struct path *path, int flags,
if (IS_ERR(f))
return f;
- f->f_path = *path;
- error = do_dentry_open(f, NULL);
+ error = vfs_open(path, f);
if (error) {
fput(f);
return ERR_PTR(error);
}
-
- fsnotify_open(f);
return f;
}
EXPORT_SYMBOL_GPL(kernel_file_open);
@@ -1408,22 +1415,23 @@ struct file *file_open_root(const struct path *root,
}
EXPORT_SYMBOL(file_open_root);
-static long do_sys_openat2(int dfd, const char __user *filename,
- struct open_how *how)
+static int do_sys_openat2(int dfd, const char __user *filename,
+ struct open_how *how)
{
struct open_flags op;
- int fd = build_open_flags(how, &op);
struct filename *tmp;
+ int err, fd;
- if (fd)
- return fd;
+ err = build_open_flags(how, &op);
+ if (unlikely(err))
+ return err;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
- if (fd >= 0) {
+ if (likely(fd >= 0)) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
@@ -1436,7 +1444,7 @@ static long do_sys_openat2(int dfd, const char __user *filename,
return fd;
}
-long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
@@ -1550,7 +1558,7 @@ int filp_close(struct file *filp, fl_owner_t id)
int retval;
retval = filp_flush(filp, id);
- fput(filp);
+ fput_close(filp);
return retval;
}
@@ -1576,13 +1584,16 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
* We're returning to user space. Don't bother
* with any delayed fput() cases.
*/
- __fput_sync(file);
+ fput_close_sync(file);
+
+ if (likely(retval == 0))
+ return 0;
/* can't restart close syscall because file table entry was cleared */
- if (unlikely(retval == -ERESTARTSYS ||
- retval == -ERESTARTNOINTR ||
- retval == -ERESTARTNOHAND ||
- retval == -ERESTART_RESTARTBLOCK))
+ if (retval == -ERESTARTSYS ||
+ retval == -ERESTARTNOINTR ||
+ retval == -ERESTARTNOHAND ||
+ retval == -ERESTART_RESTARTBLOCK)
retval = -EINTR;
return retval;
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index d68372241b30..919f99b16834 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -57,8 +57,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
int buffer_index;
ssize_t ret;
size_t copy_amount;
- int open_for_read;
- int open_for_write;
+ bool open_for_read;
+ bool open_for_write;
new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO);
if (!new_op)
@@ -398,8 +398,9 @@ static const struct vm_operations_struct orangefs_file_vm_ops = {
/*
* Memory map a region of a file.
*/
-static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int orangefs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
int ret;
ret = orangefs_revalidate_mapping(file_inode(file));
@@ -410,10 +411,11 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
"orangefs_file_mmap: called on %pD\n", file);
/* set the sequential readahead hint */
- vm_flags_mod(vma, VM_SEQ_READ, VM_RAND_READ);
+ desc->vm_flags |= VM_SEQ_READ;
+ desc->vm_flags &= ~VM_RAND_READ;
file_accessed(file);
- vma->vm_ops = &orangefs_file_vm_ops;
+ desc->vm_ops = &orangefs_file_vm_ops;
return 0;
}
@@ -574,7 +576,7 @@ const struct file_operations orangefs_file_operations = {
.read_iter = orangefs_file_read_iter,
.write_iter = orangefs_file_write_iter,
.lock = orangefs_lock,
- .mmap = orangefs_file_mmap,
+ .mmap_prepare = orangefs_file_mmap_prepare,
.open = generic_file_open,
.splice_read = orangefs_file_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index aae6d2b8767d..a01400cd41fd 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -16,60 +16,50 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-static int orangefs_writepage_locked(struct page *page,
- struct writeback_control *wbc)
+static int orangefs_writepage_locked(struct folio *folio,
+ struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct orangefs_write_range *wr = NULL;
struct iov_iter iter;
struct bio_vec bv;
- size_t len, wlen;
+ size_t wlen;
ssize_t ret;
- loff_t off;
+ loff_t len, off;
- set_page_writeback(page);
+ folio_start_writeback(folio);
len = i_size_read(inode);
- if (PagePrivate(page)) {
- wr = (struct orangefs_write_range *)page_private(page);
- WARN_ON(wr->pos >= len);
+ if (folio->private) {
+ wr = folio->private;
off = wr->pos;
- if (off + wr->len > len)
+ if ((off + wr->len > len) && (off <= len))
wlen = len - off;
else
wlen = wr->len;
+ if (wlen == 0)
+ wlen = wr->len;
} else {
WARN_ON(1);
- off = page_offset(page);
- if (off + PAGE_SIZE > len)
+ off = folio_pos(folio);
+ wlen = folio_size(folio);
+
+ if (wlen > len - off)
wlen = len - off;
- else
- wlen = PAGE_SIZE;
}
- /* Should've been handled in orangefs_invalidate_folio. */
- WARN_ON(off == len || off + wlen > len);
WARN_ON(wlen == 0);
- bvec_set_page(&bv, page, wlen, off % PAGE_SIZE);
+ bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off));
iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen);
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
len, wr, NULL, NULL);
if (ret < 0) {
- mapping_set_error(page->mapping, ret);
+ mapping_set_error(folio->mapping, ret);
} else {
ret = 0;
}
- kfree(detach_page_private(page));
- return ret;
-}
-
-static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
-{
- int ret;
- ret = orangefs_writepage_locked(page, wbc);
- unlock_page(page);
- end_page_writeback(page);
+ kfree(folio_detach_private(folio));
return ret;
}
@@ -79,33 +69,33 @@ struct orangefs_writepages {
kuid_t uid;
kgid_t gid;
int maxpages;
- int npages;
- struct page **pages;
+ int nfolios;
+ struct address_space *mapping;
+ struct folio **folios;
struct bio_vec *bv;
};
static int orangefs_writepages_work(struct orangefs_writepages *ow,
- struct writeback_control *wbc)
+ struct writeback_control *wbc)
{
- struct inode *inode = ow->pages[0]->mapping->host;
+ struct inode *inode = ow->mapping->host;
struct orangefs_write_range *wrp, wr;
struct iov_iter iter;
ssize_t ret;
- size_t len;
- loff_t off;
+ size_t start;
+ loff_t len, off;
int i;
len = i_size_read(inode);
- for (i = 0; i < ow->npages; i++) {
- set_page_writeback(ow->pages[i]);
- bvec_set_page(&ow->bv[i], ow->pages[i],
- min(page_offset(ow->pages[i]) + PAGE_SIZE,
- ow->off + ow->len) -
- max(ow->off, page_offset(ow->pages[i])),
- i == 0 ? ow->off - page_offset(ow->pages[i]) : 0);
+ start = offset_in_folio(ow->folios[0], ow->off);
+ for (i = 0; i < ow->nfolios; i++) {
+ folio_start_writeback(ow->folios[i]);
+ bvec_set_folio(&ow->bv[i], ow->folios[i],
+ folio_size(ow->folios[i]) - start, start);
+ start = 0;
}
- iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len);
+ iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len);
WARN_ON(ow->off >= len);
if (ow->off + ow->len > len)
@@ -116,40 +106,24 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow,
wr.gid = ow->gid;
ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
0, &wr, NULL, NULL);
- if (ret < 0) {
- for (i = 0; i < ow->npages; i++) {
- mapping_set_error(ow->pages[i]->mapping, ret);
- if (PagePrivate(ow->pages[i])) {
- wrp = (struct orangefs_write_range *)
- page_private(ow->pages[i]);
- ClearPagePrivate(ow->pages[i]);
- put_page(ow->pages[i]);
- kfree(wrp);
- }
- end_page_writeback(ow->pages[i]);
- unlock_page(ow->pages[i]);
- }
- } else {
+ if (ret < 0)
+ mapping_set_error(ow->mapping, ret);
+ else
ret = 0;
- for (i = 0; i < ow->npages; i++) {
- if (PagePrivate(ow->pages[i])) {
- wrp = (struct orangefs_write_range *)
- page_private(ow->pages[i]);
- ClearPagePrivate(ow->pages[i]);
- put_page(ow->pages[i]);
- kfree(wrp);
- }
- end_page_writeback(ow->pages[i]);
- unlock_page(ow->pages[i]);
- }
+
+ for (i = 0; i < ow->nfolios; i++) {
+ wrp = folio_detach_private(ow->folios[i]);
+ kfree(wrp);
+ folio_end_writeback(ow->folios[i]);
+ folio_unlock(ow->folios[i]);
}
+
return ret;
}
static int orangefs_writepages_callback(struct folio *folio,
- struct writeback_control *wbc, void *data)
+ struct writeback_control *wbc, struct orangefs_writepages *ow)
{
- struct orangefs_writepages *ow = data;
struct orangefs_write_range *wr = folio->private;
int ret;
@@ -162,41 +136,41 @@ static int orangefs_writepages_callback(struct folio *folio,
}
ret = -1;
- if (ow->npages == 0) {
+ if (ow->nfolios == 0) {
ow->off = wr->pos;
ow->len = wr->len;
ow->uid = wr->uid;
ow->gid = wr->gid;
- ow->pages[ow->npages++] = &folio->page;
+ ow->folios[ow->nfolios++] = folio;
ret = 0;
goto done;
}
if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
ret = -1;
goto done;
}
if (ow->off + ow->len == wr->pos) {
ow->len += wr->len;
- ow->pages[ow->npages++] = &folio->page;
+ ow->folios[ow->nfolios++] = folio;
ret = 0;
goto done;
}
done:
if (ret == -1) {
- if (ow->npages) {
+ if (ow->nfolios) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
}
- ret = orangefs_writepage_locked(&folio->page, wbc);
+ ret = orangefs_writepage_locked(folio, wbc);
mapping_set_error(folio->mapping, ret);
folio_unlock(folio);
folio_end_writeback(folio);
} else {
- if (ow->npages == ow->maxpages) {
+ if (ow->nfolios == ow->maxpages) {
orangefs_writepages_work(ow, wbc);
- ow->npages = 0;
+ ow->nfolios = 0;
}
}
return ret;
@@ -207,31 +181,35 @@ static int orangefs_writepages(struct address_space *mapping,
{
struct orangefs_writepages *ow;
struct blk_plug plug;
- int ret;
+ int error;
+ struct folio *folio = NULL;
+
ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
if (!ow)
return -ENOMEM;
ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
- ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
- if (!ow->pages) {
+ ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL);
+ if (!ow->folios) {
kfree(ow);
return -ENOMEM;
}
ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
if (!ow->bv) {
- kfree(ow->pages);
+ kfree(ow->folios);
kfree(ow);
return -ENOMEM;
}
+ ow->mapping = mapping;
blk_start_plug(&plug);
- ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
- if (ow->npages)
- ret = orangefs_writepages_work(ow, wbc);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = orangefs_writepages_callback(folio, wbc, ow);
+ if (ow->nfolios)
+ error = orangefs_writepages_work(ow, wbc);
blk_finish_plug(&plug);
- kfree(ow->pages);
+ kfree(ow->folios);
kfree(ow->bv);
kfree(ow);
- return ret;
+ return error;
}
static int orangefs_launder_folio(struct folio *);
@@ -307,9 +285,10 @@ static int orangefs_read_folio(struct file *file, struct folio *folio)
return ret;
}
-static int orangefs_write_begin(struct file *file,
- struct address_space *mapping, loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int orangefs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, struct folio **foliop,
+ void **fsdata)
{
struct orangefs_write_range *wr;
struct folio *folio;
@@ -341,6 +320,8 @@ static int orangefs_write_begin(struct file *file,
wr->len += len;
goto okay;
} else {
+ wr->pos = pos;
+ wr->len = len;
ret = orangefs_launder_folio(folio);
if (ret)
return ret;
@@ -360,9 +341,10 @@ okay:
return 0;
}
-static int orangefs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied, struct folio *folio,
- void *fsdata)
+static int orangefs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
struct inode *inode = folio->mapping->host;
loff_t last_pos = pos + copied;
@@ -392,7 +374,7 @@ static int orangefs_write_end(struct file *file, struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- mark_inode_dirty_sync(file_inode(file));
+ mark_inode_dirty_sync(file_inode(iocb->ki_filp));
return copied;
}
@@ -484,7 +466,7 @@ static int orangefs_launder_folio(struct folio *folio)
};
folio_wait_writeback(folio);
if (folio_clear_dirty_for_io(folio)) {
- r = orangefs_writepage_locked(&folio->page, &wbc);
+ r = orangefs_writepage_locked(folio, &wbc);
folio_end_writeback(folio);
}
return r;
@@ -606,7 +588,6 @@ out:
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
- .writepage = orangefs_writepage,
.readahead = orangefs_readahead,
.read_folio = orangefs_read_folio,
.writepages = orangefs_writepages,
@@ -616,6 +597,7 @@ static const struct address_space_operations orangefs_address_operations = {
.invalidate_folio = orangefs_invalidate_folio,
.release_folio = orangefs_release_folio,
.free_folio = orangefs_free_folio,
+ .migrate_folio = filemap_migrate_folio,
.launder_folio = orangefs_launder_folio,
.direct_IO = orangefs_direct_IO,
};
@@ -907,7 +889,7 @@ int orangefs_update_time(struct inode *inode, int flags)
return __orangefs_setattr(inode, &iattr);
}
-static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+static int orangefs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
u64 val = 0;
int ret;
@@ -928,7 +910,7 @@ static int orangefs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
static int orangefs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
u64 val = 0;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 200558ec72f0..82395fe2b956 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -300,8 +300,8 @@ out:
return ret;
}
-static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct orangefs_inode_s *parent = ORANGEFS_I(dir);
struct orangefs_kernel_op_s *new_op;
@@ -312,7 +312,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR);
if (!new_op)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
new_op->upcall.req.mkdir.parent_refn = parent->refn;
@@ -366,7 +366,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
__orangefs_setattr(dir, &iattr);
out:
op_release(new_op);
- return ret;
+ return ERR_PTR(ret);
}
static int orangefs_rename(struct mnt_idmap *idmap,
diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c
index edcca4beb765..b562d3dbc76b 100644
--- a/fs/orangefs/orangefs-bufmap.c
+++ b/fs/orangefs/orangefs-bufmap.c
@@ -197,18 +197,6 @@ int orangefs_bufmap_size_query(void)
return size;
}
-int orangefs_bufmap_shift_query(void)
-{
- struct orangefs_bufmap *bufmap;
- int shift = 0;
- spin_lock(&orangefs_bufmap_lock);
- bufmap = __orangefs_bufmap;
- if (bufmap)
- shift = bufmap->desc_shift;
- spin_unlock(&orangefs_bufmap_lock);
- return shift;
-}
-
static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);
@@ -532,16 +520,3 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
}
return 0;
}
-
-void orangefs_bufmap_page_fill(void *page_to,
- int buffer_index,
- int slot_index)
-{
- struct orangefs_bufmap_desc *from;
- void *page_from;
-
- from = &__orangefs_bufmap->desc_array[buffer_index];
- page_from = kmap_atomic(from->page_array[slot_index]);
- memcpy(page_to, page_from, PAGE_SIZE);
- kunmap_atomic(page_from);
-}
diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h
index 75b2d2833af1..4231175ccdb2 100644
--- a/fs/orangefs/orangefs-bufmap.h
+++ b/fs/orangefs/orangefs-bufmap.h
@@ -10,8 +10,6 @@
int orangefs_bufmap_size_query(void);
-int orangefs_bufmap_shift_query(void);
-
int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc);
void orangefs_bufmap_finalize(void);
@@ -34,6 +32,5 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter,
int buffer_index,
size_t size);
-void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index);
#endif /* __ORANGEFS_BUFMAP_H */
diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h
index 6e079d4230d0..d4463534cec6 100644
--- a/fs/orangefs/orangefs-debug.h
+++ b/fs/orangefs/orangefs-debug.h
@@ -43,47 +43,4 @@
#define GOSSIP_MAX_NR 16
#define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1)
-/* a private internal type */
-struct __keyword_mask_s {
- const char *keyword;
- __u64 mask_val;
-};
-
-/*
- * Map all kmod keywords to kmod debug masks here. Keep this
- * structure "packed":
- *
- * "all" is always last...
- *
- * keyword mask_val index
- * foo 1 0
- * bar 2 1
- * baz 4 2
- * qux 8 3
- * . . .
- */
-static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
- {"super", GOSSIP_SUPER_DEBUG},
- {"inode", GOSSIP_INODE_DEBUG},
- {"file", GOSSIP_FILE_DEBUG},
- {"dir", GOSSIP_DIR_DEBUG},
- {"utils", GOSSIP_UTILS_DEBUG},
- {"wait", GOSSIP_WAIT_DEBUG},
- {"acl", GOSSIP_ACL_DEBUG},
- {"dcache", GOSSIP_DCACHE_DEBUG},
- {"dev", GOSSIP_DEV_DEBUG},
- {"name", GOSSIP_NAME_DEBUG},
- {"bufmap", GOSSIP_BUFMAP_DEBUG},
- {"cache", GOSSIP_CACHE_DEBUG},
- {"debugfs", GOSSIP_DEBUGFS_DEBUG},
- {"xattr", GOSSIP_XATTR_DEBUG},
- {"init", GOSSIP_INIT_DEBUG},
- {"sysfs", GOSSIP_SYSFS_DEBUG},
- {"none", GOSSIP_NO_DEBUG},
- {"all", GOSSIP_MAX_DEBUG}
-};
-
-static const int num_kmod_keyword_mask_map = (int)
- (ARRAY_SIZE(s_kmod_keyword_mask_map));
-
#endif /* __ORANGEFS_DEBUG_H */
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 9729f071c5aa..1c375fb65018 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -44,6 +44,49 @@
#include "protocol.h"
#include "orangefs-kernel.h"
+/* a private internal type */
+struct __keyword_mask_s {
+ const char *keyword;
+ __u64 mask_val;
+};
+
+/*
+ * Map all kmod keywords to kmod debug masks here. Keep this
+ * structure "packed":
+ *
+ * "all" is always last...
+ *
+ * keyword mask_val index
+ * foo 1 0
+ * bar 2 1
+ * baz 4 2
+ * qux 8 3
+ * . . .
+ */
+static struct __keyword_mask_s s_kmod_keyword_mask_map[] = {
+ {"super", GOSSIP_SUPER_DEBUG},
+ {"inode", GOSSIP_INODE_DEBUG},
+ {"file", GOSSIP_FILE_DEBUG},
+ {"dir", GOSSIP_DIR_DEBUG},
+ {"utils", GOSSIP_UTILS_DEBUG},
+ {"wait", GOSSIP_WAIT_DEBUG},
+ {"acl", GOSSIP_ACL_DEBUG},
+ {"dcache", GOSSIP_DCACHE_DEBUG},
+ {"dev", GOSSIP_DEV_DEBUG},
+ {"name", GOSSIP_NAME_DEBUG},
+ {"bufmap", GOSSIP_BUFMAP_DEBUG},
+ {"cache", GOSSIP_CACHE_DEBUG},
+ {"debugfs", GOSSIP_DEBUGFS_DEBUG},
+ {"xattr", GOSSIP_XATTR_DEBUG},
+ {"init", GOSSIP_INIT_DEBUG},
+ {"sysfs", GOSSIP_SYSFS_DEBUG},
+ {"none", GOSSIP_NO_DEBUG},
+ {"all", GOSSIP_MAX_DEBUG}
+};
+
+static const int num_kmod_keyword_mask_map = (int)
+ (ARRAY_SIZE(s_kmod_keyword_mask_map));
+
#define DEBUG_HELP_STRING_SIZE 4096
#define HELP_STRING_UNINITIALIZED \
"Client Debug Keywords are unknown until the first time\n" \
@@ -353,7 +396,7 @@ static ssize_t orangefs_debug_read(struct file *file,
goto out;
mutex_lock(&orangefs_debug_lock);
- sprintf_ret = sprintf(buf, "%s", (char *)file->private_data);
+ sprintf_ret = scnprintf(buf, ORANGEFS_MAX_DEBUG_STRING_LEN, "%s", (char *)file->private_data);
mutex_unlock(&orangefs_debug_lock);
read_ret = simple_read_from_buffer(ubuf, count, ppos, buf, sprintf_ret);
@@ -392,9 +435,9 @@ static ssize_t orangefs_debug_write(struct file *file,
* Thwart users who try to jamb a ridiculous number
* of bytes into the debug file...
*/
- if (count > ORANGEFS_MAX_DEBUG_STRING_LEN + 1) {
+ if (count > ORANGEFS_MAX_DEBUG_STRING_LEN) {
silly = count;
- count = ORANGEFS_MAX_DEBUG_STRING_LEN + 1;
+ count = ORANGEFS_MAX_DEBUG_STRING_LEN;
}
buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
@@ -726,8 +769,8 @@ static void do_k_string(void *k_mask, int index)
if (*mask & s_kmod_keyword_mask_map[index].mask_val) {
if ((strlen(kernel_debug_string) +
- strlen(s_kmod_keyword_mask_map[index].keyword))
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 1) {
+ strlen(s_kmod_keyword_mask_map[index].keyword) + 1)
+ < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcat(kernel_debug_string,
s_kmod_keyword_mask_map[index].keyword);
strcat(kernel_debug_string, ",");
@@ -754,7 +797,7 @@ static void do_c_string(void *c_mask, int index)
(mask->mask2 & cdm_array[index].mask2)) {
if ((strlen(client_debug_string) +
strlen(cdm_array[index].keyword) + 1)
- < ORANGEFS_MAX_DEBUG_STRING_LEN - 2) {
+ < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcat(client_debug_string,
cdm_array[index].keyword);
strcat(client_debug_string, ",");
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 3d4b883a7660..3e153c2f6b82 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -32,6 +32,8 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/vmalloc.h>
#include <linux/aio.h>
@@ -328,11 +330,9 @@ void purge_waiting_ops(void);
* defined in super.c
*/
extern uint64_t orangefs_features;
+extern const struct fs_parameter_spec orangefs_fs_param_spec[];
-struct dentry *orangefs_mount(struct file_system_type *fst,
- int flags,
- const char *devname,
- void *data);
+int orangefs_init_fs_context(struct fs_context *fc);
void orangefs_kill_sb(struct super_block *sb);
int orangefs_remount(struct orangefs_sb_info_s *);
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 5ab741c60b7e..7ac16a4d2dc6 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -46,7 +46,8 @@ MODULE_PARM_DESC(hash_table_size,
static struct file_system_type orangefs_fs_type = {
.name = "pvfs2",
- .mount = orangefs_mount,
+ .init_fs_context = orangefs_init_fs_context,
+ .parameters = orangefs_fs_param_spec,
.kill_sb = orangefs_kill_sb,
.owner = THIS_MODULE,
};
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 04e15dfa504a..369455b354ef 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -217,36 +217,31 @@ static ssize_t sysfs_int_show(struct kobject *kobj,
if (!strcmp(kobj->name, ORANGEFS_KOBJ_ID)) {
if (!strcmp(attr->attr.name, "op_timeout_secs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
op_timeout_secs);
goto out;
} else if (!strcmp(attr->attr.name,
"slot_timeout_secs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
slot_timeout_secs);
goto out;
} else if (!strcmp(attr->attr.name,
"cache_timeout_msecs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
orangefs_cache_timeout_msecs);
goto out;
} else if (!strcmp(attr->attr.name,
"dcache_timeout_msecs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
orangefs_dcache_timeout_msecs);
goto out;
} else if (!strcmp(attr->attr.name,
"getattr_timeout_msecs")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%d\n",
orangefs_getattr_timeout_msecs);
goto out;
@@ -256,14 +251,12 @@ static ssize_t sysfs_int_show(struct kobject *kobj,
} else if (!strcmp(kobj->name, STATS_KOBJ_ID)) {
if (!strcmp(attr->attr.name, "reads")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%lu\n",
orangefs_stats.reads);
goto out;
} else if (!strcmp(attr->attr.name, "writes")) {
- rc = scnprintf(buf,
- PAGE_SIZE,
+ rc = sysfs_emit(buf,
"%lu\n",
orangefs_stats.writes);
goto out;
@@ -497,19 +490,18 @@ out:
if (strcmp(kobj->name, PC_KOBJ_ID)) {
if (new_op->upcall.req.param.op ==
ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) {
- rc = scnprintf(buf, PAGE_SIZE, "%d %d\n",
+ rc = sysfs_emit(buf, "%d %d\n",
(int)new_op->downcall.resp.param.u.
value32[0],
(int)new_op->downcall.resp.param.u.
value32[1]);
} else {
- rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+ rc = sysfs_emit(buf, "%d\n",
(int)new_op->downcall.resp.param.u.value64);
}
} else {
- rc = scnprintf(
+ rc = sysfs_emit(
buf,
- PAGE_SIZE,
"%s",
new_op->downcall.resp.perf_count.buffer);
}
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index eba3e357192e..f3da840758e7 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -9,7 +9,6 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-#include <linux/parser.h>
#include <linux/hashtable.h>
#include <linux/seq_file.h>
@@ -22,18 +21,16 @@ LIST_HEAD(orangefs_superblocks);
DEFINE_SPINLOCK(orangefs_superblocks_lock);
enum {
- Opt_intr,
Opt_acl,
+ Opt_intr,
Opt_local_lock,
-
- Opt_err
};
-static const match_table_t tokens = {
- { Opt_acl, "acl" },
- { Opt_intr, "intr" },
- { Opt_local_lock, "local_lock" },
- { Opt_err, NULL }
+const struct fs_parameter_spec orangefs_fs_param_spec[] = {
+ fsparam_flag ("acl", Opt_acl),
+ fsparam_flag ("intr", Opt_intr),
+ fsparam_flag ("local_lock", Opt_local_lock),
+ {}
};
uint64_t orangefs_features;
@@ -51,48 +48,30 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static int parse_mount_options(struct super_block *sb, char *options,
- int silent)
+static int orangefs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
{
- struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
- substring_t args[MAX_OPT_ARGS];
- char *p;
-
- /*
- * Force any potential flags that might be set from the mount
- * to zero, ie, initialize to unset.
- */
- sb->s_flags &= ~SB_POSIXACL;
- orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
- orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_acl:
- sb->s_flags |= SB_POSIXACL;
- break;
- case Opt_intr:
- orangefs_sb->flags |= ORANGEFS_OPT_INTR;
- break;
- case Opt_local_lock:
- orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
- break;
- default:
- goto fail;
- }
+ struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, orangefs_fs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_acl:
+ fc->sb_flags |= SB_POSIXACL;
+ break;
+ case Opt_intr:
+ orangefs_sb->flags |= ORANGEFS_OPT_INTR;
+ break;
+ case Opt_local_lock:
+ orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
+ break;
}
return 0;
-fail:
- if (!silent)
- gossip_err("Error: mount option [%s] is not supported.\n", p);
- return -EINVAL;
}
static void orangefs_inode_cache_ctor(void *req)
@@ -223,10 +202,20 @@ out_op_release:
* Remount as initiated by VFS layer. We just need to reparse the mount
* options, no need to signal pvfs2-client-core about it.
*/
-static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data)
+static int orangefs_reconfigure(struct fs_context *fc)
{
- gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n");
- return parse_mount_options(sb, data, 1);
+ struct super_block *sb = fc->root->d_sb;
+ struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
+ struct orangefs_sb_info_s *revised = fc->s_fs_info;
+ unsigned int flags;
+
+ flags = orangefs_sb->flags;
+ flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK);
+ flags |= revised->flags;
+ WRITE_ONCE(orangefs_sb->flags, flags);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n");
+ return 0;
}
/*
@@ -319,7 +308,6 @@ static const struct super_operations orangefs_s_ops = {
.write_inode = orangefs_write_inode,
.drop_inode = generic_delete_inode,
.statfs = orangefs_statfs,
- .remount_fs = orangefs_remount_fs,
.show_options = orangefs_show_options,
};
@@ -410,8 +398,8 @@ static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
}
static int orangefs_fill_sb(struct super_block *sb,
- struct orangefs_fs_mount_response *fs_mount,
- void *data, int silent)
+ struct fs_context *fc,
+ struct orangefs_fs_mount_response *fs_mount)
{
int ret;
struct inode *root;
@@ -424,17 +412,11 @@ static int orangefs_fill_sb(struct super_block *sb,
ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id;
ORANGEFS_SB(sb)->id = fs_mount->id;
- if (data) {
- ret = parse_mount_options(sb, data, silent);
- if (ret)
- return ret;
- }
-
/* Hang the xattr handlers off the superblock */
sb->s_xattr = orangefs_xattr_handlers;
sb->s_magic = ORANGEFS_SUPER_MAGIC;
sb->s_op = &orangefs_s_ops;
- sb->s_d_op = &orangefs_dentry_operations;
+ set_default_d_op(sb, &orangefs_dentry_operations);
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -470,30 +452,24 @@ static int orangefs_fill_sb(struct super_block *sb,
return 0;
}
-struct dentry *orangefs_mount(struct file_system_type *fst,
- int flags,
- const char *devname,
- void *data)
+static int orangefs_get_tree(struct fs_context *fc)
{
int ret;
struct super_block *sb = ERR_PTR(-EINVAL);
struct orangefs_kernel_op_s *new_op;
- struct dentry *d = ERR_PTR(-EINVAL);
+
+ if (!fc->source)
+ return invalf(fc, "Device name not specified.\n");
gossip_debug(GOSSIP_SUPER_DEBUG,
"orangefs_mount: called with devname %s\n",
- devname);
-
- if (!devname) {
- gossip_err("ERROR: device name not specified.\n");
- return ERR_PTR(-EINVAL);
- }
+ fc->source);
new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
if (!new_op)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname);
+ strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source);
gossip_debug(GOSSIP_SUPER_DEBUG,
"Attempting ORANGEFS Mount via host %s\n",
@@ -511,37 +487,27 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
goto free_op;
}
- sb = sget(fst, NULL, set_anon_super, flags, NULL);
+ sb = sget_fc(fc, NULL, set_anon_super_fc);
if (IS_ERR(sb)) {
- d = ERR_CAST(sb);
+ ret = PTR_ERR(sb);
orangefs_unmount(new_op->downcall.resp.fs_mount.id,
- new_op->downcall.resp.fs_mount.fs_id, devname);
- goto free_op;
- }
-
- /* alloc and init our private orangefs sb info */
- sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
- if (!ORANGEFS_SB(sb)) {
- d = ERR_PTR(-ENOMEM);
+ new_op->downcall.resp.fs_mount.fs_id,
+ fc->source);
goto free_op;
}
- ret = orangefs_fill_sb(sb,
- &new_op->downcall.resp.fs_mount, data,
- flags & SB_SILENT ? 1 : 0);
+ /* init our private orangefs sb info */
+ ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount);
- if (ret) {
- d = ERR_PTR(ret);
+ if (ret)
goto free_sb_and_op;
- }
/*
* on successful mount, store the devname and data
* used
*/
- strscpy(ORANGEFS_SB(sb)->devname, devname);
-
+ strscpy(ORANGEFS_SB(sb)->devname, fc->source);
/* mount_pending must be cleared */
ORANGEFS_SB(sb)->mount_pending = 0;
@@ -564,7 +530,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (orangefs_userspace_version >= 20906) {
new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
if (!new_op)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
new_op->upcall.req.features.features = 0;
ret = service_operation(new_op, "orangefs_features", 0);
orangefs_features = new_op->downcall.resp.features.features;
@@ -573,7 +539,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
orangefs_features = 0;
}
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
free_sb_and_op:
/* Will call orangefs_kill_sb with sb not in list. */
@@ -589,7 +556,43 @@ free_op:
op_release(new_op);
- return d;
+ return ret;
+}
+
+static void orangefs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations orangefs_context_ops = {
+ .free = orangefs_free_fc,
+ .parse_param = orangefs_parse_param,
+ .get_tree = orangefs_get_tree,
+ .reconfigure = orangefs_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+int orangefs_init_fs_context(struct fs_context *fc)
+{
+ struct orangefs_sb_info_s *osi;
+
+ osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
+ if (!osi)
+ return -ENOMEM;
+
+ /*
+ * Force any potential flags that might be set from the mount
+ * to zero, ie, initialize to unset.
+ */
+ fc->sb_flags_mask &= ~SB_POSIXACL;
+ osi->flags &= ~ORANGEFS_OPT_INTR;
+ osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
+
+ fc->s_fs_info = osi;
+ fc->ops = &orangefs_context_ops;
+ return 0;
}
void orangefs_kill_sb(struct super_block *sb)
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 0c28e5fa3407..27396fe63f6d 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -171,14 +171,14 @@ out:
static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
const struct path *new)
{
- struct fileattr oldfa = { .flags_valid = true };
- struct fileattr newfa = { .flags_valid = true };
+ struct file_kattr oldfa = { .flags_valid = true };
+ struct file_kattr newfa = { .flags_valid = true };
int err;
err = ovl_real_fileattr_get(old, &oldfa);
if (err) {
/* Ntfs-3g returns -EINVAL for "no fileattr support" */
- if (err == -ENOTTY || err == -EINVAL)
+ if (err == -EOPNOTSUPP || err == -EINVAL)
return 0;
pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n",
old->dentry, err);
@@ -517,15 +517,12 @@ static int ovl_set_upper_fh(struct ovl_fs *ofs, struct dentry *upper,
/*
* Create and install index entry.
- *
- * Caller must hold i_mutex on indexdir.
*/
static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
struct dentry *upper)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
- struct inode *dir = d_inode(indexdir);
struct dentry *index = NULL;
struct dentry *temp = NULL;
struct qstr name = { };
@@ -559,16 +556,20 @@ static int ovl_create_index(struct dentry *dentry, const struct ovl_fh *fh,
if (err)
goto out;
+ err = ovl_parent_lock(indexdir, temp);
+ if (err)
+ goto out;
index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
} else {
- err = ovl_do_rename(ofs, dir, temp, dir, index, 0);
+ err = ovl_do_rename(ofs, indexdir, temp, indexdir, index, 0);
dput(index);
}
+ ovl_parent_unlock(indexdir);
out:
if (err)
- ovl_cleanup(ofs, dir, temp);
+ ovl_cleanup(ofs, indexdir, temp);
dput(temp);
free_name:
kfree(name.name);
@@ -618,7 +619,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper);
- dput(upper);
if (!err) {
/* Restore timestamps on parent (best effort) */
@@ -626,6 +626,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c)
ovl_dentry_set_upper_alias(c->dentry);
ovl_dentry_update_reval(c->dentry, upper);
}
+ dput(upper);
}
inode_unlock(udir);
if (err)
@@ -762,7 +763,6 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
{
struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb);
struct inode *inode;
- struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
struct path path = { .mnt = ovl_upper_mnt(ofs) };
struct dentry *temp, *upper, *trap;
struct ovl_cu_creds cc;
@@ -779,9 +779,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
return err;
ovl_start_write(c->dentry);
- inode_lock(wdir);
temp = ovl_create_temp(ofs, c->workdir, &cattr);
- inode_unlock(wdir);
ovl_end_write(c->dentry);
ovl_revert_cu_creds(&cc);
@@ -794,45 +792,47 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
*/
path.dentry = temp;
err = ovl_copy_up_data(c, &path);
+ ovl_start_write(c->dentry);
+ if (err)
+ goto cleanup_unlocked;
+
+ if (S_ISDIR(c->stat.mode) && c->indexed) {
+ err = ovl_create_index(c->dentry, c->origin_fh, temp);
+ if (err)
+ goto cleanup_unlocked;
+ }
+
/*
* We cannot hold lock_rename() throughout this helper, because of
* lock ordering with sb_writers, which shouldn't be held when calling
* ovl_copy_up_data(), so lock workdir and destdir and make sure that
* temp wasn't moved before copy up completion or cleanup.
*/
- ovl_start_write(c->dentry);
trap = lock_rename(c->workdir, c->destdir);
if (trap || temp->d_parent != c->workdir) {
/* temp or workdir moved underneath us? abort without cleanup */
dput(temp);
err = -EIO;
- if (IS_ERR(trap))
- goto out;
- goto unlock;
- } else if (err) {
- goto cleanup;
+ if (!IS_ERR(trap))
+ unlock_rename(c->workdir, c->destdir);
+ goto out;
}
err = ovl_copy_up_metadata(c, temp);
if (err)
goto cleanup;
- if (S_ISDIR(c->stat.mode) && c->indexed) {
- err = ovl_create_index(c->dentry, c->origin_fh, temp);
- if (err)
- goto cleanup;
- }
-
upper = ovl_lookup_upper(ofs, c->destname.name, c->destdir,
c->destname.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto cleanup;
- err = ovl_do_rename(ofs, wdir, temp, udir, upper, 0);
+ err = ovl_do_rename(ofs, c->workdir, temp, c->destdir, upper, 0);
+ unlock_rename(c->workdir, c->destdir);
dput(upper);
if (err)
- goto cleanup;
+ goto cleanup_unlocked;
inode = d_inode(c->dentry);
if (c->metacopy_digest)
@@ -846,17 +846,17 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_inode_update(inode, temp);
if (S_ISDIR(inode->i_mode))
ovl_set_flag(OVL_WHITEOUTS, inode);
-unlock:
- unlock_rename(c->workdir, c->destdir);
out:
ovl_end_write(c->dentry);
return err;
cleanup:
- ovl_cleanup(ofs, wdir, temp);
+ unlock_rename(c->workdir, c->destdir);
+cleanup_unlocked:
+ ovl_cleanup(ofs, c->workdir, temp);
dput(temp);
- goto unlock;
+ goto out;
}
/* Copyup using O_TMPFILE which does not require cross dir locking */
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index c9993ff66fc2..dbd63a74df4b 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -24,7 +24,8 @@ MODULE_PARM_DESC(redirect_max,
static int ovl_set_redirect(struct dentry *dentry, bool samedir);
-int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
+static int ovl_cleanup_locked(struct ovl_fs *ofs, struct inode *wdir,
+ struct dentry *wdentry)
{
int err;
@@ -43,6 +44,21 @@ int ovl_cleanup(struct ovl_fs *ofs, struct inode *wdir, struct dentry *wdentry)
return err;
}
+int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir,
+ struct dentry *wdentry)
+{
+ int err;
+
+ err = ovl_parent_lock(workdir, wdentry);
+ if (err)
+ return err;
+
+ ovl_cleanup_locked(ofs, workdir->d_inode, wdentry);
+ ovl_parent_unlock(workdir);
+
+ return 0;
+}
+
struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
{
struct dentry *temp;
@@ -62,7 +78,6 @@ struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir)
return temp;
}
-/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
{
int err;
@@ -70,47 +85,52 @@ static struct dentry *ovl_whiteout(struct ovl_fs *ofs)
struct dentry *workdir = ofs->workdir;
struct inode *wdir = workdir->d_inode;
+ guard(mutex)(&ofs->whiteout_lock);
+
if (!ofs->whiteout) {
+ inode_lock_nested(wdir, I_MUTEX_PARENT);
whiteout = ovl_lookup_temp(ofs, workdir);
- if (IS_ERR(whiteout))
- goto out;
-
- err = ovl_do_whiteout(ofs, wdir, whiteout);
- if (err) {
- dput(whiteout);
- whiteout = ERR_PTR(err);
- goto out;
+ if (!IS_ERR(whiteout)) {
+ err = ovl_do_whiteout(ofs, wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
}
+ inode_unlock(wdir);
+ if (IS_ERR(whiteout))
+ return whiteout;
ofs->whiteout = whiteout;
}
if (!ofs->no_shared_whiteout) {
+ inode_lock_nested(wdir, I_MUTEX_PARENT);
whiteout = ovl_lookup_temp(ofs, workdir);
- if (IS_ERR(whiteout))
- goto out;
-
- err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
- if (!err)
- goto out;
-
- if (err != -EMLINK) {
- pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%i)\n",
- ofs->whiteout->d_inode->i_nlink, err);
+ if (!IS_ERR(whiteout)) {
+ err = ovl_do_link(ofs, ofs->whiteout, wdir, whiteout);
+ if (err) {
+ dput(whiteout);
+ whiteout = ERR_PTR(err);
+ }
+ }
+ inode_unlock(wdir);
+ if (!IS_ERR(whiteout))
+ return whiteout;
+ if (PTR_ERR(whiteout) != -EMLINK) {
+ pr_warn("Failed to link whiteout - disabling whiteout inode sharing(nlink=%u, err=%lu)\n",
+ ofs->whiteout->d_inode->i_nlink,
+ PTR_ERR(whiteout));
ofs->no_shared_whiteout = true;
}
- dput(whiteout);
}
whiteout = ofs->whiteout;
ofs->whiteout = NULL;
-out:
return whiteout;
}
-/* Caller must hold i_mutex on both workdir and dir */
-int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
struct dentry *dentry)
{
- struct inode *wdir = ofs->workdir->d_inode;
struct dentry *whiteout;
int err;
int flags = 0;
@@ -123,55 +143,29 @@ int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
if (d_is_dir(dentry))
flags = RENAME_EXCHANGE;
- err = ovl_do_rename(ofs, wdir, whiteout, dir, dentry, flags);
+ err = ovl_lock_rename_workdir(ofs->workdir, whiteout, dir, dentry);
+ if (!err) {
+ err = ovl_do_rename(ofs, ofs->workdir, whiteout, dir, dentry, flags);
+ unlock_rename(ofs->workdir, dir);
+ }
if (err)
goto kill_whiteout;
if (flags)
- ovl_cleanup(ofs, wdir, dentry);
+ ovl_cleanup(ofs, ofs->workdir, dentry);
out:
dput(whiteout);
return err;
kill_whiteout:
- ovl_cleanup(ofs, wdir, whiteout);
+ ovl_cleanup(ofs, ofs->workdir, whiteout);
goto out;
}
-int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
- struct dentry **newdentry, umode_t mode)
-{
- int err;
- struct dentry *d, *dentry = *newdentry;
-
- err = ovl_do_mkdir(ofs, dir, dentry, mode);
- if (err)
- return err;
-
- if (likely(!d_unhashed(dentry)))
- return 0;
-
- /*
- * vfs_mkdir() may succeed and leave the dentry passed
- * to it unhashed and negative. If that happens, try to
- * lookup a new hashed and positive dentry.
- */
- d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
- if (IS_ERR(d)) {
- pr_warn("failed lookup after mkdir (%pd2, err=%i).\n",
- dentry, err);
- return PTR_ERR(d);
- }
- dput(dentry);
- *newdentry = d;
-
- return 0;
-}
-
-struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
+struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent,
struct dentry *newdentry, struct ovl_cattr *attr)
{
+ struct inode *dir = parent->d_inode;
int err;
if (IS_ERR(newdentry))
@@ -191,7 +185,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
case S_IFDIR:
/* mkdir is special... */
- err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode);
+ newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode);
+ err = PTR_ERR_OR_ZERO(newdentry);
break;
case S_IFCHR:
@@ -219,7 +214,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir,
}
out:
if (err) {
- dput(newdentry);
+ if (!IS_ERR(newdentry))
+ dput(newdentry);
return ERR_PTR(err);
}
return newdentry;
@@ -228,8 +224,12 @@ out:
struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr)
{
- return ovl_create_real(ofs, d_inode(workdir),
- ovl_lookup_temp(ofs, workdir), attr);
+ struct dentry *ret;
+ inode_lock_nested(workdir->d_inode, I_MUTEX_PARENT);
+ ret = ovl_create_real(ofs, workdir,
+ ovl_lookup_temp(ofs, workdir), attr);
+ inode_unlock(workdir->d_inode);
+ return ret;
}
static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper,
@@ -282,7 +282,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode,
* XXX: if we ever use ovl_obtain_alias() to decode directory
* file handles, need to use ovl_get_inode_locked() and
* d_instantiate_new() here to prevent from creating two
- * hashed directory inode aliases.
+ * hashed directory inode aliases. We then need to return
+ * the obtained alias to ovl_mkdir().
*/
inode = ovl_get_inode(dentry->d_sb, &oip);
if (IS_ERR(inode))
@@ -331,13 +332,13 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
int err;
inode_lock_nested(udir, I_MUTEX_PARENT);
- newdentry = ovl_create_real(ofs, udir,
+ newdentry = ovl_create_real(ofs, upperdir,
ovl_lookup_upper(ofs, dentry->d_name.name,
upperdir, dentry->d_name.len),
attr);
- err = PTR_ERR(newdentry);
+ inode_unlock(udir);
if (IS_ERR(newdentry))
- goto out_unlock;
+ return PTR_ERR(newdentry);
if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) &&
!ovl_allow_offline_changes(ofs)) {
@@ -349,14 +350,12 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
err = ovl_instantiate(dentry, inode, newdentry, !!attr->hardlink, NULL);
if (err)
goto out_cleanup;
-out_unlock:
- inode_unlock(udir);
- return err;
+ return 0;
out_cleanup:
- ovl_cleanup(ofs, udir, newdentry);
+ ovl_cleanup(ofs, upperdir, newdentry);
dput(newdentry);
- goto out_unlock;
+ return err;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
@@ -364,9 +363,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
- struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
- struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
@@ -376,27 +373,25 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
- err = ovl_lock_rename_workdir(workdir, upperdir);
- if (err)
- goto out;
-
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat,
STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
if (err)
- goto out_unlock;
+ goto out;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
- goto out_unlock;
+ goto out;
upper = upperpath.dentry;
- if (upper->d_parent->d_inode != udir)
- goto out_unlock;
opaquedir = ovl_create_temp(ofs, workdir, OVL_CATTR(stat.mode));
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
- goto out_unlock;
+ goto out;
+
+ err = ovl_lock_rename_workdir(workdir, opaquedir, upperdir, upper);
+ if (err)
+ goto out_cleanup_unlocked;
err = ovl_copy_xattr(dentry->d_sb, &upperpath, opaquedir);
if (err)
@@ -412,13 +407,13 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
if (err)
goto out_cleanup;
- err = ovl_do_rename(ofs, wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
+ err = ovl_do_rename(ofs, workdir, opaquedir, upperdir, upper, RENAME_EXCHANGE);
+ unlock_rename(workdir, upperdir);
if (err)
- goto out_cleanup;
+ goto out_cleanup_unlocked;
ovl_cleanup_whiteouts(ofs, upper, list);
- ovl_cleanup(ofs, wdir, upper);
- unlock_rename(workdir, upperdir);
+ ovl_cleanup(ofs, workdir, upper);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
@@ -426,10 +421,10 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
return opaquedir;
out_cleanup:
- ovl_cleanup(ofs, wdir, opaquedir);
- dput(opaquedir);
-out_unlock:
unlock_rename(workdir, upperdir);
+out_cleanup_unlocked:
+ ovl_cleanup(ofs, workdir, opaquedir);
+ dput(opaquedir);
out:
return ERR_PTR(err);
}
@@ -448,9 +443,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *workdir = ovl_workdir(dentry);
- struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
- struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
@@ -467,15 +460,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
return err;
}
- err = ovl_lock_rename_workdir(workdir, upperdir);
- if (err)
- goto out;
-
- upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper_unlocked(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
- goto out_unlock;
+ goto out;
err = -ESTALE;
if (d_is_negative(upper) || !ovl_upper_is_whiteout(ofs, upper))
@@ -486,6 +475,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (IS_ERR(newdentry))
goto out_dput;
+ err = ovl_lock_rename_workdir(workdir, newdentry, upperdir, upper);
+ if (err)
+ goto out_cleanup_unlocked;
+
/*
* mode could have been mutilated due to umask (e.g. sgid directory)
*/
@@ -519,27 +512,27 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
- err = ovl_do_rename(ofs, wdir, newdentry, udir, upper,
+ err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper,
RENAME_EXCHANGE);
+ unlock_rename(workdir, upperdir);
if (err)
- goto out_cleanup;
+ goto out_cleanup_unlocked;
- ovl_cleanup(ofs, wdir, upper);
+ ovl_cleanup(ofs, workdir, upper);
} else {
- err = ovl_do_rename(ofs, wdir, newdentry, udir, upper, 0);
+ err = ovl_do_rename(ofs, workdir, newdentry, upperdir, upper, 0);
+ unlock_rename(workdir, upperdir);
if (err)
- goto out_cleanup;
+ goto out_cleanup_unlocked;
}
ovl_dir_modified(dentry->d_parent, false);
err = ovl_instantiate(dentry, inode, newdentry, hardlink, NULL);
if (err) {
- ovl_cleanup(ofs, udir, newdentry);
+ ovl_cleanup(ofs, upperdir, newdentry);
dput(newdentry);
}
out_dput:
dput(upper);
-out_unlock:
- unlock_rename(workdir, upperdir);
out:
if (!hardlink) {
posix_acl_release(acl);
@@ -548,7 +541,9 @@ out:
return err;
out_cleanup:
- ovl_cleanup(ofs, wdir, newdentry);
+ unlock_rename(workdir, upperdir);
+out_cleanup_unlocked:
+ ovl_cleanup(ofs, workdir, newdentry);
dput(newdentry);
goto out_dput;
}
@@ -687,10 +682,10 @@ static int ovl_create(struct mnt_idmap *idmap, struct inode *dir,
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
-static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
+ return ERR_PTR(ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL));
}
static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir,
@@ -785,15 +780,11 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
goto out;
}
- err = ovl_lock_rename_workdir(workdir, upperdir);
- if (err)
- goto out_dput;
-
- upper = ovl_lookup_upper(ofs, dentry->d_name.name, upperdir,
- dentry->d_name.len);
+ upper = ovl_lookup_upper_unlocked(ofs, dentry->d_name.name, upperdir,
+ dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
- goto out_unlock;
+ goto out_dput;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
@@ -802,17 +793,13 @@ static int ovl_remove_and_whiteout(struct dentry *dentry,
goto out_dput_upper;
}
- err = ovl_cleanup_and_whiteout(ofs, d_inode(upperdir), upper);
- if (err)
- goto out_d_drop;
+ err = ovl_cleanup_and_whiteout(ofs, upperdir, upper);
+ if (!err)
+ ovl_dir_modified(dentry->d_parent, true);
- ovl_dir_modified(dentry->d_parent, true);
-out_d_drop:
d_drop(dentry);
out_dput_upper:
dput(upper);
-out_unlock:
- unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
@@ -1097,9 +1084,9 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
int err;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
- struct dentry *olddentry;
- struct dentry *newdentry;
- struct dentry *trap;
+ struct dentry *olddentry = NULL;
+ struct dentry *newdentry = NULL;
+ struct dentry *trap, *de;
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
@@ -1212,21 +1199,23 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
goto out_revert_creds;
}
- olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
- old->d_name.len);
- err = PTR_ERR(olddentry);
- if (IS_ERR(olddentry))
+ de = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
+ old->d_name.len);
+ err = PTR_ERR(de);
+ if (IS_ERR(de))
goto out_unlock;
+ olddentry = de;
err = -ESTALE;
if (!ovl_matches_upper(old, olddentry))
- goto out_dput_old;
+ goto out_unlock;
- newdentry = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
- new->d_name.len);
- err = PTR_ERR(newdentry);
- if (IS_ERR(newdentry))
- goto out_dput_old;
+ de = ovl_lookup_upper(ofs, new->d_name.name, new_upperdir,
+ new->d_name.len);
+ err = PTR_ERR(de);
+ if (IS_ERR(de))
+ goto out_unlock;
+ newdentry = de;
old_opaque = ovl_dentry_is_opaque(old);
new_opaque = ovl_dentry_is_opaque(new);
@@ -1235,28 +1224,28 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
if (d_inode(new) && ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
- goto out_dput;
+ goto out_unlock;
} else {
if (!ovl_matches_upper(new, newdentry))
- goto out_dput;
+ goto out_unlock;
}
} else {
if (!d_is_negative(newdentry)) {
if (!new_opaque || !ovl_upper_is_whiteout(ofs, newdentry))
- goto out_dput;
+ goto out_unlock;
} else {
if (flags & RENAME_EXCHANGE)
- goto out_dput;
+ goto out_unlock;
}
}
if (olddentry == trap)
- goto out_dput;
+ goto out_unlock;
if (newdentry == trap)
- goto out_dput;
+ goto out_unlock;
if (olddentry->d_inode == newdentry->d_inode)
- goto out_dput;
+ goto out_unlock;
err = 0;
if (ovl_type_merge_or_lower(old))
@@ -1264,7 +1253,7 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
else if (is_dir && !old_opaque && ovl_type_merge(new->d_parent))
err = ovl_set_opaque_xerr(old, olddentry, -EXDEV);
if (err)
- goto out_dput;
+ goto out_unlock;
if (!overwrite && ovl_type_merge_or_lower(new))
err = ovl_set_redirect(new, samedir);
@@ -1272,15 +1261,16 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
ovl_type_merge(old->d_parent))
err = ovl_set_opaque_xerr(new, newdentry, -EXDEV);
if (err)
- goto out_dput;
+ goto out_unlock;
- err = ovl_do_rename(ofs, old_upperdir->d_inode, olddentry,
- new_upperdir->d_inode, newdentry, flags);
+ err = ovl_do_rename(ofs, old_upperdir, olddentry,
+ new_upperdir, newdentry, flags);
+ unlock_rename(new_upperdir, old_upperdir);
if (err)
- goto out_dput;
+ goto out_revert_creds;
if (cleanup_whiteout)
- ovl_cleanup(ofs, old_upperdir->d_inode, newdentry);
+ ovl_cleanup(ofs, old_upperdir, newdentry);
if (overwrite && d_inode(new)) {
if (new_is_dir)
@@ -1299,12 +1289,6 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
if (d_inode(new) && ovl_dentry_upper(new))
ovl_copyattr(d_inode(new));
-out_dput:
- dput(newdentry);
-out_dput_old:
- dput(olddentry);
-out_unlock:
- unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
ovl_revert_creds(old_cred);
if (update_nlink)
@@ -1312,9 +1296,15 @@ out_revert_creds:
else
ovl_drop_write(old);
out:
+ dput(newdentry);
+ dput(olddentry);
dput(opaquedir);
ovl_cache_free(&list);
return err;
+
+out_unlock:
+ unlock_rename(new_upperdir, old_upperdir);
+ goto out_revert_creds;
}
static int ovl_create_tmpfile(struct file *file, struct dentry *dentry,
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 444aeeccb6da..83f80fdb1567 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -385,11 +385,9 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected,
*/
take_dentry_name_snapshot(&name, real);
/*
- * No idmap handling here: it's an internal lookup. Could skip
- * permission checking altogether, but for now just use non-idmap
- * transformed ids.
+ * No idmap handling here: it's an internal lookup.
*/
- this = lookup_one_len(name.name.name, connected, name.name.len);
+ this = lookup_noperm(&name.name, connected);
release_dentry_name_snapshot(&name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 969b458100fe..f5b8877d5fe2 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -48,8 +48,8 @@ static struct file *ovl_open_realfile(const struct file *file,
if (!inode_owner_or_capable(real_idmap, realinode))
flags &= ~O_NOATIME;
- realfile = backing_file_open(&file->f_path, flags, realpath,
- current_cred());
+ realfile = backing_file_open(file_user_path(file),
+ flags, realpath, current_cred());
}
ovl_revert_creds(old_cred);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 6f0e15f86c21..ecb9f2019395 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -610,7 +610,7 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
* Introducing security_inode_fileattr_get/set() hooks would solve this issue
* properly.
*/
-static int ovl_security_fileattr(const struct path *realpath, struct fileattr *fa,
+static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa,
bool set)
{
struct file *file;
@@ -637,7 +637,7 @@ static int ovl_security_fileattr(const struct path *realpath, struct fileattr *f
return err;
}
-int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa)
{
int err;
@@ -649,7 +649,7 @@ int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa)
}
int ovl_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct path upperpath;
@@ -697,7 +697,7 @@ out:
}
/* Convert inode protection flags to fileattr flags */
-static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
+static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa)
{
BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);
@@ -712,7 +712,7 @@ static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa)
}
}
-int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
+int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
{
int err;
@@ -720,13 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa)
if (err)
return err;
- err = vfs_fileattr_get(realpath->dentry, fa);
- if (err == -ENOIOCTLCMD)
- err = -ENOTTY;
- return err;
+ return vfs_fileattr_get(realpath->dentry, fa);
}
-int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
struct path realpath;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index cea820cb3b55..76d6248b625e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -14,10 +14,9 @@
#include <linux/exportfs.h>
#include "overlayfs.h"
-#include "../internal.h" /* for vfs_path_lookup */
-
struct ovl_lookup_data {
struct super_block *sb;
+ struct dentry *dentry;
const struct ovl_layer *layer;
struct qstr name;
bool is_dir;
@@ -26,6 +25,7 @@ struct ovl_lookup_data {
bool stop;
bool last;
char *redirect;
+ char *upperredirect;
int metacopy;
/* Referring to last redirect xattr */
bool absolute_redirect;
@@ -207,8 +207,8 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
- base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt),
+ &QSTR_LEN(name, len), base);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -230,13 +230,26 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
struct dentry **ret, bool drop_negative)
{
struct ovl_fs *ofs = OVL_FS(d->sb);
- struct dentry *this;
+ struct dentry *this = NULL;
+ const char *warn;
struct path path;
int err;
bool last_element = !post[0];
bool is_upper = d->layer->idx == 0;
char val;
+ /*
+ * We allow filesystems that are case-folding capable but deny composing
+ * ovl stack from case-folded directories. If someone has enabled case
+ * folding on a directory on underlying layer, the warranty of the ovl
+ * stack is voided.
+ */
+ if (ovl_dentry_casefolded(base)) {
+ warn = "case folded parent";
+ err = -ESTALE;
+ goto out_warn;
+ }
+
this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
err = PTR_ERR(this);
@@ -246,10 +259,17 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto out_err;
}
+ if (ovl_dentry_casefolded(this)) {
+ warn = "case folded child";
+ err = -EREMOTE;
+ goto out_warn;
+ }
+
if (ovl_dentry_weird(this)) {
/* Don't support traversing automounts and other weirdness */
+ warn = "unsupported object type";
err = -EREMOTE;
- goto out_err;
+ goto out_warn;
}
path.dentry = this;
@@ -283,8 +303,9 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
} else {
if (ovl_lookup_trap_inode(d->sb, this)) {
/* Caught in a trap of overlapping layers */
+ warn = "overlapping layers";
err = -ELOOP;
- goto out_err;
+ goto out_warn;
}
if (last_element)
@@ -316,6 +337,10 @@ put_and_out:
this = NULL;
goto out;
+out_warn:
+ pr_warn_ratelimited("failed lookup in %s (%pd2, name='%.*s', err=%i): %s\n",
+ is_upper ? "upper" : "lower", base,
+ namelen, name, err, warn);
out_err:
dput(this);
return err;
@@ -759,7 +784,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
if (err)
return ERR_PTR(err);
- index = lookup_positive_unlocked(name.name, ofs->workdir, name.len);
+ index = lookup_noperm_positive_unlocked(&name, ofs->workdir);
kfree(name.name);
if (IS_ERR(index)) {
if (PTR_ERR(index) == -ENOENT)
@@ -791,8 +816,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
if (err)
return ERR_PTR(err);
- index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name,
- ofs->workdir, name.len);
+ index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), &name,
+ ofs->workdir);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
@@ -1026,6 +1051,31 @@ int ovl_verify_lowerdata(struct dentry *dentry)
return ovl_maybe_validate_verity(dentry);
}
+/*
+ * Following redirects/metacopy can have security consequences: it's like a
+ * symlink into the lower layer without the permission checks.
+ *
+ * This is only a problem if the upper layer is untrusted (e.g comes from an USB
+ * drive). This can allow a non-readable file or directory to become readable.
+ *
+ * Only following redirects when redirects are enabled disables this attack
+ * vector when not necessary.
+ */
+static bool ovl_check_follow_redirect(struct ovl_lookup_data *d)
+{
+ struct ovl_fs *ofs = OVL_FS(d->sb);
+
+ if (d->metacopy && !ofs->config.metacopy) {
+ pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", d->dentry);
+ return false;
+ }
+ if ((d->redirect || d->upperredirect) && !ovl_redirect_follow(ofs)) {
+ pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", d->dentry);
+ return false;
+ }
+ return true;
+}
+
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
@@ -1041,7 +1091,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int ctr = 0;
struct inode *inode = NULL;
bool upperopaque = false;
- char *upperredirect = NULL;
+ bool check_redirect = (ovl_redirect_follow(ofs) || ofs->numdatalayer);
struct dentry *this;
unsigned int i;
int err;
@@ -1049,12 +1099,14 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
int metacopy_size = 0;
struct ovl_lookup_data d = {
.sb = dentry->d_sb,
+ .dentry = dentry,
.name = dentry->d_name,
.is_dir = false,
.opaque = false,
.stop = false,
- .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe),
+ .last = check_redirect ? false : !ovl_numlower(poe),
.redirect = NULL,
+ .upperredirect = NULL,
.metacopy = 0,
};
@@ -1096,8 +1148,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (d.redirect) {
err = -ENOMEM;
- upperredirect = kstrdup(d.redirect, GFP_KERNEL);
- if (!upperredirect)
+ d.upperredirect = kstrdup(d.redirect, GFP_KERNEL);
+ if (!d.upperredirect)
goto out_put_upper;
if (d.redirect[0] == '/')
poe = roe;
@@ -1115,7 +1167,12 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
for (i = 0; !d.stop && i < ovl_numlower(poe); i++) {
struct ovl_path lower = ovl_lowerstack(poe)[i];
- if (!ovl_redirect_follow(ofs))
+ if (!ovl_check_follow_redirect(&d)) {
+ err = -EPERM;
+ goto out_put;
+ }
+
+ if (!check_redirect)
d.last = i == ovl_numlower(poe) - 1;
else if (d.is_dir || !ofs->numdatalayer)
d.last = lower.layer->idx == ovl_numlower(roe);
@@ -1128,13 +1185,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (!this)
continue;
- if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) {
- dput(this);
- err = -EPERM;
- pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry);
- goto out_put;
- }
-
/*
* If no origin fh is stored in upper of a merge dir, store fh
* of lower dir and set upper parent "impure".
@@ -1187,23 +1237,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
ctr++;
}
- /*
- * Following redirects can have security consequences: it's like
- * a symlink into the lower layer without the permission checks.
- * This is only a problem if the upper layer is untrusted (e.g
- * comes from an USB drive). This can allow a non-readable file
- * or directory to become readable.
- *
- * Only following redirects when redirects are enabled disables
- * this attack vector when not necessary.
- */
- err = -EPERM;
- if (d.redirect && !ovl_redirect_follow(ofs)) {
- pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n",
- dentry);
- goto out_put;
- }
-
if (d.stop)
break;
@@ -1214,10 +1247,16 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
}
}
- /* Defer lookup of lowerdata in data-only layers to first access */
+ /*
+ * Defer lookup of lowerdata in data-only layers to first access.
+ * Don't require redirect=follow and metacopy=on in this case.
+ */
if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) {
d.metacopy = 0;
ctr++;
+ } else if (!ovl_check_follow_redirect(&d)) {
+ err = -EPERM;
+ goto out_put;
}
/*
@@ -1300,20 +1339,26 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
/*
* It's safe to assign upperredirect here: the previous
- * assignment of happens only if upperdentry is non-NULL, and
+ * assignment happens only if upperdentry is non-NULL, and
* this one only if upperdentry is NULL.
*/
- upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
- if (IS_ERR(upperredirect)) {
- err = PTR_ERR(upperredirect);
- upperredirect = NULL;
+ d.upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0);
+ if (IS_ERR(d.upperredirect)) {
+ err = PTR_ERR(d.upperredirect);
+ d.upperredirect = NULL;
goto out_free_oe;
}
+
err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL);
if (err < 0)
goto out_free_oe;
- uppermetacopy = err;
+ d.metacopy = uppermetacopy = err;
metacopy_size = err;
+
+ if (!ovl_check_follow_redirect(&d)) {
+ err = -EPERM;
+ goto out_free_oe;
+ }
}
if (upperdentry || ctr) {
@@ -1321,7 +1366,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
.upperdentry = upperdentry,
.oe = oe,
.index = index,
- .redirect = upperredirect,
+ .redirect = d.upperredirect,
};
/* Store lowerdata redirect for lazy lookup */
@@ -1363,7 +1408,7 @@ out_put_upper:
kfree(origin_path);
}
dput(upperdentry);
- kfree(upperredirect);
+ kfree(d.upperredirect);
out:
kfree(d.redirect);
ovl_revert_creds(old_cred);
@@ -1396,9 +1441,15 @@ bool ovl_lower_positive(struct dentry *dentry)
struct dentry *this;
struct ovl_path *parentpath = &ovl_lowerstack(poe)[i];
+ /*
+ * We need to make a non-const copy of dentry->d_name,
+ * because lookup_one_positive_unlocked() will hash name
+ * with parentpath base, which is on another (lower fs).
+ */
this = lookup_one_positive_unlocked(
mnt_idmap(parentpath->layer->mnt),
- name->name, parentpath->dentry, name->len);
+ &QSTR_LEN(name->name, name->len),
+ parentpath->dentry);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
case -ENOENT:
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0021e2025020..bb0d7ded8e76 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -241,13 +241,16 @@ static inline int ovl_do_create(struct ovl_fs *ofs,
return err;
}
-static inline int ovl_do_mkdir(struct ovl_fs *ofs,
- struct inode *dir, struct dentry *dentry,
- umode_t mode)
+static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs,
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
{
- int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
- pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
- return err;
+ struct dentry *ret;
+
+ ret = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode);
+ pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(ret));
+ return ret;
}
static inline int ovl_do_mknod(struct ovl_fs *ofs,
@@ -352,19 +355,19 @@ static inline int ovl_do_remove_acl(struct ovl_fs *ofs, struct dentry *dentry,
return vfs_remove_acl(ovl_upper_mnt_idmap(ofs), dentry, acl_name);
}
-static inline int ovl_do_rename(struct ovl_fs *ofs, struct inode *olddir,
- struct dentry *olddentry, struct inode *newdir,
+static inline int ovl_do_rename(struct ovl_fs *ofs, struct dentry *olddir,
+ struct dentry *olddentry, struct dentry *newdir,
struct dentry *newdentry, unsigned int flags)
{
int err;
struct renamedata rd = {
.old_mnt_idmap = ovl_upper_mnt_idmap(ofs),
- .old_dir = olddir,
- .old_dentry = olddentry,
+ .old_parent = olddir,
+ .old_dentry = olddentry,
.new_mnt_idmap = ovl_upper_mnt_idmap(ofs),
- .new_dir = newdir,
- .new_dentry = newdentry,
- .flags = flags,
+ .new_parent = newdir,
+ .new_dentry = newdentry,
+ .flags = flags,
};
pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
@@ -401,7 +404,16 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs,
const char *name,
struct dentry *base, int len)
{
- return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len);
+ return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base);
+}
+
+static inline struct dentry *ovl_lookup_upper_unlocked(struct ovl_fs *ofs,
+ const char *name,
+ struct dentry *base,
+ int len)
+{
+ return lookup_one_unlocked(ovl_upper_mnt_idmap(ofs),
+ &QSTR_LEN(name, len), base);
}
static inline bool ovl_open_flags_need_copy_up(int flags)
@@ -413,6 +425,11 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
}
/* util.c */
+int ovl_parent_lock(struct dentry *parent, struct dentry *child);
+static inline void ovl_parent_unlock(struct dentry *parent)
+{
+ inode_unlock(parent->d_inode);
+}
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
void ovl_start_write(struct dentry *dentry);
@@ -445,6 +462,12 @@ void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry,
void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
struct ovl_entry *oe, unsigned int mask);
bool ovl_dentry_weird(struct dentry *dentry);
+
+static inline bool ovl_dentry_casefolded(struct dentry *dentry)
+{
+ return sb_has_encoding(dentry->d_sb) && IS_CASEFOLDED(d_inode(dentry));
+}
+
enum ovl_path_type ovl_path_type(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
@@ -532,7 +555,8 @@ bool ovl_is_inuse(struct dentry *dentry);
bool ovl_need_index(struct dentry *dentry);
int ovl_nlink_start(struct dentry *dentry);
void ovl_nlink_end(struct dentry *dentry);
-int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work,
+ struct dentry *upperdir, struct dentry *upper);
int ovl_check_metacopy_xattr(struct ovl_fs *ofs, const struct path *path,
struct ovl_metacopy *data);
int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
@@ -540,8 +564,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d,
bool ovl_is_metacopy_dentry(struct dentry *dentry);
char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding);
int ovl_ensure_verity_loaded(struct path *path);
-int ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path,
- u8 *digest_buf, int *buf_length);
int ovl_validate_verity(struct ovl_fs *ofs,
struct path *metapath,
struct path *datapath);
@@ -722,7 +744,7 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
void ovl_cache_free(struct list_head *list);
void ovl_dir_cache_free(struct inode *inode);
int ovl_check_d_type_supported(const struct path *realpath);
-int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
struct vfsmount *mnt, struct dentry *dentry, int level);
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
@@ -816,7 +838,7 @@ void ovl_copyattr(struct inode *to);
void ovl_check_protattr(struct inode *inode, struct dentry *upper);
int ovl_set_protattr(struct inode *inode, struct dentry *upper,
- struct fileattr *fa);
+ struct file_kattr *fa);
static inline void ovl_copyflags(struct inode *from, struct inode *to)
{
@@ -827,7 +849,7 @@ static inline void ovl_copyflags(struct inode *from, struct inode *to)
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
-int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct inode *dir,
+int ovl_cleanup_and_whiteout(struct ovl_fs *ofs, struct dentry *dir,
struct dentry *dentry);
struct ovl_cattr {
dev_t rdev;
@@ -838,23 +860,21 @@ struct ovl_cattr {
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
-int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir,
- struct dentry **newdentry, umode_t mode);
struct dentry *ovl_create_real(struct ovl_fs *ofs,
- struct inode *dir, struct dentry *newdentry,
+ struct dentry *parent, struct dentry *newdentry,
struct ovl_cattr *attr);
-int ovl_cleanup(struct ovl_fs *ofs, struct inode *dir, struct dentry *dentry);
+int ovl_cleanup(struct ovl_fs *ofs, struct dentry *workdir, struct dentry *dentry);
struct dentry *ovl_lookup_temp(struct ovl_fs *ofs, struct dentry *workdir);
struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
struct ovl_cattr *attr);
/* file.c */
extern const struct file_operations ovl_file_operations;
-int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
-int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
-int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa);
+int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa);
+int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int ovl_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
struct ovl_file;
struct ovl_file *ovl_file_alloc(struct file *realfile);
void ovl_file_free(struct ovl_file *of);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index cb449ab310a7..4c1bae935ced 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -51,7 +51,7 @@ struct ovl_path {
struct ovl_entry {
unsigned int __numlower;
- struct ovl_path __lowerstack[];
+ struct ovl_path __lowerstack[] __counted_by(__numlower);
};
/* private information held for overlayfs's superblock */
@@ -88,6 +88,7 @@ struct ovl_fs {
/* Shared whiteout cache */
struct dentry *whiteout;
bool no_shared_whiteout;
+ struct mutex whiteout_lock;
/* r/o snapshot of upperdir sb's only taken on volatile mounts */
errseq_t errseq;
};
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 1115c22deca0..f4e7fff909ac 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -59,6 +59,7 @@ enum ovl_opt {
Opt_metacopy,
Opt_verity,
Opt_volatile,
+ Opt_override_creds,
};
static const struct constant_table ovl_parameter_bool[] = {
@@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool),
fsparam_enum("verity", Opt_verity, ovl_parameter_verity),
fsparam_flag("volatile", Opt_volatile),
+ fsparam_flag_no("override_creds", Opt_override_creds),
{}
};
@@ -280,13 +282,11 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
return invalfc(fc, "%s is not a directory", name);
/*
- * Root dentries of case-insensitive capable filesystems might
- * not have the dentry operations set, but still be incompatible
- * with overlayfs. Check explicitly to prevent post-mount
- * failures.
+ * Allow filesystems that are case-folding capable but deny composing
+ * ovl stack from case-folded directories.
*/
- if (sb_has_encoding(path->mnt->mnt_sb))
- return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
+ if (ovl_dentry_casefolded(path->dentry))
+ return invalfc(fc, "case-insensitive directory on %s not supported", name);
if (ovl_dentry_weird(path->dentry))
return invalfc(fc, "filesystem on %s not supported", name);
@@ -662,6 +662,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_userxattr:
config->userxattr = true;
break;
+ case Opt_override_creds: {
+ const struct cred *cred = NULL;
+
+ if (result.negated) {
+ swap(cred, ofs->creator_cred);
+ put_cred(cred);
+ break;
+ }
+
+ if (!current_in_userns(fc->user_ns)) {
+ err = -EINVAL;
+ break;
+ }
+
+ cred = prepare_creds();
+ if (cred)
+ swap(cred, ofs->creator_cred);
+ else
+ err = -ENOMEM;
+
+ put_cred(cred);
+ break;
+ }
default:
pr_err("unrecognized mount option \"%s\" or missing value\n",
param->key);
@@ -772,6 +795,8 @@ int ovl_init_fs_context(struct fs_context *fc)
fc->s_fs_info = ofs;
fc->fs_private = ctx;
fc->ops = &ovl_context_ops;
+
+ mutex_init(&ofs->whiteout_lock);
return 0;
out_err:
@@ -846,18 +871,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
config->uuid = OVL_UUID_NULL;
}
- /* Resolve verity -> metacopy dependency */
- if (config->verity_mode && !config->metacopy) {
- /* Don't allow explicit specified conflicting combinations */
- if (set.metacopy) {
- pr_err("conflicting options: metacopy=off,verity=%s\n",
- ovl_verity_mode(config));
- return -EINVAL;
- }
- /* Otherwise automatically enable metacopy. */
- config->metacopy = true;
- }
-
/*
* This is to make the logic below simpler. It doesn't make any other
* difference, since redirect_dir=on is only used for upper.
@@ -865,18 +878,13 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
if (!config->upperdir && config->redirect_mode == OVL_REDIRECT_FOLLOW)
config->redirect_mode = OVL_REDIRECT_ON;
- /* Resolve verity -> metacopy -> redirect_dir dependency */
+ /* metacopy -> redirect_dir dependency */
if (config->metacopy && config->redirect_mode != OVL_REDIRECT_ON) {
if (set.metacopy && set.redirect) {
pr_err("conflicting options: metacopy=on,redirect_dir=%s\n",
ovl_redirect_mode(config));
return -EINVAL;
}
- if (config->verity_mode && set.redirect) {
- pr_err("conflicting options: verity=%s,redirect_dir=%s\n",
- ovl_verity_mode(config), ovl_redirect_mode(config));
- return -EINVAL;
- }
if (set.redirect) {
/*
* There was an explicit redirect_dir=... that resulted
@@ -945,7 +953,7 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
}
- /* Resolve userxattr -> !redirect && !metacopy && !verity dependency */
+ /* Resolve userxattr -> !redirect && !metacopy dependency */
if (config->userxattr) {
if (set.redirect &&
config->redirect_mode != OVL_REDIRECT_NOFOLLOW) {
@@ -957,11 +965,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
pr_err("conflicting options: userxattr,metacopy=on\n");
return -EINVAL;
}
- if (config->verity_mode) {
- pr_err("conflicting options: userxattr,verity=%s\n",
- ovl_verity_mode(config));
- return -EINVAL;
- }
/*
* Silently disable default setting of redirect and metacopy.
* This shall be the default in the future as well: these
@@ -1000,11 +1003,6 @@ int ovl_fs_params_verify(const struct ovl_fs_context *ctx,
*/
}
- if (ctx->nr_data > 0 && !config->metacopy) {
- pr_err("lower data-only dirs require metacopy support.\n");
- return -EINVAL;
- }
-
return 0;
}
@@ -1053,17 +1051,16 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry)
seq_printf(m, ",redirect_dir=%s",
ovl_redirect_mode(&ofs->config));
if (ofs->config.index != ovl_index_def)
- seq_printf(m, ",index=%s", ofs->config.index ? "on" : "off");
+ seq_printf(m, ",index=%s", str_on_off(ofs->config.index));
if (ofs->config.uuid != ovl_uuid_def())
seq_printf(m, ",uuid=%s", ovl_uuid_mode(&ofs->config));
if (ofs->config.nfs_export != ovl_nfs_export_def)
- seq_printf(m, ",nfs_export=%s", ofs->config.nfs_export ?
- "on" : "off");
+ seq_printf(m, ",nfs_export=%s",
+ str_on_off(ofs->config.nfs_export));
if (ofs->config.xino != ovl_xino_def() && !ovl_same_fs(ofs))
seq_printf(m, ",xino=%s", ovl_xino_mode(&ofs->config));
if (ofs->config.metacopy != ovl_metacopy_def)
- seq_printf(m, ",metacopy=%s",
- ofs->config.metacopy ? "on" : "off");
+ seq_printf(m, ",metacopy=%s", str_on_off(ofs->config.metacopy));
if (ofs->config.ovl_volatile)
seq_puts(m, ",volatile");
if (ofs->config.userxattr)
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 881ec5592da5..b65cdfce31ce 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -13,6 +13,7 @@
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/ratelimit.h>
+#include <linux/overflow.h>
#include "overlayfs.h"
struct ovl_cache_entry {
@@ -147,9 +148,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
- size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
- p = kmalloc(size, GFP_KERNEL);
+ p = kmalloc(struct_size(p, name, len + 1), GFP_KERNEL);
if (!p)
return NULL;
@@ -271,7 +271,6 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name,
static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd)
{
int err;
- struct ovl_cache_entry *p;
struct dentry *dentry, *dir = path->dentry;
const struct cred *old_cred;
@@ -280,9 +279,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data
err = down_write_killable(&dir->d_inode->i_rwsem);
if (!err) {
while (rdd->first_maybe_whiteout) {
- p = rdd->first_maybe_whiteout;
+ struct ovl_cache_entry *p =
+ rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
- dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
+ dentry = lookup_one(mnt_idmap(path->mnt),
+ &QSTR_LEN(p->name, p->len), dir);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
@@ -351,6 +352,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
+ .ctx.count = INT_MAX,
.dentry = dentry,
.list = list,
.root = root,
@@ -492,7 +494,7 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p,
}
}
/* This checks also for xwhiteouts */
- this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len);
+ this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir);
if (IS_ERR_OR_NULL(this) || !this->d_inode) {
/* Mark a stale entry */
p->is_whiteout = true;
@@ -571,6 +573,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list,
struct ovl_cache_entry *p, *n;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = list,
.root = root,
};
@@ -672,6 +675,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
struct ovl_readdir_translate *rdt =
container_of(ctx, struct ovl_readdir_translate, ctx);
struct dir_context *orig_ctx = rdt->orig_ctx;
+ bool res;
if (rdt->parent_ino && strcmp(name, "..") == 0) {
ino = rdt->parent_ino;
@@ -686,7 +690,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name,
name, namelen, rdt->xinowarn);
}
- return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type);
+ ctx->count = orig_ctx->count;
+
+ return res;
}
static bool ovl_is_impure_dir(struct file *file)
@@ -713,6 +720,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx)
const struct ovl_layer *lower_layer = ovl_layer_lower(dir);
struct ovl_readdir_translate rdt = {
.ctx.actor = ovl_fill_real,
+ .ctx.count = ctx->count,
.orig_ctx = ctx,
.xinobits = ovl_xino_bits(ofs),
.xinowarn = ovl_xino_warn(ofs),
@@ -1026,14 +1034,13 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
{
struct ovl_cache_entry *p;
- inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (WARN_ON(!p->is_whiteout || !p->is_upper))
continue;
- dentry = ovl_lookup_upper(ofs, p->name, upper, p->len);
+ dentry = ovl_lookup_upper_unlocked(ofs, p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
@@ -1041,10 +1048,9 @@ void ovl_cleanup_whiteouts(struct ovl_fs *ofs, struct dentry *upper,
continue;
}
if (dentry->d_inode)
- ovl_cleanup(ofs, upper->d_inode, dentry);
+ ovl_cleanup(ofs, upper, dentry);
dput(dentry);
}
- inode_unlock(upper->d_inode);
}
static bool ovl_check_d_type(struct dir_context *ctx, const char *name,
@@ -1073,6 +1079,7 @@ int ovl_check_d_type_supported(const struct path *realpath)
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
+ .ctx.count = INT_MAX,
.d_type_supported = false,
};
@@ -1089,11 +1096,11 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
int level)
{
int err;
- struct inode *dir = path->dentry->d_inode;
LIST_HEAD(list);
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
bool incompat = false;
@@ -1114,7 +1121,6 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
if (err)
goto out;
- inode_lock_nested(dir, I_MUTEX_PARENT);
list_for_each_entry(p, &list, l_node) {
struct dentry *dentry;
@@ -1129,39 +1135,40 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa
err = -EINVAL;
break;
}
- dentry = ovl_lookup_upper(ofs, p->name, path->dentry, p->len);
+ dentry = ovl_lookup_upper_unlocked(ofs, p->name, path->dentry, p->len);
if (IS_ERR(dentry))
continue;
if (dentry->d_inode)
- err = ovl_workdir_cleanup(ofs, dir, path->mnt, dentry, level);
+ err = ovl_workdir_cleanup(ofs, path->dentry, path->mnt,
+ dentry, level);
dput(dentry);
if (err)
break;
}
- inode_unlock(dir);
out:
ovl_cache_free(&list);
return err;
}
-int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
+int ovl_workdir_cleanup(struct ovl_fs *ofs, struct dentry *parent,
struct vfsmount *mnt, struct dentry *dentry, int level)
{
int err;
- if (!d_is_dir(dentry) || level > 1) {
- return ovl_cleanup(ofs, dir, dentry);
- }
+ if (!d_is_dir(dentry) || level > 1)
+ return ovl_cleanup(ofs, parent, dentry);
- err = ovl_do_rmdir(ofs, dir, dentry);
+ err = ovl_parent_lock(parent, dentry);
+ if (err)
+ return err;
+ err = ovl_do_rmdir(ofs, parent->d_inode, dentry);
+ ovl_parent_unlock(parent);
if (err) {
struct path path = { .mnt = mnt, .dentry = dentry };
- inode_unlock(dir);
err = ovl_workdir_cleanup_recurse(ofs, &path, level + 1);
- inode_lock_nested(dir, I_MUTEX_PARENT);
if (!err)
- err = ovl_cleanup(ofs, dir, dentry);
+ err = ovl_cleanup(ofs, parent, dentry);
}
return err;
@@ -1172,12 +1179,12 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
int err;
struct dentry *indexdir = ofs->workdir;
struct dentry *index = NULL;
- struct inode *dir = indexdir->d_inode;
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
LIST_HEAD(list);
struct ovl_cache_entry *p;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_plain,
+ .ctx.count = INT_MAX,
.list = &list,
};
@@ -1185,7 +1192,6 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
if (err)
goto out;
- inode_lock_nested(dir, I_MUTEX_PARENT);
list_for_each_entry(p, &list, l_node) {
if (p->name[0] == '.') {
if (p->len == 1)
@@ -1193,7 +1199,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
if (p->len == 2 && p->name[1] == '.')
continue;
}
- index = ovl_lookup_upper(ofs, p->name, indexdir, p->len);
+ index = ovl_lookup_upper_unlocked(ofs, p->name, indexdir, p->len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
index = NULL;
@@ -1201,7 +1207,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
}
/* Cleanup leftover from index create/cleanup attempt */
if (index->d_name.name[0] == '#') {
- err = ovl_workdir_cleanup(ofs, dir, path.mnt, index, 1);
+ err = ovl_workdir_cleanup(ofs, indexdir, path.mnt, index, 1);
if (err)
break;
goto next;
@@ -1211,7 +1217,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
goto next;
} else if (err == -ESTALE) {
/* Cleanup stale index entries */
- err = ovl_cleanup(ofs, dir, index);
+ err = ovl_cleanup(ofs, indexdir, index);
} else if (err != -ENOENT) {
/*
* Abort mount to avoid corrupting the index if
@@ -1224,10 +1230,10 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs)
* Whiteout orphan index to block future open by
* handle after overlay nlink dropped to zero.
*/
- err = ovl_cleanup_and_whiteout(ofs, dir, index);
+ err = ovl_cleanup_and_whiteout(ofs, indexdir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(ofs, dir, index);
+ err = ovl_cleanup(ofs, indexdir, index);
}
if (err)
@@ -1238,7 +1244,6 @@ next:
index = NULL;
}
dput(index);
- inode_unlock(dir);
out:
ovl_cache_free(&list);
if (err)
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 86ae6f6da36b..df85a76597e9 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -299,8 +299,8 @@ static struct dentry *ovl_workdir_create(struct ovl_fs *ofs,
int err;
bool retried = false;
- inode_lock_nested(dir, I_MUTEX_PARENT);
retry:
+ inode_lock_nested(dir, I_MUTEX_PARENT);
work = ovl_lookup_upper(ofs, name, ofs->workbasedir, strlen(name));
if (!IS_ERR(work)) {
@@ -311,25 +311,27 @@ retry:
if (work->d_inode) {
err = -EEXIST;
+ inode_unlock(dir);
if (retried)
goto out_dput;
if (persist)
- goto out_unlock;
+ return work;
retried = true;
- err = ovl_workdir_cleanup(ofs, dir, mnt, work, 0);
+ err = ovl_workdir_cleanup(ofs, ofs->workbasedir, mnt, work, 0);
dput(work);
- if (err == -EINVAL) {
- work = ERR_PTR(err);
- goto out_unlock;
- }
+ if (err == -EINVAL)
+ return ERR_PTR(err);
+
goto retry;
}
- err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode);
- if (err)
- goto out_dput;
+ work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode);
+ inode_unlock(dir);
+ err = PTR_ERR(work);
+ if (IS_ERR(work))
+ goto out_err;
/* Weird filesystem returning with hashed negative (kernfs)? */
err = -EINVAL;
@@ -364,11 +366,10 @@ retry:
if (err)
goto out_dput;
} else {
+ inode_unlock(dir);
err = PTR_ERR(work);
goto out_err;
}
-out_unlock:
- inode_unlock(dir);
return work;
out_dput:
@@ -376,8 +377,7 @@ out_dput:
out_err:
pr_warn("failed to create directory %s/%s (errno: %i); mounting read-only\n",
ofs->config.workdir, name, -err);
- work = NULL;
- goto out_unlock;
+ return NULL;
}
static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs,
@@ -556,37 +556,42 @@ out:
static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
{
struct dentry *workdir = ofs->workdir;
- struct inode *dir = d_inode(workdir);
struct dentry *temp;
struct dentry *dest;
struct dentry *whiteout;
struct name_snapshot name;
int err;
- inode_lock_nested(dir, I_MUTEX_PARENT);
-
temp = ovl_create_temp(ofs, workdir, OVL_CATTR(S_IFREG | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
- goto out_unlock;
+ return err;
+ err = ovl_parent_lock(workdir, temp);
+ if (err) {
+ dput(temp);
+ return err;
+ }
dest = ovl_lookup_temp(ofs, workdir);
err = PTR_ERR(dest);
if (IS_ERR(dest)) {
dput(temp);
- goto out_unlock;
+ ovl_parent_unlock(workdir);
+ return err;
}
/* Name is inline and stable - using snapshot as a copy helper */
take_dentry_name_snapshot(&name, temp);
- err = ovl_do_rename(ofs, dir, temp, dir, dest, RENAME_WHITEOUT);
+ err = ovl_do_rename(ofs, workdir, temp, workdir, dest, RENAME_WHITEOUT);
+ ovl_parent_unlock(workdir);
if (err) {
if (err == -EINVAL)
err = 0;
goto cleanup_temp;
}
- whiteout = ovl_lookup_upper(ofs, name.name.name, workdir, name.name.len);
+ whiteout = ovl_lookup_upper_unlocked(ofs, name.name.name,
+ workdir, name.name.len);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto cleanup_temp;
@@ -595,18 +600,15 @@ static int ovl_check_rename_whiteout(struct ovl_fs *ofs)
/* Best effort cleanup of whiteout and temp file */
if (err)
- ovl_cleanup(ofs, dir, whiteout);
+ ovl_cleanup(ofs, workdir, whiteout);
dput(whiteout);
cleanup_temp:
- ovl_cleanup(ofs, dir, temp);
+ ovl_cleanup(ofs, workdir, temp);
release_dentry_name_snapshot(&name);
dput(temp);
dput(dest);
-out_unlock:
- inode_unlock(dir);
-
return err;
}
@@ -620,8 +622,7 @@ static struct dentry *ovl_lookup_or_create(struct ovl_fs *ofs,
inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
child = ovl_lookup_upper(ofs, name, parent, len);
if (!IS_ERR(child) && !child->d_inode)
- child = ovl_create_real(ofs, parent->d_inode, child,
- OVL_CATTR(mode));
+ child = ovl_create_real(ofs, parent, child, OVL_CATTR(mode));
inode_unlock(parent->d_inode);
dput(parent);
@@ -1137,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
return ERR_PTR(-EINVAL);
}
+ if (ctx->nr == ctx->nr_data) {
+ pr_err("at least one non-data lowerdir is required\n");
+ return ERR_PTR(-EINVAL);
+ }
+
err = -EINVAL;
for (i = 0; i < ctx->nr; i++) {
l = &ctx->lower[i];
@@ -1305,6 +1311,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct ovl_fs_context *ctx = fc->fs_private;
+ const struct cred *old_cred = NULL;
struct dentry *root_dentry;
struct ovl_entry *oe;
struct ovl_layer *layers;
@@ -1315,13 +1322,18 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
if (WARN_ON(fc->user_ns != current_user_ns()))
goto out_err;
- sb->s_d_op = &ovl_dentry_operations;
+ set_default_d_op(sb, &ovl_dentry_operations);
err = -ENOMEM;
- ofs->creator_cred = cred = prepare_creds();
+ if (!ofs->creator_cred)
+ ofs->creator_cred = cred = prepare_creds();
+ else
+ cred = (struct cred *)ofs->creator_cred;
if (!cred)
goto out_err;
+ old_cred = ovl_override_creds(sb);
+
err = ovl_fs_params_verify(ctx, &ofs->config);
if (err)
goto out_err;
@@ -1481,11 +1493,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_root = root_dentry;
+ ovl_revert_creds(old_cred);
return 0;
out_free_oe:
ovl_free_entry(oe);
out_err:
+ /*
+ * Revert creds before calling ovl_free_fs() which will call
+ * put_cred() and put_cred() requires that the cred's that are
+ * put are not the caller's creds, i.e., current->cred.
+ */
+ if (old_cred)
+ ovl_revert_creds(old_cred);
ovl_free_fs(ofs);
sb->s_fs_info = NULL;
return err;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 0819c739cc2f..41033bac96cb 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -15,6 +15,7 @@
#include <linux/uuid.h>
#include <linux/namei.h>
#include <linux/ratelimit.h>
+#include <linux/overflow.h>
#include "overlayfs.h"
/* Get write access to upper mnt - may fail if upper sb was remounted ro */
@@ -145,9 +146,9 @@ void ovl_stack_free(struct ovl_path *stack, unsigned int n)
struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
{
- size_t size = offsetof(struct ovl_entry, __lowerstack[numlower]);
- struct ovl_entry *oe = kzalloc(size, GFP_KERNEL);
+ struct ovl_entry *oe;
+ oe = kzalloc(struct_size(oe, __lowerstack, numlower), GFP_KERNEL);
if (oe)
oe->__numlower = numlower;
@@ -205,10 +206,17 @@ bool ovl_dentry_weird(struct dentry *dentry)
if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry))
return true;
- return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
- DCACHE_MANAGE_TRANSIT |
- DCACHE_OP_HASH |
- DCACHE_OP_COMPARE);
+ if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT))
+ return true;
+
+ /*
+ * Allow filesystems that are case-folding capable but deny composing
+ * ovl stack from case-folded directories.
+ */
+ if (sb_has_encoding(dentry->d_sb))
+ return IS_CASEFOLDED(d_inode(dentry));
+
+ return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE);
}
enum ovl_path_type ovl_path_type(struct dentry *dentry)
@@ -305,7 +313,9 @@ enum ovl_path_type ovl_path_realdata(struct dentry *dentry, struct path *path)
struct dentry *ovl_dentry_upper(struct dentry *dentry)
{
- return ovl_upperdentry_dereference(OVL_I(d_inode(dentry)));
+ struct inode *inode = d_inode(dentry);
+
+ return inode ? ovl_upperdentry_dereference(OVL_I(inode)) : NULL;
}
struct dentry *ovl_dentry_lower(struct dentry *dentry)
@@ -956,7 +966,7 @@ void ovl_check_protattr(struct inode *inode, struct dentry *upper)
}
int ovl_set_protattr(struct inode *inode, struct dentry *upper,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct ovl_fs *ofs = OVL_FS(inode->i_sb);
char buf[OVL_PROTATTR_MAX];
@@ -1068,7 +1078,6 @@ static void ovl_cleanup_index(struct dentry *dentry)
{
struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
- struct inode *dir = indexdir->d_inode;
struct dentry *lowerdentry = ovl_dentry_lower(dentry);
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *index = NULL;
@@ -1104,21 +1113,18 @@ static void ovl_cleanup_index(struct dentry *dentry)
goto out;
}
- inode_lock_nested(dir, I_MUTEX_PARENT);
- index = ovl_lookup_upper(ofs, name.name, indexdir, name.len);
+ index = ovl_lookup_upper_unlocked(ofs, name.name, indexdir, name.len);
err = PTR_ERR(index);
if (IS_ERR(index)) {
index = NULL;
} else if (ovl_index_all(dentry->d_sb)) {
/* Whiteout orphan index to block future open by handle */
err = ovl_cleanup_and_whiteout(OVL_FS(dentry->d_sb),
- dir, index);
+ indexdir, index);
} else {
/* Cleanup orphan index entries */
- err = ovl_cleanup(ofs, dir, index);
+ err = ovl_cleanup(ofs, indexdir, index);
}
-
- inode_unlock(dir);
if (err)
goto fail;
@@ -1217,20 +1223,21 @@ void ovl_nlink_end(struct dentry *dentry)
ovl_inode_unlock(inode);
}
-int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
+int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *work,
+ struct dentry *upperdir, struct dentry *upper)
{
struct dentry *trap;
- /* Workdir should not be the same as upperdir */
- if (workdir == upperdir)
- goto err;
-
/* Workdir should not be subdir of upperdir and vice versa */
trap = lock_rename(workdir, upperdir);
if (IS_ERR(trap))
goto err;
if (trap)
goto err_unlock;
+ if (work && work->d_parent != workdir)
+ goto err_unlock;
+ if (upper && upper->d_parent != upperdir)
+ goto err_unlock;
return 0;
@@ -1541,3 +1548,14 @@ void ovl_copyattr(struct inode *inode)
i_size_write(inode, i_size_read(realinode));
spin_unlock(&inode->i_lock);
}
+
+int ovl_parent_lock(struct dentry *parent, struct dentry *child)
+{
+ inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
+ if (!child ||
+ (!d_unhashed(child) && child->d_parent == parent))
+ return 0;
+
+ inode_unlock(parent->d_inode);
+ return -EINVAL;
+}
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 049352f973de..108e7527f837 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -20,10 +20,41 @@
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
+#include <linux/coredump.h>
+#include <linux/xattr.h>
#include "internal.h"
#include "mount.h"
+#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)
+
+static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
+static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;
+
+static struct path pidfs_root_path = {};
+
+void pidfs_get_root(struct path *path)
+{
+ *path = pidfs_root_path;
+ path_get(path);
+}
+
+/*
+ * Stashes information that userspace needs to access even after the
+ * process has been reaped.
+ */
+struct pidfs_exit_info {
+ __u64 cgroupid;
+ __s32 exit_code;
+ __u32 coredump_mask;
+};
+
+struct pidfs_attr {
+ struct simple_xattrs *xattrs;
+ struct pidfs_exit_info __pei;
+ struct pidfs_exit_info *exit_info;
+};
+
static struct rb_root pidfs_ino_tree = RB_ROOT;
#if BITS_PER_LONG == 32
@@ -101,6 +132,7 @@ void pidfs_add_pid(struct pid *pid)
pid->ino = pidfs_ino_nr;
pid->stashed = NULL;
+ pid->attr = NULL;
pidfs_ino_nr++;
write_seqcount_begin(&pidmap_lock_seq);
@@ -115,6 +147,33 @@ void pidfs_remove_pid(struct pid *pid)
write_seqcount_end(&pidmap_lock_seq);
}
+void pidfs_free_pid(struct pid *pid)
+{
+ struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
+ struct simple_xattrs *xattrs __free(kfree) = NULL;
+
+ /*
+ * Any dentry must've been wiped from the pid by now.
+ * Otherwise there's a reference count bug.
+ */
+ VFS_WARN_ON_ONCE(pid->stashed);
+
+ /*
+ * This if an error occurred during e.g., task creation that
+ * causes us to never go through the exit path.
+ */
+ if (unlikely(!attr))
+ return;
+
+ /* This never had a pidfd created. */
+ if (IS_ERR(attr))
+ return;
+
+ xattrs = no_free_ptr(attr->xattrs);
+ if (xattrs)
+ simple_xattrs_free(xattrs, NULL);
+}
+
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
@@ -188,36 +247,64 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct pid *pid = pidfd_pid(file);
- bool thread = file->f_flags & PIDFD_THREAD;
struct task_struct *task;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
/*
- * Depending on PIDFD_THREAD, inform pollers when the thread
- * or the whole thread-group exits.
+ * Don't wake waiters if the thread-group leader exited
+ * prematurely. They either get notified when the last subthread
+ * exits or not at all if one of the remaining subthreads execs
+ * and assumes the struct pid of the old thread-group leader.
*/
guard(rcu)();
task = pid_task(pid, PIDTYPE_PID);
if (!task)
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
- else if (task->exit_state && (thread || thread_group_empty(task)))
+ else if (task->exit_state && !delay_group_leader(task))
poll_flags = EPOLLIN | EPOLLRDNORM;
return poll_flags;
}
-static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+static inline bool pid_in_current_pidns(const struct pid *pid)
+{
+ const struct pid_namespace *ns = task_active_pid_ns(current);
+
+ if (ns->level <= pid->level)
+ return pid->numbers[ns->level].ns == ns;
+
+ return false;
+}
+
+static __u32 pidfs_coredump_mask(unsigned long mm_flags)
+{
+ switch (__get_dumpable(mm_flags)) {
+ case SUID_DUMP_USER:
+ return PIDFD_COREDUMP_USER;
+ case SUID_DUMP_ROOT:
+ return PIDFD_COREDUMP_ROOT;
+ case SUID_DUMP_DISABLE:
+ return PIDFD_COREDUMP_SKIP;
+ default:
+ WARN_ON_ONCE(true);
+ }
+
+ return 0;
+}
+
+static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ struct task_struct *task __free(put_task) = NULL;
+ struct pid *pid = pidfd_pid(file);
size_t usize = _IOC_SIZE(cmd);
struct pidfd_info kinfo = {};
+ struct pidfs_exit_info *exit_info;
struct user_namespace *user_ns;
+ struct pidfs_attr *attr;
const struct cred *c;
__u64 mask;
-#ifdef CONFIG_CGROUPS
- struct cgroup *cgrp;
-#endif
if (!uinfo)
return -EINVAL;
@@ -227,10 +314,54 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
return -EFAULT;
+ /*
+ * Restrict information retrieval to tasks within the caller's pid
+ * namespace hierarchy.
+ */
+ if (!pid_in_current_pidns(pid))
+ return -ESRCH;
+
+ attr = READ_ONCE(pid->attr);
+ if (mask & PIDFD_INFO_EXIT) {
+ exit_info = READ_ONCE(attr->exit_info);
+ if (exit_info) {
+ kinfo.mask |= PIDFD_INFO_EXIT;
+#ifdef CONFIG_CGROUPS
+ kinfo.cgroupid = exit_info->cgroupid;
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+#endif
+ kinfo.exit_code = exit_info->exit_code;
+ }
+ }
+
+ if (mask & PIDFD_INFO_COREDUMP) {
+ kinfo.mask |= PIDFD_INFO_COREDUMP;
+ kinfo.coredump_mask = READ_ONCE(attr->__pei.coredump_mask);
+ }
+
+ task = get_pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ /*
+ * If the task has already been reaped, only exit
+ * information is available
+ */
+ if (!(mask & PIDFD_INFO_EXIT))
+ return -ESRCH;
+
+ goto copy_out;
+ }
+
c = get_task_cred(task);
if (!c)
return -ESRCH;
+ if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
+ task_lock(task);
+ if (task->mm)
+ kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
+ task_unlock(task);
+ }
+
/* Unconditionally return identifiers and credentials, the rest only on request */
user_ns = current_user_ns();
@@ -246,11 +377,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
put_cred(c);
#ifdef CONFIG_CGROUPS
- rcu_read_lock();
- cgrp = task_dfl_cgroup(task);
- kinfo.cgroupid = cgroup_id(cgrp);
- kinfo.mask |= PIDFD_INFO_CGROUPID;
- rcu_read_unlock();
+ if (!kinfo.cgroupid) {
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+ }
#endif
/*
@@ -267,19 +402,17 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long
kinfo.pid = task_pid_vnr(task);
kinfo.mask |= PIDFD_INFO_PID;
- if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+ if (kinfo.pid == 0 || kinfo.tgid == 0)
return -ESRCH;
+copy_out:
/*
* If userspace and the kernel have the same struct size it can just
* be copied. If userspace provides an older struct, only the bits that
* userspace knows about will be copied. If userspace provides a new
* struct, only the bits that the kernel knows about will be copied.
*/
- if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
- return -EFAULT;
-
- return 0;
+ return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}
static bool pidfs_ioctl_valid(unsigned int cmd)
@@ -287,7 +420,6 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
switch (cmd) {
case FS_IOC_GETVERSION:
case PIDFD_GET_CGROUP_NAMESPACE:
- case PIDFD_GET_INFO:
case PIDFD_GET_IPC_NAMESPACE:
case PIDFD_GET_MNT_NAMESPACE:
case PIDFD_GET_NET_NAMESPACE:
@@ -300,6 +432,17 @@ static bool pidfs_ioctl_valid(unsigned int cmd)
return true;
}
+ /* Extensible ioctls require some more careful checks. */
+ switch (_IOC_NR(cmd)) {
+ case _IOC_NR(PIDFD_GET_INFO):
+ /*
+ * Try to prevent performing a pidfd ioctl when someone
+ * erronously mistook the file descriptor for a pidfd.
+ * This is not perfect but will catch most cases.
+ */
+ return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO));
+ }
+
return false;
}
@@ -307,7 +450,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
struct nsproxy *nsp __free(put_nsproxy) = NULL;
- struct pid *pid = pidfd_pid(file);
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
@@ -322,13 +464,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return put_user(file_inode(file)->i_generation, argp);
}
- task = get_pid_task(pid, PIDTYPE_PID);
- if (!task)
- return -ESRCH;
-
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
- return pidfd_info(task, cmd, arg);
+ return pidfd_info(file, cmd, arg);
+
+ task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
+ if (!task)
+ return -ESRCH;
if (arg)
return -EINVAL;
@@ -440,6 +582,93 @@ struct pid *pidfd_pid(const struct file *file)
return file_inode(file)->i_private;
}
+/*
+ * We're called from release_task(). We know there's at least one
+ * reference to struct pid being held that won't be released until the
+ * task has been reaped which cannot happen until we're out of
+ * release_task().
+ *
+ * If this struct pid has at least once been referred to by a pidfd then
+ * pid->attr will be allocated. If not we mark the struct pid as dead so
+ * anyone who is trying to register it with pidfs will fail to do so.
+ * Otherwise we would hand out pidfs for reaped tasks without having
+ * exit information available.
+ *
+ * Worst case is that we've filled in the info and the pid gets freed
+ * right away in free_pid() when no one holds a pidfd anymore. Since
+ * pidfs_exit() currently is placed after exit_task_work() we know that
+ * it cannot be us aka the exiting task holding a pidfd to itself.
+ */
+void pidfs_exit(struct task_struct *tsk)
+{
+ struct pid *pid = task_pid(tsk);
+ struct pidfs_attr *attr;
+ struct pidfs_exit_info *exit_info;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
+
+ might_sleep();
+
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+ attr = pid->attr;
+ if (!attr) {
+ /*
+ * No one ever held a pidfd for this struct pid.
+ * Mark it as dead so no one can add a pidfs
+ * entry anymore. We're about to be reaped and
+ * so no exit information would be available.
+ */
+ pid->attr = PIDFS_PID_DEAD;
+ return;
+ }
+
+ /*
+ * If @pid->attr is set someone might still legitimately hold a
+ * pidfd to @pid or someone might concurrently still be getting
+ * a reference to an already stashed dentry from @pid->stashed.
+ * So defer cleaning @pid->attr until the last reference to @pid
+ * is put
+ */
+
+ exit_info = &attr->__pei;
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(tsk);
+ exit_info->cgroupid = cgroup_id(cgrp);
+ rcu_read_unlock();
+#endif
+ exit_info->exit_code = tsk->exit_code;
+
+ /* Ensure that PIDFD_GET_INFO sees either all or nothing. */
+ smp_store_release(&attr->exit_info, &attr->__pei);
+}
+
+#ifdef CONFIG_COREDUMP
+void pidfs_coredump(const struct coredump_params *cprm)
+{
+ struct pid *pid = cprm->pid;
+ struct pidfs_exit_info *exit_info;
+ struct pidfs_attr *attr;
+ __u32 coredump_mask = 0;
+
+ attr = READ_ONCE(pid->attr);
+
+ VFS_WARN_ON_ONCE(!attr);
+ VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);
+
+ exit_info = &attr->__pei;
+ /* Note how we were coredumped. */
+ coredump_mask = pidfs_coredump_mask(cprm->mm_flags);
+ /* Note that we actually did coredump. */
+ coredump_mask |= PIDFD_COREDUMPED;
+ /* If coredumping is set to skip we should never end up here. */
+ VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
+ smp_store_release(&exit_info->coredump_mask, coredump_mask);
+}
+#endif
+
static struct vfsmount *pidfs_mnt __ro_after_init;
/*
@@ -450,41 +679,34 @@ static struct vfsmount *pidfs_mnt __ro_after_init;
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
- return -EOPNOTSUPP;
+ return anon_inode_setattr(idmap, dentry, attr);
}
-
-/*
- * User space expects pidfs inodes to have no file type in st_mode.
- *
- * In particular, 'lsof' has this legacy logic:
- *
- * type = s->st_mode & S_IFMT;
- * switch (type) {
- * ...
- * case 0:
- * if (!strcmp(p, "anon_inode"))
- * Lf->ntype = Ntype = N_ANON_INODE;
- *
- * to detect our old anon_inode logic.
- *
- * Rather than mess with our internal sane inode data, just fix it
- * up here in getattr() by masking off the format bits.
- */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask,
unsigned int query_flags)
{
- struct inode *inode = d_inode(path->dentry);
+ return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
+}
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- stat->mode &= ~S_IFMT;
- return 0;
+static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ struct simple_xattrs *xattrs;
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs)
+ return 0;
+
+ return simple_xattr_list(inode, xattrs, buf, size);
}
static const struct inode_operations pidfs_inode_operations = {
- .getattr = pidfs_getattr,
- .setattr = pidfs_setattr,
+ .getattr = pidfs_getattr,
+ .setattr = pidfs_setattr,
+ .listxattr = pidfs_listxattr,
};
static void pidfs_evict_inode(struct inode *inode)
@@ -511,7 +733,6 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
}
const struct dentry_operations pidfs_dentry_operations = {
- .d_delete = always_delete_dentry,
.d_dname = pidfs_dname,
.d_prune = stashed_dentry_prune,
};
@@ -598,6 +819,8 @@ static struct dentry *pidfs_fh_to_dentry(struct super_block *sb,
if (ret < 0)
return ERR_PTR(ret);
+ VFS_WARN_ON_ONCE(!pid->attr);
+
mntput(path.mnt);
return path.dentry;
}
@@ -646,7 +869,9 @@ static int pidfs_init_inode(struct inode *inode, void *data)
const struct pid *pid = data;
inode->i_private = data;
- inode->i_flags |= S_PRIVATE;
+ inode->i_flags |= S_PRIVATE | S_ANON_INODE;
+ /* We allow to set xattrs. */
+ inode->i_flags &= ~S_IMMUTABLE;
inode->i_mode |= S_IRWXU;
inode->i_op = &pidfs_inode_operations;
inode->i_fop = &pidfs_file_operations;
@@ -661,9 +886,127 @@ static void pidfs_put_data(void *data)
put_pid(pid);
}
+/**
+ * pidfs_register_pid - register a struct pid in pidfs
+ * @pid: pid to pin
+ *
+ * Register a struct pid in pidfs.
+ *
+ * Return: On success zero, on error a negative error code is returned.
+ */
+int pidfs_register_pid(struct pid *pid)
+{
+ struct pidfs_attr *new_attr __free(kfree) = NULL;
+ struct pidfs_attr *attr;
+
+ might_sleep();
+
+ if (!pid)
+ return 0;
+
+ attr = READ_ONCE(pid->attr);
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (attr)
+ return 0;
+
+ new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
+ if (!new_attr)
+ return -ENOMEM;
+
+ /* Synchronize with pidfs_exit(). */
+ guard(spinlock_irq)(&pid->wait_pidfd.lock);
+
+ attr = pid->attr;
+ if (unlikely(attr == PIDFS_PID_DEAD))
+ return PTR_ERR(PIDFS_PID_DEAD);
+ if (unlikely(attr))
+ return 0;
+
+ pid->attr = no_free_ptr(new_attr);
+ return 0;
+}
+
+static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
+ struct dentry *dentry)
+{
+ int ret;
+ struct pid *pid = d_inode(dentry)->i_private;
+
+ VFS_WARN_ON_ONCE(stashed != &pid->stashed);
+
+ ret = pidfs_register_pid(pid);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return stash_dentry(stashed, dentry);
+}
+
static const struct stashed_operations pidfs_stashed_ops = {
- .init_inode = pidfs_init_inode,
- .put_data = pidfs_put_data,
+ .stash_dentry = pidfs_stash_dentry,
+ .init_inode = pidfs_init_inode,
+ .put_data = pidfs_put_data,
+};
+
+static int pidfs_xattr_get(const struct xattr_handler *handler,
+ struct dentry *unused, struct inode *inode,
+ const char *suffix, void *value, size_t size)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ const char *name;
+ struct simple_xattrs *xattrs;
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs)
+ return 0;
+
+ name = xattr_full_name(handler, suffix);
+ return simple_xattr_get(xattrs, name, value, size);
+}
+
+static int pidfs_xattr_set(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap, struct dentry *unused,
+ struct inode *inode, const char *suffix,
+ const void *value, size_t size, int flags)
+{
+ struct pid *pid = inode->i_private;
+ struct pidfs_attr *attr = pid->attr;
+ const char *name;
+ struct simple_xattrs *xattrs;
+ struct simple_xattr *old_xattr;
+
+ /* Ensure we're the only one to set @attr->xattrs. */
+ WARN_ON_ONCE(!inode_is_locked(inode));
+
+ xattrs = READ_ONCE(attr->xattrs);
+ if (!xattrs) {
+ xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
+ if (!xattrs)
+ return -ENOMEM;
+
+ simple_xattrs_init(xattrs);
+ smp_store_release(&pid->attr->xattrs, xattrs);
+ }
+
+ name = xattr_full_name(handler, suffix);
+ old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
+ if (IS_ERR(old_xattr))
+ return PTR_ERR(old_xattr);
+
+ simple_xattr_free(old_xattr);
+ return 0;
+}
+
+static const struct xattr_handler pidfs_trusted_xattr_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = pidfs_xattr_get,
+ .set = pidfs_xattr_set,
+};
+
+static const struct xattr_handler *const pidfs_xattr_handlers[] = {
+ &pidfs_trusted_xattr_handler,
+ NULL
};
static int pidfs_init_fs_context(struct fs_context *fc)
@@ -674,9 +1017,12 @@ static int pidfs_init_fs_context(struct fs_context *fc)
if (!ctx)
return -ENOMEM;
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
ctx->ops = &pidfs_sops;
ctx->eops = &pidfs_export_operations;
ctx->dops = &pidfs_dentry_operations;
+ ctx->xattr = pidfs_xattr_handlers;
fc->s_fs_info = (void *)&pidfs_stashed_ops;
return 0;
}
@@ -689,23 +1035,48 @@ static struct file_system_type pidfs_type = {
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{
-
struct file *pidfd_file;
- struct path path;
+ struct path path __free(path_put) = {};
int ret;
+ /*
+ * Ensure that PIDFD_STALE can be passed as a flag without
+ * overloading other uapi pidfd flags.
+ */
+ BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
+ BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
+
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
return ERR_PTR(ret);
+ VFS_WARN_ON_ONCE(!pid->attr);
+
+ flags &= ~PIDFD_STALE;
+ flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred());
- path_put(&path);
+ /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+ if (!IS_ERR(pidfd_file))
+ pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
return pidfd_file;
}
void __init pidfs_init(void)
{
+ pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
+ pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
+ sizeof(struct simple_xattrs), 0,
+ (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
+ SLAB_ACCOUNT | SLAB_PANIC), NULL);
+
pidfs_mnt = kern_mount(&pidfs_type);
if (IS_ERR(pidfs_mnt))
panic("Failed to mount pidfs pseudo filesystem");
+
+ pidfs_root_path.mnt = pidfs_mnt;
+ pidfs_root_path.dentry = pidfs_mnt->mnt_root;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index 94b59045ab44..731622d0738d 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,6 +26,7 @@
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>
+#include <linux/sort.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
@@ -76,8 +77,6 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
* -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
*/
-#define cmp_int(l, r) ((l > r) - (l < r))
-
#ifdef CONFIG_PROVE_LOCKING
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
const struct lockdep_map *b)
@@ -112,20 +111,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
pipe_lock(pipe2);
}
+static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe)
+{
+ for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (pipe->tmp_page[i]) {
+ struct page *page = pipe->tmp_page[i];
+ pipe->tmp_page[i] = NULL;
+ return page;
+ }
+ }
+
+ return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
+}
+
+static void anon_pipe_put_page(struct pipe_inode_info *pipe,
+ struct page *page)
+{
+ if (page_count(page) == 1) {
+ for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (!pipe->tmp_page[i]) {
+ pipe->tmp_page[i] = page;
+ return;
+ }
+ }
+ }
+
+ put_page(page);
+}
+
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct page *page = buf->page;
- /*
- * If nobody else uses this page, and we don't already have a
- * temporary page, let's keep track of it as a one-deep
- * allocation cache. (Otherwise just release our reference to it)
- */
- if (page_count(page) == 1 && !pipe->tmp_page)
- pipe->tmp_page = page;
- else
- put_page(page);
+ anon_pipe_put_page(pipe, page);
}
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
@@ -210,11 +229,10 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
- unsigned int head = READ_ONCE(pipe->head);
- unsigned int tail = READ_ONCE(pipe->tail);
+ union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
unsigned int writers = READ_ONCE(pipe->writers);
- return !pipe_empty(head, tail) || !writers;
+ return !pipe_empty(idx.head, idx.tail) || !writers;
}
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
@@ -248,7 +266,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
}
static ssize_t
-pipe_read(struct kiocb *iocb, struct iov_iter *to)
+anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
@@ -275,7 +293,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
/* Read ->head with a barrier vs post_one_notification() */
unsigned int head = smp_load_acquire(&pipe->head);
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
#ifdef CONFIG_WATCH_QUEUE
if (pipe->note_loss) {
@@ -302,7 +319,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
#endif
if (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t chars = buf->len;
size_t written;
int error;
@@ -360,29 +377,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
break;
}
mutex_unlock(&pipe->mutex);
-
/*
* We only get here if we didn't actually read anything.
*
- * However, we could have seen (and removed) a zero-sized
- * pipe buffer, and might have made space in the buffers
- * that way.
- *
- * You can't make zero-sized pipe buffers by doing an empty
- * write (not even in packet mode), but they can happen if
- * the writer gets an EFAULT when trying to fill a buffer
- * that already got allocated and inserted in the buffer
- * array.
- *
- * So we still need to wake up any pending writers in the
- * _very_ unlikely case that the pipe was full, but we got
- * no data.
- */
- if (unlikely(wake_writer))
- wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
-
- /*
* But because we didn't read anything, at this point we can
* just return directly with -ERESTARTSYS if we're interrupted,
* since we've done any required wakeups and there's no need
@@ -391,11 +388,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
return -ERESTARTSYS;
- wake_writer = false;
wake_next_reader = true;
mutex_lock(&pipe->mutex);
}
- if (pipe_empty(pipe->head, pipe->tail))
+ if (pipe_is_empty(pipe))
wake_next_reader = false;
mutex_unlock(&pipe->mutex);
@@ -404,8 +400,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
if (wake_next_reader)
wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+ return ret;
+}
+
+static ssize_t
+fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ int ret = anon_pipe_read(iocb, to);
if (ret > 0)
- file_accessed(filp);
+ file_accessed(iocb->ki_filp);
return ret;
}
@@ -417,16 +420,15 @@ static inline int is_packetized(struct file *file)
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
- unsigned int head = READ_ONCE(pipe->head);
- unsigned int tail = READ_ONCE(pipe->tail);
+ union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
unsigned int max_usage = READ_ONCE(pipe->max_usage);
- return !pipe_full(head, tail, max_usage) ||
+ return !pipe_full(idx.head, idx.tail, max_usage) ||
!READ_ONCE(pipe->readers);
}
static ssize_t
-pipe_write(struct kiocb *iocb, struct iov_iter *from)
+anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
@@ -473,8 +475,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
was_empty = pipe_empty(head, pipe->tail);
chars = total_len & (PAGE_SIZE-1);
if (chars && !was_empty) {
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
int offset = buf->offset + buf->len;
if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
@@ -505,54 +506,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
head = pipe->head;
if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
- unsigned int mask = pipe->ring_size - 1;
struct pipe_buffer *buf;
- struct page *page = pipe->tmp_page;
+ struct page *page;
int copied;
- if (!page) {
- page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
- if (unlikely(!page)) {
- ret = ret ? : -ENOMEM;
- break;
- }
- pipe->tmp_page = page;
+ page = anon_pipe_get_page(pipe);
+ if (unlikely(!page)) {
+ if (!ret)
+ ret = -ENOMEM;
+ break;
}
- /* Allocate a slot in the ring in advance and attach an
- * empty buffer. If we fault or otherwise fail to use
- * it, either the reader will consume it or it'll still
- * be there for the next write.
- */
- pipe->head = head + 1;
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+ anon_pipe_put_page(pipe, page);
+ if (!ret)
+ ret = -EFAULT;
+ break;
+ }
+ pipe->head = head + 1;
/* Insert it into the buffer array */
- buf = &pipe->bufs[head & mask];
+ buf = pipe_buf(pipe, head);
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = 0;
if (is_packetized(filp))
buf->flags = PIPE_BUF_FLAG_PACKET;
else
buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
- pipe->tmp_page = NULL;
- copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
- if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
- if (!ret)
- ret = -EFAULT;
- break;
- }
- ret += copied;
buf->len = copied;
+ ret += copied;
if (!iov_iter_count(from))
break;
- }
- if (!pipe_full(head, pipe->tail, pipe->max_usage))
continue;
+ }
/* Wait for buffer space to become available. */
if ((filp->f_flags & O_NONBLOCK) ||
@@ -579,11 +570,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
mutex_lock(&pipe->mutex);
- was_empty = pipe_empty(pipe->head, pipe->tail);
+ was_empty = pipe_is_empty(pipe);
wake_next_writer = true;
}
out:
- if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (pipe_is_full(pipe))
wake_next_writer = false;
mutex_unlock(&pipe->mutex);
@@ -604,11 +595,21 @@ out:
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
if (wake_next_writer)
wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
- if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
- int err = file_update_time(filp);
- if (err)
- ret = err;
- sb_end_write(file_inode(filp)->i_sb);
+ return ret;
+}
+
+static ssize_t
+fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ int ret = anon_pipe_write(iocb, from);
+ if (ret > 0) {
+ struct file *filp = iocb->ki_filp;
+ if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
+ int err = file_update_time(filp);
+ if (err)
+ ret = err;
+ sb_end_write(file_inode(filp)->i_sb);
+ }
}
return ret;
}
@@ -616,7 +617,7 @@ out:
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct pipe_inode_info *pipe = filp->private_data;
- unsigned int count, head, tail, mask;
+ unsigned int count, head, tail;
switch (cmd) {
case FIONREAD:
@@ -624,10 +625,9 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
count = 0;
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
- while (tail != head) {
- count += pipe->bufs[tail & mask].len;
+ while (!pipe_empty(head, tail)) {
+ count += pipe_buf(pipe, tail)->len;
tail++;
}
mutex_unlock(&pipe->mutex);
@@ -659,7 +659,7 @@ pipe_poll(struct file *filp, poll_table *wait)
{
__poll_t mask;
struct pipe_inode_info *pipe = filp->private_data;
- unsigned int head, tail;
+ union pipe_index idx;
/* Epoll has some historical nasty semantics, this enables them */
WRITE_ONCE(pipe->poll_usage, true);
@@ -680,19 +680,18 @@ pipe_poll(struct file *filp, poll_table *wait)
* if something changes and you got it wrong, the poll
* table entry will wake you up and fix it.
*/
- head = READ_ONCE(pipe->head);
- tail = READ_ONCE(pipe->tail);
+ idx.head_tail = READ_ONCE(pipe->head_tail);
mask = 0;
if (filp->f_mode & FMODE_READ) {
- if (!pipe_empty(head, tail))
+ if (!pipe_empty(idx.head, idx.tail))
mask |= EPOLLIN | EPOLLRDNORM;
if (!pipe->writers && filp->f_pipe != pipe->w_counter)
mask |= EPOLLHUP;
}
if (filp->f_mode & FMODE_WRITE) {
- if (!pipe_full(head, tail, pipe->max_usage))
+ if (!pipe_full(idx.head, idx.tail, pipe->max_usage))
mask |= EPOLLOUT | EPOLLWRNORM;
/*
* Most Unices do not set EPOLLERR for FIFOs but on Linux they
@@ -857,8 +856,10 @@ void free_pipe_info(struct pipe_inode_info *pipe)
if (pipe->watch_queue)
put_watch_queue(pipe->watch_queue);
#endif
- if (pipe->tmp_page)
- __free_page(pipe->tmp_page);
+ for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) {
+ if (pipe->tmp_page[i])
+ __free_page(pipe->tmp_page[i]);
+ }
kfree(pipe->bufs);
kfree(pipe);
}
@@ -878,6 +879,8 @@ static const struct dentry_operations pipefs_dentry_operations = {
.d_dname = pipefs_dname,
};
+static const struct file_operations pipeanon_fops;
+
static struct inode * get_pipe_inode(void)
{
struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
@@ -895,7 +898,7 @@ static struct inode * get_pipe_inode(void)
inode->i_pipe = pipe;
pipe->files = 2;
pipe->readers = pipe->writers = 1;
- inode->i_fop = &pipefifo_fops;
+ inode->i_fop = &pipeanon_fops;
/*
* Mark the inode dirty from the very beginning,
@@ -938,7 +941,7 @@ int create_pipe_files(struct file **res, int flags)
f = alloc_file_pseudo(inode, pipe_mnt, "",
O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
- &pipefifo_fops);
+ &pipeanon_fops);
if (IS_ERR(f)) {
free_pipe_info(inode->i_pipe);
iput(inode);
@@ -949,7 +952,7 @@ int create_pipe_files(struct file **res, int flags)
f->f_pipe = 0;
res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
- &pipefifo_fops);
+ &pipeanon_fops);
if (IS_ERR(res[0])) {
put_pipe_info(inode, inode->i_pipe);
fput(f);
@@ -960,6 +963,17 @@ int create_pipe_files(struct file **res, int flags)
res[1] = f;
stream_open(inode, res[0]);
stream_open(inode, res[1]);
+
+ /* pipe groks IOCB_NOWAIT */
+ res[0]->f_mode |= FMODE_NOWAIT;
+ res[1]->f_mode |= FMODE_NOWAIT;
+
+ /*
+ * Disable permission and pre-content events, but enable legacy
+ * inotify events for legacy users.
+ */
+ file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM);
+ file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM);
return 0;
}
@@ -988,9 +1002,6 @@ static int __do_pipe_flags(int *fd, struct file **files, int flags)
audit_fd_pair(fdr, fdw);
fd[0] = fdr;
fd[1] = fdw;
- /* pipe groks IOCB_NOWAIT */
- files[0]->f_mode |= FMODE_NOWAIT;
- files[1]->f_mode |= FMODE_NOWAIT;
return 0;
err_fdr:
@@ -1107,8 +1118,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe)
static int fifo_open(struct inode *inode, struct file *filp)
{
+ bool is_pipe = inode->i_fop == &pipeanon_fops;
struct pipe_inode_info *pipe;
- bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
int ret;
filp->f_pipe = 0;
@@ -1232,8 +1243,19 @@ err:
const struct file_operations pipefifo_fops = {
.open = fifo_open,
- .read_iter = pipe_read,
- .write_iter = pipe_write,
+ .read_iter = fifo_pipe_read,
+ .write_iter = fifo_pipe_write,
+ .poll = pipe_poll,
+ .unlocked_ioctl = pipe_ioctl,
+ .release = pipe_release,
+ .fasync = pipe_fasync,
+ .splice_write = iter_file_splice_write,
+};
+
+static const struct file_operations pipeanon_fops = {
+ .open = fifo_open,
+ .read_iter = anon_pipe_read,
+ .write_iter = anon_pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
@@ -1269,6 +1291,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
+ /* nr_slots larger than limits of pipe->{head,tail} */
+ if (unlikely(nr_slots > (pipe_index_t)-1u))
+ return -EINVAL;
+
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
@@ -1388,7 +1414,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
struct pipe_inode_info *pipe = file->private_data;
- if (file->f_op != &pipefifo_fops || !pipe)
+ if (!pipe)
+ return NULL;
+ if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops)
return NULL;
if (for_splice && pipe_has_watch_queue(pipe))
return NULL;
diff --git a/fs/pnode.c b/fs/pnode.c
index ef048f008bdd..6f7d02f3fa98 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -21,17 +21,12 @@ static inline struct mount *next_peer(struct mount *p)
static inline struct mount *first_slave(struct mount *p)
{
- return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
-}
-
-static inline struct mount *last_slave(struct mount *p)
-{
- return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
+ return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave);
}
static inline struct mount *next_slave(struct mount *p)
{
- return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
+ return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave);
}
static struct mount *get_peer_under_root(struct mount *mnt,
@@ -70,69 +65,91 @@ int get_dominating_id(struct mount *mnt, const struct path *root)
return 0;
}
-static int do_make_slave(struct mount *mnt)
+static inline bool will_be_unmounted(struct mount *m)
{
- struct mount *master, *slave_mnt;
+ return m->mnt.mnt_flags & MNT_UMOUNT;
+}
- if (list_empty(&mnt->mnt_share)) {
- if (IS_MNT_SHARED(mnt)) {
- mnt_release_group_id(mnt);
- CLEAR_MNT_SHARED(mnt);
- }
- master = mnt->mnt_master;
- if (!master) {
- struct list_head *p = &mnt->mnt_slave_list;
- while (!list_empty(p)) {
- slave_mnt = list_first_entry(p,
- struct mount, mnt_slave);
- list_del_init(&slave_mnt->mnt_slave);
- slave_mnt->mnt_master = NULL;
- }
- return 0;
- }
- } else {
+static struct mount *propagation_source(struct mount *mnt)
+{
+ do {
struct mount *m;
- /*
- * slave 'mnt' to a peer mount that has the
- * same root dentry. If none is available then
- * slave it to anything that is available.
- */
- for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) {
- if (m->mnt.mnt_root == mnt->mnt.mnt_root) {
- master = m;
- break;
- }
+ for (m = next_peer(mnt); m != mnt; m = next_peer(m)) {
+ if (!will_be_unmounted(m))
+ return m;
}
- list_del_init(&mnt->mnt_share);
- mnt->mnt_group_id = 0;
- CLEAR_MNT_SHARED(mnt);
+ mnt = mnt->mnt_master;
+ } while (mnt && will_be_unmounted(mnt));
+ return mnt;
+}
+
+static void transfer_propagation(struct mount *mnt, struct mount *to)
+{
+ struct hlist_node *p = NULL, *n;
+ struct mount *m;
+
+ hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) {
+ m->mnt_master = to;
+ if (!to)
+ hlist_del_init(&m->mnt_slave);
+ else
+ p = &m->mnt_slave;
}
- list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
- slave_mnt->mnt_master = master;
- list_move(&mnt->mnt_slave, &master->mnt_slave_list);
- list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
- INIT_LIST_HEAD(&mnt->mnt_slave_list);
- mnt->mnt_master = master;
- return 0;
+ if (p)
+ hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list);
}
/*
- * vfsmount lock must be held for write
+ * EXCL[namespace_sem]
*/
void change_mnt_propagation(struct mount *mnt, int type)
{
+ struct mount *m = mnt->mnt_master;
+
if (type == MS_SHARED) {
set_mnt_shared(mnt);
return;
}
- do_make_slave(mnt);
- if (type != MS_SLAVE) {
- list_del_init(&mnt->mnt_slave);
+ if (IS_MNT_SHARED(mnt)) {
+ if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list))
+ m = propagation_source(mnt);
+ if (list_empty(&mnt->mnt_share)) {
+ mnt_release_group_id(mnt);
+ } else {
+ list_del_init(&mnt->mnt_share);
+ mnt->mnt_group_id = 0;
+ }
+ CLEAR_MNT_SHARED(mnt);
+ transfer_propagation(mnt, m);
+ }
+ hlist_del_init(&mnt->mnt_slave);
+ if (type == MS_SLAVE) {
+ mnt->mnt_master = m;
+ if (m)
+ hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list);
+ } else {
mnt->mnt_master = NULL;
if (type == MS_UNBINDABLE)
- mnt->mnt.mnt_flags |= MNT_UNBINDABLE;
+ mnt->mnt_t_flags |= T_UNBINDABLE;
else
- mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE;
+ mnt->mnt_t_flags &= ~T_UNBINDABLE;
+ }
+}
+
+static struct mount *__propagation_next(struct mount *m,
+ struct mount *origin)
+{
+ while (1) {
+ struct mount *master = m->mnt_master;
+
+ if (master == origin->mnt_master) {
+ struct mount *next = next_peer(m);
+ return (next == origin) ? NULL : next;
+ } else if (m->mnt_slave.next)
+ return next_slave(m);
+
+ /* back at master */
+ m = master;
}
}
@@ -150,34 +167,24 @@ static struct mount *propagation_next(struct mount *m,
struct mount *origin)
{
/* are there any slaves of this mount? */
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
return first_slave(m);
- while (1) {
- struct mount *master = m->mnt_master;
-
- if (master == origin->mnt_master) {
- struct mount *next = next_peer(m);
- return (next == origin) ? NULL : next;
- } else if (m->mnt_slave.next != &master->mnt_slave_list)
- return next_slave(m);
-
- /* back at master */
- m = master;
- }
+ return __propagation_next(m, origin);
}
static struct mount *skip_propagation_subtree(struct mount *m,
struct mount *origin)
{
/*
- * Advance m such that propagation_next will not return
- * the slaves of m.
+ * Advance m past everything that gets propagation from it.
*/
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
- m = last_slave(m);
+ struct mount *p = __propagation_next(m, origin);
- return m;
+ while (p && peers(m, p))
+ p = __propagation_next(p, origin);
+
+ return p;
}
static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -185,7 +192,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
while (1) {
while (1) {
struct mount *next;
- if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list))
return first_slave(m);
next = next_peer(m);
if (m->mnt_group_id == origin->mnt_group_id) {
@@ -198,7 +205,7 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
/* m is the last peer */
while (1) {
struct mount *master = m->mnt_master;
- if (m->mnt_slave.next != &master->mnt_slave_list)
+ if (m->mnt_slave.next)
return next_slave(m);
m = next_peer(master);
if (master->mnt_group_id == origin->mnt_group_id)
@@ -212,138 +219,113 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
}
}
-/* all accesses are serialized by namespace_sem */
-static struct mount *last_dest, *first_source, *last_source, *dest_master;
-static struct hlist_head *list;
-
-static inline bool peers(const struct mount *m1, const struct mount *m2)
-{
- return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
-}
-
-static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
+static bool need_secondary(struct mount *m, struct mountpoint *dest_mp)
{
- struct mount *child;
- int type;
/* skip ones added by this propagate_mnt() */
if (IS_MNT_NEW(m))
- return 0;
- /* skip if mountpoint isn't covered by it */
+ return false;
+ /* skip if mountpoint isn't visible in m */
if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root))
- return 0;
- if (peers(m, last_dest)) {
- type = CL_MAKE_SHARED;
- } else {
- struct mount *n, *p;
- bool done;
- for (n = m; ; n = p) {
- p = n->mnt_master;
- if (p == dest_master || IS_MNT_MARKED(p))
- break;
- }
- do {
- struct mount *parent = last_source->mnt_parent;
- if (peers(last_source, first_source))
- break;
- done = parent->mnt_master == p;
- if (done && peers(n, parent))
- break;
- last_source = last_source->mnt_master;
- } while (!done);
+ return false;
+ /* skip if m is in the anon_ns */
+ if (is_anon_ns(m->mnt_ns))
+ return false;
+ return true;
+}
- type = CL_SLAVE;
- /* beginning of peer group among the slaves? */
- if (IS_MNT_SHARED(m))
- type |= CL_MAKE_SHARED;
+static struct mount *find_master(struct mount *m,
+ struct mount *last_copy,
+ struct mount *original)
+{
+ struct mount *p;
+
+ // ascend until there's a copy for something with the same master
+ for (;;) {
+ p = m->mnt_master;
+ if (!p || IS_MNT_MARKED(p))
+ break;
+ m = p;
}
-
- child = copy_tree(last_source, last_source->mnt.mnt_root, type);
- if (IS_ERR(child))
- return PTR_ERR(child);
- read_seqlock_excl(&mount_lock);
- mnt_set_mountpoint(m, dest_mp, child);
- if (m->mnt_master != dest_master)
- SET_MNT_MARK(m->mnt_master);
- read_sequnlock_excl(&mount_lock);
- last_dest = m;
- last_source = child;
- hlist_add_head(&child->mnt_hash, list);
- return count_mounts(m->mnt_ns, child);
+ while (!peers(last_copy, original)) {
+ struct mount *parent = last_copy->mnt_parent;
+ if (parent->mnt_master == p) {
+ if (!peers(parent, m))
+ last_copy = last_copy->mnt_master;
+ break;
+ }
+ last_copy = last_copy->mnt_master;
+ }
+ return last_copy;
}
-/*
- * mount 'source_mnt' under the destination 'dest_mnt' at
- * dentry 'dest_dentry'. And propagate that mount to
- * all the peer and slave mounts of 'dest_mnt'.
- * Link all the new mounts into a propagation tree headed at
- * source_mnt. Also link all the new mounts using ->mnt_list
- * headed at source_mnt's ->mnt_list
+/**
+ * propagate_mnt() - create secondary copies for tree attachment
+ * @dest_mnt: destination mount.
+ * @dest_mp: destination mountpoint.
+ * @source_mnt: source mount.
+ * @tree_list: list of secondaries to be attached.
*
- * @dest_mnt: destination mount.
- * @dest_dentry: destination dentry.
- * @source_mnt: source mount.
- * @tree_list : list of heads of trees to be attached.
+ * Create secondary copies for attaching a tree with root @source_mnt
+ * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts
+ * into a propagation graph. Set mountpoints for all secondaries,
+ * link their roots into @tree_list via ->mnt_hash.
*/
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
- struct mount *source_mnt, struct hlist_head *tree_list)
+ struct mount *source_mnt, struct hlist_head *tree_list)
{
- struct mount *m, *n;
- int ret = 0;
-
- /*
- * we don't want to bother passing tons of arguments to
- * propagate_one(); everything is serialized by namespace_sem,
- * so globals will do just fine.
- */
- last_dest = dest_mnt;
- first_source = source_mnt;
- last_source = source_mnt;
- list = tree_list;
- dest_master = dest_mnt->mnt_master;
-
- /* all peers of dest_mnt, except dest_mnt itself */
- for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
- ret = propagate_one(n, dest_mp);
- if (ret)
- goto out;
- }
-
- /* all slave groups */
- for (m = next_group(dest_mnt, dest_mnt); m;
- m = next_group(m, dest_mnt)) {
- /* everything in that slave group */
- n = m;
+ struct mount *m, *n, *copy, *this;
+ int err = 0, type;
+
+ if (dest_mnt->mnt_master)
+ SET_MNT_MARK(dest_mnt->mnt_master);
+
+ /* iterate over peer groups, depth first */
+ for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) {
+ if (m == dest_mnt) { // have one for dest_mnt itself
+ copy = source_mnt;
+ type = CL_MAKE_SHARED;
+ n = next_peer(m);
+ if (n == m)
+ continue;
+ } else {
+ type = CL_SLAVE;
+ /* beginning of peer group among the slaves? */
+ if (IS_MNT_SHARED(m))
+ type |= CL_MAKE_SHARED;
+ n = m;
+ }
do {
- ret = propagate_one(n, dest_mp);
- if (ret)
- goto out;
- n = next_peer(n);
- } while (n != m);
+ if (!need_secondary(n, dest_mp))
+ continue;
+ if (type & CL_SLAVE) // first in this peer group
+ copy = find_master(n, copy, source_mnt);
+ this = copy_tree(copy, copy->mnt.mnt_root, type);
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+ break;
+ }
+ read_seqlock_excl(&mount_lock);
+ mnt_set_mountpoint(n, dest_mp, this);
+ read_sequnlock_excl(&mount_lock);
+ if (n->mnt_master)
+ SET_MNT_MARK(n->mnt_master);
+ copy = this;
+ hlist_add_head(&this->mnt_hash, tree_list);
+ err = count_mounts(n->mnt_ns, this);
+ if (err)
+ break;
+ type = CL_MAKE_SHARED;
+ } while ((n = next_peer(n)) != m);
}
-out:
- read_seqlock_excl(&mount_lock);
+
hlist_for_each_entry(n, tree_list, mnt_hash) {
m = n->mnt_parent;
- if (m->mnt_master != dest_mnt->mnt_master)
+ if (m->mnt_master)
CLEAR_MNT_MARK(m->mnt_master);
}
- read_sequnlock_excl(&mount_lock);
- return ret;
-}
-
-static struct mount *find_topper(struct mount *mnt)
-{
- /* If there is exactly one mount covering mnt completely return it. */
- struct mount *child;
-
- if (!list_is_singular(&mnt->mnt_mounts))
- return NULL;
-
- child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
- if (child->mnt_mountpoint != mnt->mnt.mnt_root)
- return NULL;
-
- return child;
+ if (dest_mnt->mnt_master)
+ CLEAR_MNT_MARK(dest_mnt->mnt_master);
+ return err;
}
/*
@@ -380,9 +362,6 @@ bool propagation_would_overmount(const struct mount *from,
if (!IS_MNT_SHARED(from))
return false;
- if (IS_MNT_NEW(to))
- return false;
-
if (to->mnt.mnt_root != mp->m_dentry)
return false;
@@ -406,12 +385,8 @@ bool propagation_would_overmount(const struct mount *from,
*/
int propagate_mount_busy(struct mount *mnt, int refcnt)
{
- struct mount *m, *child, *topper;
struct mount *parent = mnt->mnt_parent;
- if (mnt == parent)
- return do_refcount_check(mnt, refcnt);
-
/*
* quickly check if the current mount can be unmounted.
* If not, we don't have to go checking for all other
@@ -420,23 +395,27 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt))
return 1;
- for (m = propagation_next(parent, parent); m;
+ if (mnt == parent)
+ return 0;
+
+ for (struct mount *m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- int count = 1;
- child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
- if (!child)
- continue;
+ struct list_head *head;
+ struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
- /* Is there exactly one mount on the child that covers
- * it completely whose reference should be ignored?
- */
- topper = find_topper(child);
- if (topper)
- count += 1;
- else if (!list_empty(&child->mnt_mounts))
+ if (!child)
continue;
- if (do_refcount_check(child, count))
+ head = &child->mnt_mounts;
+ if (!list_empty(head)) {
+ /*
+ * a mount that covers child completely wouldn't prevent
+ * it being pulled out; any other would.
+ */
+ if (!list_is_singular(head) || !child->overmount)
+ continue;
+ }
+ if (do_refcount_check(child, 1))
return 1;
}
return 0;
@@ -462,179 +441,210 @@ void propagate_mount_unlock(struct mount *mnt)
}
}
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static inline bool is_candidate(struct mount *m)
{
- CLEAR_MNT_MARK(mnt);
- mnt->mnt.mnt_flags |= MNT_UMOUNT;
- list_del_init(&mnt->mnt_child);
- list_del_init(&mnt->mnt_umounting);
- move_from_ns(mnt, to_umount);
+ return m->mnt_t_flags & T_UMOUNT_CANDIDATE;
}
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static bool __propagate_umount(struct mount *mnt,
- struct list_head *to_umount,
- struct list_head *to_restore)
+static void umount_one(struct mount *m, struct list_head *to_umount)
{
- bool progress = false;
- struct mount *child;
-
- /*
- * The state of the parent won't change if this mount is
- * already unmounted or marked as without children.
- */
- if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
- goto out;
+ m->mnt.mnt_flags |= MNT_UMOUNT;
+ list_del_init(&m->mnt_child);
+ move_from_ns(m);
+ list_add_tail(&m->mnt_list, to_umount);
+}
- /* Verify topper is the only grandchild that has not been
- * speculatively unmounted.
- */
- list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
- if (child->mnt_mountpoint == mnt->mnt.mnt_root)
- continue;
- if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
- continue;
- /* Found a mounted child */
- goto children;
- }
+static void remove_from_candidate_list(struct mount *m)
+{
+ m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE);
+ list_del_init(&m->mnt_list);
+}
- /* Mark mounts that can be unmounted if not locked */
- SET_MNT_MARK(mnt);
- progress = true;
+static void gather_candidates(struct list_head *set,
+ struct list_head *candidates)
+{
+ struct mount *m, *p, *q;
- /* If a mount is without children and not locked umount it. */
- if (!IS_MNT_LOCKED(mnt)) {
- umount_one(mnt, to_umount);
- } else {
-children:
- list_move_tail(&mnt->mnt_umounting, to_restore);
+ list_for_each_entry(m, set, mnt_list) {
+ if (is_candidate(m))
+ continue;
+ m->mnt_t_flags |= T_UMOUNT_CANDIDATE;
+ p = m->mnt_parent;
+ q = propagation_next(p, p);
+ while (q) {
+ struct mount *child = __lookup_mnt(&q->mnt,
+ m->mnt_mountpoint);
+ if (child) {
+ /*
+ * We might've already run into this one. That
+ * must've happened on earlier iteration of the
+ * outer loop; in that case we can skip those
+ * parents that get propagation from q - there
+ * will be nothing new on those as well.
+ */
+ if (is_candidate(child)) {
+ q = skip_propagation_subtree(q, p);
+ continue;
+ }
+ child->mnt_t_flags |= T_UMOUNT_CANDIDATE;
+ if (!will_be_unmounted(child))
+ list_add(&child->mnt_list, candidates);
+ }
+ q = propagation_next(q, p);
+ }
}
-out:
- return progress;
+ list_for_each_entry(m, set, mnt_list)
+ m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
}
-static void umount_list(struct list_head *to_umount,
- struct list_head *to_restore)
+/*
+ * We know that some child of @m can't be unmounted. In all places where the
+ * chain of descent of @m has child not overmounting the root of parent,
+ * the parent can't be unmounted either.
+ */
+static void trim_ancestors(struct mount *m)
{
- struct mount *mnt, *child, *tmp;
- list_for_each_entry(mnt, to_umount, mnt_list) {
- list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
- /* topper? */
- if (child->mnt_mountpoint == mnt->mnt.mnt_root)
- list_move_tail(&child->mnt_umounting, to_restore);
- else
- umount_one(child, to_umount);
- }
+ struct mount *p;
+
+ for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) {
+ if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts
+ return;
+ SET_MNT_MARK(m);
+ if (m != p->overmount)
+ p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE;
}
}
-static void restore_mounts(struct list_head *to_restore)
+/*
+ * Find and exclude all umount candidates forbidden by @m
+ * (see Documentation/filesystems/propagate_umount.txt)
+ * If we can immediately tell that @m is OK to unmount (unlocked
+ * and all children are already committed to unmounting) commit
+ * to unmounting it.
+ * Only @m itself might be taken from the candidates list;
+ * anything found by trim_ancestors() is marked non-candidate
+ * and left on the list.
+ */
+static void trim_one(struct mount *m, struct list_head *to_umount)
{
- /* Restore mounts to a clean working state */
- while (!list_empty(to_restore)) {
- struct mount *mnt, *parent;
- struct mountpoint *mp;
-
- mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
- CLEAR_MNT_MARK(mnt);
- list_del_init(&mnt->mnt_umounting);
-
- /* Should this mount be reparented? */
- mp = mnt->mnt_mp;
- parent = mnt->mnt_parent;
- while (parent->mnt.mnt_flags & MNT_UMOUNT) {
- mp = parent->mnt_mp;
- parent = parent->mnt_parent;
+ bool remove_this = false, found = false, umount_this = false;
+ struct mount *n;
+
+ if (!is_candidate(m)) { // trim_ancestors() left it on list
+ remove_from_candidate_list(m);
+ return;
+ }
+
+ list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+ if (!is_candidate(n)) {
+ found = true;
+ if (n != m->overmount) {
+ remove_this = true;
+ break;
+ }
}
- if (parent != mnt->mnt_parent)
- mnt_change_mountpoint(parent, mp, mnt);
+ }
+ if (found) {
+ trim_ancestors(m);
+ } else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+ remove_this = true;
+ umount_this = true;
+ }
+ if (remove_this) {
+ remove_from_candidate_list(m);
+ if (umount_this)
+ umount_one(m, to_umount);
}
}
-static void cleanup_umount_visitations(struct list_head *visited)
+static void handle_locked(struct mount *m, struct list_head *to_umount)
{
- while (!list_empty(visited)) {
- struct mount *mnt =
- list_first_entry(visited, struct mount, mnt_umounting);
- list_del_init(&mnt->mnt_umounting);
+ struct mount *cutoff = m, *p;
+
+ if (!is_candidate(m)) { // trim_ancestors() left it on list
+ remove_from_candidate_list(m);
+ return;
+ }
+ for (p = m; is_candidate(p); p = p->mnt_parent) {
+ remove_from_candidate_list(p);
+ if (!IS_MNT_LOCKED(p))
+ cutoff = p->mnt_parent;
+ }
+ if (will_be_unmounted(p))
+ cutoff = p;
+ while (m != cutoff) {
+ umount_one(m, to_umount);
+ m = m->mnt_parent;
}
}
/*
- * collect all mounts that receive propagation from the mount in @list,
- * and return these additional mounts in the same list.
- * @list: the list of mounts to be unmounted.
+ * @m is not to going away, and it overmounts the top of a stack of mounts
+ * that are going away. We know that all of those are fully overmounted
+ * by the one above (@m being the topmost of the chain), so @m can be slid
+ * in place where the bottom of the stack is attached.
*
- * vfsmount lock must be held for write
+ * NOTE: here we temporarily violate a constraint - two mounts end up with
+ * the same parent and mountpoint; that will be remedied as soon as we
+ * return from propagate_umount() - its caller (umount_tree()) will detach
+ * the stack from the parent it (and now @m) is attached to. umount_tree()
+ * might choose to keep unmounted pieces stuck to each other, but it always
+ * detaches them from the mounts that remain in the tree.
*/
-int propagate_umount(struct list_head *list)
+static void reparent(struct mount *m)
{
- struct mount *mnt;
- LIST_HEAD(to_restore);
- LIST_HEAD(to_umount);
- LIST_HEAD(visited);
-
- /* Find candidates for unmounting */
- list_for_each_entry_reverse(mnt, list, mnt_list) {
- struct mount *parent = mnt->mnt_parent;
- struct mount *m;
+ struct mount *p = m;
+ struct mountpoint *mp;
- /*
- * If this mount has already been visited it is known that it's
- * entire peer group and all of their slaves in the propagation
- * tree for the mountpoint has already been visited and there is
- * no need to visit them again.
- */
- if (!list_empty(&mnt->mnt_umounting))
- continue;
+ do {
+ mp = p->mnt_mp;
+ p = p->mnt_parent;
+ } while (will_be_unmounted(p));
- list_add_tail(&mnt->mnt_umounting, &visited);
- for (m = propagation_next(parent, parent); m;
- m = propagation_next(m, parent)) {
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint);
- if (!child)
- continue;
+ mnt_change_mountpoint(p, mp, m);
+ mnt_notify_add(m);
+}
- if (!list_empty(&child->mnt_umounting)) {
- /*
- * If the child has already been visited it is
- * know that it's entire peer group and all of
- * their slaves in the propgation tree for the
- * mountpoint has already been visited and there
- * is no need to visit this subtree again.
- */
- m = skip_propagation_subtree(m, parent);
- continue;
- } else if (child->mnt.mnt_flags & MNT_UMOUNT) {
- /*
- * We have come across a partially unmounted
- * mount in a list that has not been visited
- * yet. Remember it has been visited and
- * continue about our merry way.
- */
- list_add_tail(&child->mnt_umounting, &visited);
- continue;
- }
+/**
+ * propagate_umount - apply propagation rules to the set of mounts for umount()
+ * @set: the list of mounts to be unmounted.
+ *
+ * Collect all mounts that receive propagation from the mount in @set and have
+ * no obstacles to being unmounted. Add these additional mounts to the set.
+ *
+ * See Documentation/filesystems/propagate_umount.txt if you do anything in
+ * this area.
+ *
+ * Locks held:
+ * mount_lock (write_seqlock), namespace_sem (exclusive).
+ */
+void propagate_umount(struct list_head *set)
+{
+ struct mount *m, *p;
+ LIST_HEAD(to_umount); // committed to unmounting
+ LIST_HEAD(candidates); // undecided umount candidates
- /* Check the child and parents while progress is made */
- while (__propagate_umount(child,
- &to_umount, &to_restore)) {
- /* Is the parent a umount candidate? */
- child = child->mnt_parent;
- if (list_empty(&child->mnt_umounting))
- break;
- }
- }
+ // collect all candidates
+ gather_candidates(set, &candidates);
+
+ // reduce the set until it's non-shifting
+ list_for_each_entry_safe(m, p, &candidates, mnt_list)
+ trim_one(m, &to_umount);
+
+ // ... and non-revealing
+ while (!list_empty(&candidates)) {
+ m = list_first_entry(&candidates,struct mount, mnt_list);
+ handle_locked(m, &to_umount);
}
- umount_list(&to_umount, &to_restore);
- restore_mounts(&to_restore);
- cleanup_umount_visitations(&visited);
- list_splice_tail(&to_umount, list);
+ // now to_umount consists of all acceptable candidates
+ // deal with reparenting of surviving overmounts on those
+ list_for_each_entry(m, &to_umount, mnt_list) {
+ struct mount *over = m->overmount;
+ if (over && !will_be_unmounted(over))
+ reparent(over);
+ }
- return 0;
+ // and fold them into the set
+ list_splice_tail_init(&to_umount, set);
}
diff --git a/fs/pnode.h b/fs/pnode.h
index 0b02a6393891..00ab153e3e9d 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -10,14 +10,14 @@
#include <linux/list.h>
#include "mount.h"
-#define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
+#define IS_MNT_SHARED(m) ((m)->mnt_t_flags & T_SHARED)
#define IS_MNT_SLAVE(m) ((m)->mnt_master)
-#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
-#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
-#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
-#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
-#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
-#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define IS_MNT_NEW(m) (!(m)->mnt_ns)
+#define CLEAR_MNT_SHARED(m) ((m)->mnt_t_flags &= ~T_SHARED)
+#define IS_MNT_UNBINDABLE(m) ((m)->mnt_t_flags & T_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt_t_flags & T_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt_t_flags |= T_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt_t_flags &= ~T_MARKED)
#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
#define CL_EXPIRE 0x01
@@ -25,21 +25,26 @@
#define CL_COPY_UNBINDABLE 0x04
#define CL_MAKE_SHARED 0x08
#define CL_PRIVATE 0x10
-#define CL_SHARED_TO_SLAVE 0x20
#define CL_COPY_MNT_NS_FILE 0x40
-#define CL_COPY_ALL (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
-
+/*
+ * EXCL[namespace_sem]
+ */
static inline void set_mnt_shared(struct mount *mnt)
{
- mnt->mnt.mnt_flags &= ~MNT_SHARED_MASK;
- mnt->mnt.mnt_flags |= MNT_SHARED;
+ mnt->mnt_t_flags &= ~T_SHARED_MASK;
+ mnt->mnt_t_flags |= T_SHARED;
+}
+
+static inline bool peers(const struct mount *m1, const struct mount *m2)
+{
+ return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
}
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cd89e956c322..62d35631ba8c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -416,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = {
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
- * Returns the resolved symbol. If that fails, simply return the address.
+ * Returns the resolved symbol to user space.
*/
static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
@@ -827,7 +827,13 @@ static const struct file_operations proc_single_file_operations = {
.release = single_release,
};
-
+/*
+ * proc_mem_open() can return errno, NULL or mm_struct*.
+ *
+ * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING)
+ * - Returns mm_struct* on success
+ * - Returns error code on failure
+ */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
struct task_struct *task = get_proc_task(inode);
@@ -854,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
struct mm_struct *mm = proc_mem_open(inode, mode);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
@@ -1489,7 +1495,6 @@ static const struct file_operations proc_fail_nth_operations = {
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* Print out various scheduling related per-task fields:
*/
@@ -1539,8 +1544,6 @@ static const struct file_operations proc_pid_sched_operations = {
.release = single_release,
};
-#endif
-
#ifdef CONFIG_SCHED_AUTOGROUP
/*
* Print out autogroup related information:
@@ -2124,7 +2127,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx,
unsigned type = DT_UNKNOWN;
ino_t ino = 1;
- child = d_hash_and_lookup(dir, &qname);
+ child = try_lookup_noperm(&qname, dir);
if (!child) {
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
child = d_alloc_parallel(dir, &qname, &wq);
@@ -2497,11 +2500,9 @@ static const struct file_operations proc_map_files_operations = {
#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
struct timers_private {
- struct pid *pid;
- struct task_struct *task;
- struct sighand_struct *sighand;
- struct pid_namespace *ns;
- unsigned long flags;
+ struct pid *pid;
+ struct task_struct *task;
+ struct pid_namespace *ns;
};
static void *timers_start(struct seq_file *m, loff_t *pos)
@@ -2512,54 +2513,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->task)
return ERR_PTR(-ESRCH);
- tp->sighand = lock_task_sighand(tp->task, &tp->flags);
- if (!tp->sighand)
- return ERR_PTR(-ESRCH);
-
- return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
+ rcu_read_lock();
+ return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
+
+ return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
{
struct timers_private *tp = m->private;
- if (tp->sighand) {
- unlock_task_sighand(tp->task, &tp->flags);
- tp->sighand = NULL;
- }
-
if (tp->task) {
put_task_struct(tp->task);
tp->task = NULL;
+ rcu_read_unlock();
}
}
static int show_timer(struct seq_file *m, void *v)
{
- struct k_itimer *timer;
- struct timers_private *tp = m->private;
- int notify;
static const char * const nstr[] = {
- [SIGEV_SIGNAL] = "signal",
- [SIGEV_NONE] = "none",
- [SIGEV_THREAD] = "thread",
+ [SIGEV_SIGNAL] = "signal",
+ [SIGEV_NONE] = "none",
+ [SIGEV_THREAD] = "thread",
};
- timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
- notify = timer->it_sigev_notify;
+ struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
+ struct timers_private *tp = m->private;
+ int notify = timer->it_sigev_notify;
+
+ guard(spinlock_irq)(&timer->it_lock);
+ if (!posixtimer_valid(timer))
+ return 0;
seq_printf(m, "ID: %d\n", timer->it_id);
- seq_printf(m, "signal: %d/%px\n",
- timer->sigq.info.si_signo,
+ seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo,
timer->sigq.info.si_value.sival_ptr);
- seq_printf(m, "notify: %s/%s.%d\n",
- nstr[notify & ~SIGEV_THREAD_ID],
+ seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID],
(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
pid_nr_ns(timer->it_pid, tp->ns));
seq_printf(m, "ClockID: %d\n", timer->it_clock);
@@ -2709,8 +2704,7 @@ static struct dentry *proc_pident_instantiate(struct dentry *dentry,
inode->i_fop = p->fop;
ei->op = p->op;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_pident_lookup(struct inode *dir,
@@ -3296,7 +3290,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_KSM */
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3309,7 +3303,7 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
prev_depth, depth);
return 0;
}
-#endif /* CONFIG_STACKLEAK_METRICS */
+#endif /* CONFIG_KSTACK_ERASE_METRICS */
/*
* Thread groups
@@ -3331,9 +3325,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
#ifdef CONFIG_SCHED_AUTOGROUP
REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
@@ -3418,7 +3410,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_LIVEPATCH
ONE("patch_state", S_IRUSR, proc_pid_patch_state),
#endif
-#ifdef CONFIG_STACKLEAK_METRICS
+#ifdef CONFIG_KSTACK_ERASE_METRICS
ONE("stack_depth", S_IRUGO, proc_stack_depth),
#endif
#ifdef CONFIG_PROC_PID_ARCH_STATUS
@@ -3508,8 +3500,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
set_nlink(inode, nlink_tgid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags)
@@ -3682,9 +3673,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
-#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
-#endif
NOD("comm", S_IFREG|S_IRUGO|S_IWUSR,
&proc_tid_comm_inode_operations,
&proc_pid_set_comm_operations, {}),
@@ -3813,8 +3802,7 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
set_nlink(inode, nlink_tid);
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 37aa778d1af7..9eeccff49b2a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -352,18 +352,9 @@ static int proc_fd_getattr(struct mnt_idmap *idmap,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- int rv = 0;
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
-
- /* If it's a directory, put the number of open fds there */
- if (S_ISDIR(inode->i_mode)) {
- rv = proc_readfd_count(inode, &stat->size);
- if (rv < 0)
- return rv;
- }
-
- return rv;
+ return proc_readfd_count(inode, &stat->size);
}
const struct inode_operations proc_fd_inode_operations = {
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 8ec90826a49e..76e800e38c8f 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -254,8 +254,11 @@ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, de->proc_dops);
- return d_splice_alias(inode, dentry);
+ if (de->flags & PROC_ENTRY_FORCE_LOOKUP)
+ return d_splice_alias_ops(inode, dentry,
+ &proc_net_dentry_ops);
+ return d_splice_alias_ops(inode, dentry,
+ &proc_misc_dentry_ops);
}
read_unlock(&proc_subdir_lock);
return ERR_PTR(-ENOENT);
@@ -448,9 +451,8 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
INIT_LIST_HEAD(&ent->pde_openers);
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
- ent->proc_dops = &proc_misc_dentry_ops;
/* Revalidate everything under /proc/${pid}/net */
- if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ if ((*parent)->flags & PROC_ENTRY_FORCE_LOOKUP)
pde_force_lookup(ent);
out:
@@ -559,10 +561,18 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode,
return p;
}
-static inline void pde_set_flags(struct proc_dir_entry *pde)
+static void pde_set_flags(struct proc_dir_entry *pde)
{
if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT)
pde->flags |= PROC_ENTRY_PERMANENT;
+ if (pde->proc_ops->proc_read_iter)
+ pde->flags |= PROC_ENTRY_proc_read_iter;
+#ifdef CONFIG_COMPAT
+ if (pde->proc_ops->proc_compat_ioctl)
+ pde->flags |= PROC_ENTRY_proc_compat_ioctl;
+#endif
+ if (pde->proc_ops->proc_lseek)
+ pde->flags |= PROC_ENTRY_proc_lseek;
}
struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
@@ -626,6 +636,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode,
p->proc_ops = &proc_seq_ops;
p->seq_ops = ops;
p->state_size = state_size;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_seq_private);
@@ -656,6 +667,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode,
return NULL;
p->proc_ops = &proc_single_ops;
p->single_show = show;
+ pde_set_flags(p);
return proc_register(parent, p);
}
EXPORT_SYMBOL(proc_create_single_data);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 626ad7bd94f2..129490151be1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -42,7 +42,7 @@ static void proc_evict_inode(struct inode *inode)
head = ei->sysctl;
if (head) {
- RCU_INIT_POINTER(ei->sysctl, NULL);
+ WRITE_ONCE(ei->sysctl, NULL);
proc_sys_evict_inode(inode, head);
}
}
@@ -473,7 +473,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
typeof_member(struct proc_ops, proc_open) open;
struct pde_opener *pdeo;
- if (!pde->proc_ops->proc_lseek)
+ if (!pde_has_proc_lseek(pde))
file->f_mode &= ~FMODE_LSEEK;
if (pde_is_permanent(pde)) {
@@ -656,13 +656,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
if (S_ISREG(inode->i_mode)) {
inode->i_op = de->proc_iops;
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops;
else
inode->i_fop = &proc_reg_file_ops;
#ifdef CONFIG_COMPAT
- if (de->proc_ops->proc_compat_ioctl) {
- if (de->proc_ops->proc_read_iter)
+ if (pde_has_proc_compat_ioctl(de)) {
+ if (pde_has_proc_read_iter(de))
inode->i_fop = &proc_iter_file_ops_compat;
else
inode->i_fop = &proc_reg_file_ops_compat;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1695509370b8..e737401d7383 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -44,7 +44,6 @@ struct proc_dir_entry {
const struct proc_ops *proc_ops;
const struct file_operations *proc_dir_ops;
};
- const struct dentry_operations *proc_dops;
union {
const struct seq_operations *seq_ops;
int (*single_show)(struct seq_file *, void *);
@@ -85,6 +84,25 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde)
pde->flags |= PROC_ENTRY_PERMANENT;
}
+static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_read_iter;
+}
+
+static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde)
+{
+#ifdef CONFIG_COMPAT
+ return pde->flags & PROC_ENTRY_proc_compat_ioctl;
+#else
+ return false;
+#endif
+}
+
+static inline bool pde_has_proc_lseek(const struct proc_dir_entry *pde)
+{
+ return pde->flags & PROC_ENTRY_proc_lseek;
+}
+
extern struct kmem_cache *proc_dir_entry_cache;
void pde_free(struct proc_dir_entry *pde);
@@ -143,6 +161,7 @@ unsigned name_to_int(const struct qstr *qstr);
/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13
+#ifdef CONFIG_PAGE_MAPCOUNT
/**
* folio_precise_page_mapcount() - Number of mappings of this folio page.
* @folio: The folio.
@@ -173,7 +192,49 @@ static inline int folio_precise_page_mapcount(struct folio *folio,
return mapcount;
}
+#else /* !CONFIG_PAGE_MAPCOUNT */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ BUILD_BUG();
+}
+#endif /* CONFIG_PAGE_MAPCOUNT */
+/**
+ * folio_average_page_mapcount() - Average number of mappings per page in this
+ * folio
+ * @folio: The folio.
+ *
+ * The average number of user page table entries that reference each page in
+ * this folio as tracked via the RMAP: either referenced directly (PTE) or
+ * as part of a larger area that covers this page (e.g., PMD).
+ *
+ * The average is calculated by rounding to the nearest integer; however,
+ * to avoid duplicated code in current callers, the average is at least
+ * 1 if any page of the folio is mapped.
+ *
+ * Returns: The average number of mappings per page in this folio.
+ */
+static inline int folio_average_page_mapcount(struct folio *folio)
+{
+ int mapcount, entire_mapcount, avg;
+
+ if (!folio_test_large(folio))
+ return atomic_read(&folio->_mapcount) + 1;
+
+ mapcount = folio_large_mapcount(folio);
+ if (unlikely(mapcount <= 0))
+ return 0;
+ entire_mapcount = folio_entire_mapcount(folio);
+ if (mapcount <= entire_mapcount)
+ return entire_mapcount;
+ mapcount -= entire_mapcount;
+
+ /* Round to closest integer ... */
+ avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio);
+ /* ... but return at least 1. */
+ return max_t(int, avg + entire_mapcount, 1);
+}
/*
* array.c
*/
@@ -322,6 +383,11 @@ struct proc_maps_private {
struct task_struct *task;
struct mm_struct *mm;
struct vma_iterator iter;
+ loff_t last_pos;
+#ifdef CONFIG_PER_VMA_LOCK
+ bool mmap_locked;
+ struct vm_area_struct *locked_vma;
+#endif
#ifdef CONFIG_NUMA
struct mempolicy *task_mempolicy;
#endif
@@ -346,7 +412,7 @@ extern const struct dentry_operations proc_net_dentry_ops;
static inline void pde_force_lookup(struct proc_dir_entry *pde)
{
/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
- pde->proc_dops = &proc_net_dentry_ops;
+ pde->flags |= PROC_ENTRY_FORCE_LOOKUP;
}
/*
@@ -357,7 +423,6 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde)
static inline struct dentry *proc_splice_unmountable(struct inode *inode,
struct dentry *dentry, const struct dentry_operations *d_ops)
{
- d_set_d_op(dentry, d_ops);
dont_mount(dentry);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, d_ops);
}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 1cb33771bf9f..728630b10fdf 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -34,8 +34,6 @@
#include <asm/sections.h>
#include "internal.h"
-#define CORE_STR "CORE"
-
#ifndef ELF_CORE_EFLAGS
#define ELF_CORE_EFLAGS 0
#endif
@@ -122,7 +120,9 @@ static void update_kcore_size(void)
kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr);
kcore_notes_len = (4 * sizeof(struct elf_note) +
- 3 * ALIGN(sizeof(CORE_STR), 4) +
+ ALIGN(sizeof(NN_PRSTATUS), 4) +
+ ALIGN(sizeof(NN_PRPSINFO), 4) +
+ ALIGN(sizeof(NN_TASKSTRUCT), 4) +
VMCOREINFO_NOTE_NAME_BYTES +
ALIGN(sizeof(struct elf_prstatus), 4) +
ALIGN(sizeof(struct elf_prpsinfo), 4) +
@@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out;
}
- append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus,
+ append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus,
sizeof(prstatus));
- append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo,
+ append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo,
sizeof(prpsinfo));
- append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current,
+ append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current,
arch_task_struct_size);
/*
* vmcoreinfo_size is mostly constant after init time, but it
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 8ba9b1472390..a458f1e112fd 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,10 +120,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
global_node_page_state(NR_SECONDARY_PAGETABLE));
show_val_kb(m, "NFS_Unstable: ", 0);
- show_val_kb(m, "Bounce: ",
- global_zone_page_state(NR_BOUNCE));
- show_val_kb(m, "WritebackTmp: ",
- global_node_page_state(NR_WRITEBACK_TEMP));
+ show_val_kb(m, "Bounce: ", 0);
+ show_val_kb(m, "WritebackTmp: ", 0);
show_val_kb(m, "CommitLimit: ", vm_commit_limit());
show_val_kb(m, "Committed_AS: ", committed);
seq_printf(m, "VmallocTotal: %8lu kB\n",
@@ -162,6 +160,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
show_val_kb(m, "Unaccepted: ",
global_zone_page_state(NR_UNACCEPTED));
#endif
+ show_val_kb(m, "Balloon: ",
+ global_node_page_state(NR_BALLOON_PAGES));
hugetlb_report_meminfo(m);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index c610224faf10..4403a2e20c16 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -111,8 +111,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
ei->ns_ops = ns_ops;
pid_update_inode(task, inode);
- d_set_d_op(dentry, &pid_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return d_splice_alias_ops(inode, dentry, &pid_dentry_operations);
}
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/page.c b/fs/proc/page.c
index a55f5acefa97..ba3568e97fd1 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -22,6 +22,12 @@
#define KPMMASK (KPMSIZE - 1)
#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
+enum kpage_operation {
+ KPAGE_FLAGS,
+ KPAGE_COUNT,
+ KPAGE_CGROUP,
+};
+
static inline unsigned long get_max_dump_pfn(void)
{
#ifdef CONFIG_SPARSEMEM
@@ -37,19 +43,33 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
-/* /proc/kpagecount - an array exposing page mapcounts
- *
- * Each entry is a u64 representing the corresponding
- * physical page mapcount.
- */
-static ssize_t kpagecount_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static u64 get_kpage_count(const struct page *page)
+{
+ struct page_snapshot ps;
+ u64 ret;
+
+ snapshot_page(&ps, page);
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ ret = folio_precise_page_mapcount(&ps.folio_snapshot,
+ &ps.page_snapshot);
+ else
+ ret = folio_average_page_mapcount(&ps.folio_snapshot);
+
+ return ret;
+}
+
+static ssize_t kpage_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos,
+ enum kpage_operation op)
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
+ struct page *page;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
+ u64 info;
pfn = src / KPMSIZE;
if (src & KPMMASK || count & KPMMASK)
@@ -59,19 +79,31 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
- struct page *page;
- u64 mapcount = 0;
-
/*
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
page = pfn_to_online_page(pfn);
- if (page)
- mapcount = folio_precise_page_mapcount(page_folio(page),
- page);
- if (put_user(mapcount, out)) {
+ if (page) {
+ switch (op) {
+ case KPAGE_FLAGS:
+ info = stable_page_flags(page);
+ break;
+ case KPAGE_COUNT:
+ info = get_kpage_count(page);
+ break;
+ case KPAGE_CGROUP:
+ info = page_cgroup_ino(page);
+ break;
+ default:
+ info = 0;
+ break;
+ }
+ } else
+ info = 0;
+
+ if (put_user(info, out)) {
ret = -EFAULT;
break;
}
@@ -89,17 +121,23 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
return ret;
}
+/* /proc/kpagecount - an array exposing page mapcounts
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page mapcount.
+ */
+static ssize_t kpagecount_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return kpage_read(file, buf, count, ppos, KPAGE_COUNT);
+}
+
static const struct proc_ops kpagecount_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
.proc_read = kpagecount_read,
};
-/* /proc/kpageflags - an array exposing page flags
- *
- * Each entry is a u64 representing the corresponding
- * physical page flags.
- */
static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
{
@@ -109,6 +147,7 @@ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
u64 stable_page_flags(const struct page *page)
{
const struct folio *folio;
+ struct page_snapshot ps;
unsigned long k;
unsigned long mapping;
bool is_anon;
@@ -120,20 +159,22 @@ u64 stable_page_flags(const struct page *page)
*/
if (!page)
return 1 << KPF_NOPAGE;
- folio = page_folio(page);
+
+ snapshot_page(&ps, page);
+ folio = &ps.folio_snapshot;
k = folio->flags;
mapping = (unsigned long)folio->mapping;
- is_anon = mapping & PAGE_MAPPING_ANON;
+ is_anon = mapping & FOLIO_MAPPING_ANON;
/*
* pseudo flags for the well known (anonymous) memory mapped pages
*/
- if (page_mapped(page))
+ if (folio_mapped(folio))
u |= 1 << KPF_MMAP;
if (is_anon) {
u |= 1 << KPF_ANON;
- if (mapping & PAGE_MAPPING_KSM)
+ if (mapping & FOLIO_MAPPING_KSM)
u |= 1 << KPF_KSM;
}
@@ -141,7 +182,7 @@ u64 stable_page_flags(const struct page *page)
* compound pages: export both head/tail info
* they together define a compound page's start/end pos and order
*/
- if (page == &folio->page)
+ if (ps.idx == 0)
u |= kpf_copy_bit(k, KPF_COMPOUND_HEAD, PG_head);
else
u |= 1 << KPF_COMPOUND_TAIL;
@@ -151,25 +192,19 @@ u64 stable_page_flags(const struct page *page)
folio_test_large_rmappable(folio)) {
/* Note: we indicate any THPs here, not just PMD-sized ones */
u |= 1 << KPF_THP;
- } else if (is_huge_zero_folio(folio)) {
+ } else if (is_huge_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
u |= 1 << KPF_THP;
- } else if (is_zero_folio(folio)) {
+ } else if (is_zero_pfn(ps.pfn)) {
u |= 1 << KPF_ZERO_PAGE;
}
- /*
- * Caveats on high order pages: PG_buddy and PG_slab will only be set
- * on the head page.
- */
- if (PageBuddy(page))
- u |= 1 << KPF_BUDDY;
- else if (page_count(page) == 0 && is_free_buddy_page(page))
+ if (ps.flags & PAGE_SNAPSHOT_PG_BUDDY)
u |= 1 << KPF_BUDDY;
- if (PageOffline(page))
+ if (folio_test_offline(folio))
u |= 1 << KPF_OFFLINE;
- if (PageTable(page))
+ if (folio_test_pgtable(folio))
u |= 1 << KPF_PGTABLE;
if (folio_test_slab(folio))
u |= 1 << KPF_SLAB;
@@ -177,7 +212,7 @@ u64 stable_page_flags(const struct page *page)
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
u |= kpf_copy_bit(k, KPF_IDLE, PG_idle);
#else
- if (folio_test_idle(folio))
+ if (ps.flags & PAGE_SNAPSHOT_PG_IDLE)
u |= 1 << KPF_IDLE;
#endif
@@ -203,7 +238,7 @@ u64 stable_page_flags(const struct page *page)
if (u & (1 << KPF_HUGE))
u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison);
else
- u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
+ u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison);
#endif
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
@@ -220,47 +255,17 @@ u64 stable_page_flags(const struct page *page)
#endif
return u;
-};
+}
+/* /proc/kpageflags - an array exposing page flags
+ *
+ * Each entry is a u64 representing the corresponding
+ * physical page flags.
+ */
static ssize_t kpageflags_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- struct page *page = pfn_to_online_page(pfn);
-
- if (put_user(stable_page_flags(page), out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_FLAGS);
}
static const struct proc_ops kpageflags_proc_ops = {
@@ -271,53 +276,10 @@ static const struct proc_ops kpageflags_proc_ops = {
#ifdef CONFIG_MEMCG
static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+ size_t count, loff_t *ppos)
{
- const unsigned long max_dump_pfn = get_max_dump_pfn();
- u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
- unsigned long src = *ppos;
- unsigned long pfn;
- ssize_t ret = 0;
- u64 ino;
-
- pfn = src / KPMSIZE;
- if (src & KPMMASK || count & KPMMASK)
- return -EINVAL;
- if (src >= max_dump_pfn * KPMSIZE)
- return 0;
- count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
-
- while (count > 0) {
- /*
- * TODO: ZONE_DEVICE support requires to identify
- * memmaps that were actually initialized.
- */
- ppage = pfn_to_online_page(pfn);
-
- if (ppage)
- ino = page_cgroup_ino(ppage);
- else
- ino = 0;
-
- if (put_user(ino, out)) {
- ret = -EFAULT;
- break;
- }
-
- pfn++;
- out++;
- count -= KPMSIZE;
-
- cond_resched();
- }
-
- *ppos += (char __user *)out - buf;
- if (!ret)
- ret = (char __user *)out - buf;
- return ret;
+ return kpage_read(file, buf, count, ppos, KPAGE_CGROUP);
}
-
static const struct proc_ops kpagecgroup_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_lseek = mem_lseek,
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index cc9d74a06ff0..49ab74e0bfde 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -540,9 +540,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- d_set_d_op(dentry, &proc_sys_dentry_operations);
inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- err = d_splice_alias(inode, dentry);
+ err = d_splice_alias_ops(inode, dentry, &proc_sys_dentry_operations);
out:
if (h)
@@ -699,9 +698,9 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
if (d_in_lookup(child)) {
struct dentry *res;
- d_set_d_op(child, &proc_sys_dentry_operations);
inode = proc_sys_make_inode(dir->d_sb, head, table);
- res = d_splice_alias(inode, child);
+ res = d_splice_alias_ops(inode, child,
+ &proc_sys_dentry_operations);
d_lookup_done(child);
if (unlikely(res)) {
dput(child);
@@ -918,17 +917,21 @@ static int proc_sys_compare(const struct dentry *dentry,
struct ctl_table_header *head;
struct inode *inode;
- /* Although proc doesn't have negative dentries, rcu-walk means
- * that inode here can be NULL */
- /* AV: can it, indeed? */
- inode = d_inode_rcu(dentry);
- if (!inode)
- return 1;
if (name->len != len)
return 1;
if (memcmp(name->name, str, len))
return 1;
- head = rcu_dereference(PROC_I(inode)->sysctl);
+
+ // false positive is fine here - we'll recheck anyway
+ if (d_in_lookup(dentry))
+ return 0;
+
+ inode = d_inode_rcu(dentry);
+ // we just might have run into dentry in the middle of __dentry_kill()
+ if (!inode)
+ return 1;
+
+ head = READ_ONCE(PROC_I(inode)->sysctl);
return !head || !sysctl_is_seen(head);
}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 06a297a27ba3..ed86ac710384 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -363,12 +363,12 @@ static const struct inode_operations proc_root_inode_operations = {
* This is the root "inode" in the /proc tree..
*/
struct proc_dir_entry proc_root = {
- .low_ino = PROC_ROOT_INO,
- .namelen = 5,
- .mode = S_IFDIR | S_IRUGO | S_IXUGO,
- .nlink = 2,
+ .low_ino = PROCFS_ROOT_INO,
+ .namelen = 5,
+ .mode = S_IFDIR | S_IRUGO | S_IXUGO,
+ .nlink = 2,
.refcnt = REFCOUNT_INIT(1),
- .proc_iops = &proc_root_inode_operations,
+ .proc_iops = &proc_root_inode_operations,
.proc_dir_ops = &proc_root_operations,
.parent = &proc_root,
.subdir = RB_ROOT,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f02cd362309a..29cca0e6d0ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -29,6 +29,9 @@
#include <asm/tlbflush.h>
#include "internal.h"
+#define SENTINEL_VMA_END -1
+#define SENTINEL_VMA_GATE -2
+
#define SEQ_PUT_DEC(str, val) \
seq_put_decimal_ull_width(m, str, (val) << (PAGE_SHIFT-10), 8)
void task_mem(struct seq_file *m, struct mm_struct *mm)
@@ -36,9 +39,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
unsigned long text, lib, swap, anon, file, shmem;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
- anon = get_mm_counter(mm, MM_ANONPAGES);
- file = get_mm_counter(mm, MM_FILEPAGES);
- shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+ anon = get_mm_counter_sum(mm, MM_ANONPAGES);
+ file = get_mm_counter_sum(mm, MM_FILEPAGES);
+ shmem = get_mm_counter_sum(mm, MM_SHMEMPAGES);
/*
* Note: to minimize their overhead, mm maintains hiwater_vm and
@@ -59,7 +62,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = min(text, mm->exec_vm << PAGE_SHIFT);
lib = (mm->exec_vm << PAGE_SHIFT) - text;
- swap = get_mm_counter(mm, MM_SWAPENTS);
+ swap = get_mm_counter_sum(mm, MM_SWAPENTS);
SEQ_PUT_DEC("VmPeak:\t", hiwater_vm);
SEQ_PUT_DEC(" kB\nVmSize:\t", total_vm);
SEQ_PUT_DEC(" kB\nVmLck:\t", mm->locked_vm);
@@ -92,12 +95,12 @@ unsigned long task_statm(struct mm_struct *mm,
unsigned long *shared, unsigned long *text,
unsigned long *data, unsigned long *resident)
{
- *shared = get_mm_counter(mm, MM_FILEPAGES) +
- get_mm_counter(mm, MM_SHMEMPAGES);
+ *shared = get_mm_counter_sum(mm, MM_FILEPAGES) +
+ get_mm_counter_sum(mm, MM_SHMEMPAGES);
*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
>> PAGE_SHIFT;
*data = mm->data_vm + mm->stack_vm;
- *resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
+ *resident = *shared + get_mm_counter_sum(mm, MM_ANONPAGES);
return mm->total_vm;
}
@@ -127,15 +130,134 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
}
#endif
-static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
- loff_t *ppos)
+#ifdef CONFIG_PER_VMA_LOCK
+
+static void unlock_vma(struct proc_maps_private *priv)
+{
+ if (priv->locked_vma) {
+ vma_end_read(priv->locked_vma);
+ priv->locked_vma = NULL;
+ }
+}
+
+static const struct seq_operations proc_pid_maps_op;
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_private *priv)
+{
+ /*
+ * smaps and numa_maps perform page table walk, therefore require
+ * mmap_lock but maps can be read with locking just the vma and
+ * walking the vma tree under rcu read protection.
+ */
+ if (m->op != &proc_pid_maps_op) {
+ if (mmap_read_lock_killable(priv->mm))
+ return false;
+
+ priv->mmap_locked = true;
+ } else {
+ rcu_read_lock();
+ priv->locked_vma = NULL;
+ priv->mmap_locked = false;
+ }
+
+ return true;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+ if (priv->mmap_locked) {
+ mmap_read_unlock(priv->mm);
+ } else {
+ unlock_vma(priv);
+ rcu_read_unlock();
+ }
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ struct vm_area_struct *vma;
+
+ if (priv->mmap_locked)
+ return vma_next(&priv->iter);
+
+ unlock_vma(priv);
+ vma = lock_next_vma(priv->mm, &priv->iter, last_pos);
+ if (!IS_ERR_OR_NULL(vma))
+ priv->locked_vma = vma;
+
+ return vma;
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ if (priv->mmap_locked)
+ return false;
+
+ rcu_read_unlock();
+ mmap_read_lock(priv->mm);
+ /* Reinitialize the iterator after taking mmap_lock */
+ vma_iter_set(&priv->iter, pos);
+ priv->mmap_locked = true;
+
+ return true;
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline bool lock_vma_range(struct seq_file *m,
+ struct proc_maps_private *priv)
+{
+ return mmap_read_lock_killable(priv->mm) == 0;
+}
+
+static inline void unlock_vma_range(struct proc_maps_private *priv)
+{
+ mmap_read_unlock(priv->mm);
+}
+
+static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv,
+ loff_t last_pos)
+{
+ return vma_next(&priv->iter);
+}
+
+static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv,
+ loff_t pos)
+{
+ return false;
+}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
+static struct vm_area_struct *proc_get_vma(struct seq_file *m, loff_t *ppos)
{
- struct vm_area_struct *vma = vma_next(&priv->iter);
+ struct proc_maps_private *priv = m->private;
+ struct vm_area_struct *vma;
+retry:
+ vma = get_next_vma(priv, *ppos);
+ /* EINTR of EAGAIN is possible */
+ if (IS_ERR(vma)) {
+ if (PTR_ERR(vma) == -EAGAIN && fallback_to_mmap_lock(priv, *ppos))
+ goto retry;
+
+ return vma;
+ }
+
+ /* Store previous position to be able to restart if needed */
+ priv->last_pos = *ppos;
if (vma) {
- *ppos = vma->vm_start;
+ /*
+ * Track the end of the reported vma to ensure position changes
+ * even if previous vma was merged with the next vma and we
+ * found the extended vma with the same vm_start.
+ */
+ *ppos = vma->vm_end;
} else {
- *ppos = -2UL;
+ *ppos = SENTINEL_VMA_GATE;
vma = get_gate_vma(priv->mm);
}
@@ -145,11 +267,11 @@ static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv,
static void *m_start(struct seq_file *m, loff_t *ppos)
{
struct proc_maps_private *priv = m->private;
- unsigned long last_addr = *ppos;
+ loff_t last_addr = *ppos;
struct mm_struct *mm;
/* See m_next(). Zero at the start or after lseek. */
- if (last_addr == -1UL)
+ if (last_addr == SENTINEL_VMA_END)
return NULL;
priv->task = get_proc_task(priv->inode);
@@ -163,28 +285,34 @@ static void *m_start(struct seq_file *m, loff_t *ppos)
return NULL;
}
- if (mmap_read_lock_killable(mm)) {
+ if (!lock_vma_range(m, priv)) {
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
return ERR_PTR(-EINTR);
}
- vma_iter_init(&priv->iter, mm, last_addr);
+ /*
+ * Reset current position if last_addr was set before
+ * and it's not a sentinel.
+ */
+ if (last_addr > 0)
+ *ppos = last_addr = priv->last_pos;
+ vma_iter_init(&priv->iter, mm, (unsigned long)last_addr);
hold_task_mempolicy(priv);
- if (last_addr == -2UL)
+ if (last_addr == SENTINEL_VMA_GATE)
return get_gate_vma(mm);
- return proc_get_vma(priv, ppos);
+ return proc_get_vma(m, ppos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
{
- if (*ppos == -2UL) {
- *ppos = -1UL;
+ if (*ppos == SENTINEL_VMA_GATE) {
+ *ppos = SENTINEL_VMA_END;
return NULL;
}
- return proc_get_vma(m->private, ppos);
+ return proc_get_vma(m, ppos);
}
static void m_stop(struct seq_file *m, void *v)
@@ -196,7 +324,7 @@ static void m_stop(struct seq_file *m, void *v)
return;
release_task_mempolicy(priv);
- mmap_read_unlock(mm);
+ unlock_vma_range(priv);
mmput(mm);
put_task_struct(priv->task);
priv->task = NULL;
@@ -707,6 +835,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
unsigned long size = nr * PAGE_SIZE;
+ bool exclusive;
+ int mapcount;
/*
* First accumulate quantities that depend only on |size| and the type
@@ -747,18 +877,29 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
dirty, locked, present);
return;
}
+
+ if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
+ mapcount = folio_average_page_mapcount(folio);
+ exclusive = !folio_maybe_mapped_shared(folio);
+ }
+
/*
* We obtain a snapshot of the mapcount. Without holding the folio lock
* this snapshot can be slightly wrong as we cannot always read the
* mapcount atomically.
*/
for (i = 0; i < nr; i++, page++) {
- int mapcount = folio_precise_page_mapcount(folio, page);
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
+ mapcount = folio_precise_page_mapcount(folio, page);
+ exclusive = mapcount < 2;
+ }
+
if (mapcount >= 2)
pss /= mapcount;
smaps_page_accumulate(mss, folio, PAGE_SIZE, pss,
- dirty, locked, mapcount < 2);
+ dirty, locked, exclusive);
}
}
@@ -1007,10 +1148,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
struct folio *folio = NULL;
bool present = false;
+ spinlock_t *ptl;
+ pte_t ptent;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ ptent = huge_ptep_get(walk->mm, addr, pte);
if (pte_present(ptent)) {
folio = page_folio(pte_page(ptent));
present = true;
@@ -1023,12 +1167,13 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
if (folio) {
/* We treat non-present entries as "maybe shared". */
- if (!present || folio_likely_mapped_shared(folio) ||
+ if (!present || folio_maybe_mapped_shared(folio) ||
hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
}
+ spin_unlock(ptl);
return 0;
}
#else
@@ -1312,8 +1457,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file)
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- ret = PTR_ERR(priv->mm);
+ if (IS_ERR_OR_NULL(priv->mm)) {
+ ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
single_release(inode, file);
goto out_free;
@@ -1632,6 +1777,7 @@ struct pagemapread {
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
#define PM_UFFD_WP BIT_ULL(57)
+#define PM_GUARD_REGION BIT_ULL(58)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
#define PM_PRESENT BIT_ULL(63)
@@ -1651,6 +1797,13 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
return 0;
}
+static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page)
+{
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ return folio_precise_page_mapcount(folio, page) == 1;
+ return !folio_maybe_mapped_shared(folio);
+}
+
static int pagemap_pte_hole(unsigned long start, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
@@ -1732,6 +1885,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
page = pfn_swap_entry_to_page(entry);
if (pte_marker_entry_uffd_wp(entry))
flags |= PM_UFFD_WP;
+ if (is_guard_swp_entry(entry))
+ flags |= PM_GUARD_REGION;
}
if (page) {
@@ -1739,7 +1894,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (!folio_test_anon(folio))
flags |= PM_FILE;
if ((flags & PM_PRESENT) &&
- folio_precise_page_mapcount(folio, page) == 1)
+ __folio_page_mapped_exclusively(folio, page))
flags |= PM_MMAP_EXCLUSIVE;
}
if (vma->vm_flags & VM_SOFTDIRTY)
@@ -1814,7 +1969,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pagemap_entry_t pme;
if (folio && (flags & PM_PRESENT) &&
- folio_precise_page_mapcount(folio, page + idx) == 1)
+ __folio_page_mapped_exclusively(folio, page))
cur_flags |= PM_MMAP_EXCLUSIVE;
pme = make_pme(frame, cur_flags);
@@ -1866,12 +2021,14 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
struct pagemapread *pm = walk->private;
struct vm_area_struct *vma = walk->vma;
u64 flags = 0, frame = 0;
+ spinlock_t *ptl;
int err = 0;
pte_t pte;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, ptep);
pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
struct folio *folio = page_folio(pte_page(pte));
@@ -1879,7 +2036,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (!folio_test_anon(folio))
flags |= PM_FILE;
- if (!folio_likely_mapped_shared(folio) &&
+ if (!folio_maybe_mapped_shared(folio) &&
!hugetlb_pmd_shared(ptep))
flags |= PM_MMAP_EXCLUSIVE;
@@ -1899,11 +2056,12 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
err = add_to_pagemap(&pme, pm);
if (err)
- return err;
+ break;
if (pm->show_pfn && (flags & PM_PRESENT))
frame++;
}
+ spin_unlock(ptl);
cond_resched();
return err;
@@ -1931,7 +2089,8 @@ static const struct mm_walk_ops pagemap_ops = {
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
* Bit 57 pte is uffd-wp write-protected
- * Bits 58-60 zero
+ * Bit 58 pte is a guard region
+ * Bits 59-60 zero
* Bit 61 page is file-page or shared-anon
* Bit 62 page swapped
* Bit 63 page present
@@ -2045,8 +2204,8 @@ static int pagemap_open(struct inode *inode, struct file *file)
struct mm_struct *mm;
mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(mm))
- return PTR_ERR(mm);
+ if (IS_ERR_OR_NULL(mm))
+ return mm ? PTR_ERR(mm) : -ESRCH;
file->private_data = mm;
return 0;
}
@@ -2063,7 +2222,8 @@ static int pagemap_release(struct inode *inode, struct file *file)
#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
PAGE_IS_FILE | PAGE_IS_PRESENT | \
PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
- PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
+ PAGE_IS_GUARD)
#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
struct pagemap_scan_private {
@@ -2104,12 +2264,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN;
- if (p->masks_of_interest & PAGE_IS_FILE) {
- swp = pte_to_swp_entry(pte);
- if (is_pfn_swap_entry(swp) &&
- !folio_test_anon(pfn_swap_entry_folio(swp)))
- categories |= PAGE_IS_FILE;
- }
+ swp = pte_to_swp_entry(pte);
+ if (is_guard_swp_entry(swp))
+ categories |= PAGE_IS_GUARD;
+ else if ((p->masks_of_interest & PAGE_IS_FILE) &&
+ is_pfn_swap_entry(swp) &&
+ !folio_test_anon(pfn_swap_entry_folio(swp)))
+ categories |= PAGE_IS_FILE;
+
if (pte_swp_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
}
@@ -2155,7 +2317,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
categories |= PAGE_IS_FILE;
}
- if (is_zero_pfn(pmd_pfn(pmd)))
+ if (is_huge_zero_pmd(pmd))
categories |= PAGE_IS_PFNZERO;
if (pmd_soft_dirty(pmd))
categories |= PAGE_IS_SOFT_DIRTY;
@@ -2455,22 +2617,19 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
spinlock_t *ptl;
int ret;
- arch_enter_lazy_mmu_mode();
-
ret = pagemap_scan_thp_entry(pmd, start, end, walk);
- if (ret != -ENOENT) {
- arch_leave_lazy_mmu_mode();
+ if (ret != -ENOENT)
return ret;
- }
ret = 0;
start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
if (!pte) {
- arch_leave_lazy_mmu_mode();
walk->action = ACTION_AGAIN;
return 0;
}
+ arch_enter_lazy_mmu_mode();
+
if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
/* Fast path for performing exclusive WP */
for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
@@ -2539,8 +2698,8 @@ flush_and_return:
if (flush_end)
flush_tlb_range(vma, start, addr);
- pte_unmap_unlock(start_pte, ptl);
arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(start_pte, ptl);
cond_resched();
return ret;
@@ -2855,7 +3014,12 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
struct folio *folio = page_folio(page);
- int count = folio_precise_page_mapcount(folio, page);
+ int count;
+
+ if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
+ count = folio_precise_page_mapcount(folio, page);
+ else
+ count = folio_average_page_mapcount(folio);
md->pages += nr_pages;
if (pte_dirty || folio_test_dirty(folio))
@@ -2971,17 +3135,22 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
+ pte_t huge_pte;
struct numa_maps *md;
struct page *page;
+ spinlock_t *ptl;
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ huge_pte = huge_ptep_get(walk->mm, addr, pte);
if (!pte_present(huge_pte))
- return 0;
+ goto out;
page = pte_page(huge_pte);
md = walk->private;
gather_stats(page, md, pte_dirty(huge_pte), 1);
+out:
+ spin_unlock(ptl);
return 0;
}
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index bce674533000..59bfd61d653a 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -260,8 +260,8 @@ static int maps_open(struct inode *inode, struct file *file,
priv->inode = inode;
priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
- if (IS_ERR(priv->mm)) {
- int err = PTR_ERR(priv->mm);
+ if (IS_ERR_OR_NULL(priv->mm)) {
+ int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH;
seq_release_private(inode, file);
return err;
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a00120a3c099..f188bd900eb2 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -1490,10 +1490,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
return -EINVAL;
dump = vzalloc(sizeof(*dump));
- if (!dump) {
- ret = -ENOMEM;
- goto out_err;
- }
+ if (!dump)
+ return -ENOMEM;
/* Keep size of the buffer page aligned so that it can be mmaped */
data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
@@ -1519,17 +1517,17 @@ int vmcore_add_device_dump(struct vmcoredd_data *data)
dump->size = data_size;
/* Add the dump to driver sysfs list and update the elfcore hdr */
- mutex_lock(&vmcore_mutex);
- if (vmcore_opened)
- pr_warn_once("Unexpected adding of device dump\n");
- if (vmcore_open) {
- ret = -EBUSY;
- goto out_err;
- }
+ scoped_guard(mutex, &vmcore_mutex) {
+ if (vmcore_opened)
+ pr_warn_once("Unexpected adding of device dump\n");
+ if (vmcore_open) {
+ ret = -EBUSY;
+ goto out_err;
+ }
- list_add_tail(&dump->list, &vmcoredd_list);
- vmcoredd_update_size(data_size);
- mutex_unlock(&vmcore_mutex);
+ list_add_tail(&dump->list, &vmcoredd_list);
+ vmcoredd_update_size(data_size);
+ }
return 0;
out_err:
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e133b507ddf3..5c555db68aa2 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -111,7 +111,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
} else {
- mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ mangle(m, r->mnt_devname);
}
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -177,7 +177,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
} else {
- mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ mangle(m, r->mnt_devname);
}
seq_puts(m, sb_rdonly(sb) ? " ro" : " rw");
err = show_sb_opts(m, sb);
@@ -199,17 +199,13 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
int err;
/* device */
+ seq_puts(m, "device ");
if (sb->s_op->show_devname) {
- seq_puts(m, "device ");
err = sb->s_op->show_devname(m, mnt_path.dentry);
if (err)
goto out;
} else {
- if (r->mnt_devname) {
- seq_puts(m, "device ");
- mangle(m, r->mnt_devname);
- } else
- seq_puts(m, "no device");
+ mangle(m, r->mnt_devname);
}
/* mount point */
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 56815799ce79..1a2e1185426c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -14,10 +14,10 @@
#include <linux/init.h>
#include <linux/list.h>
#include <linux/string.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/ramfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/sched.h>
#include <linux/magic.h>
#include <linux/pstore.h>
@@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb)
}
enum {
- Opt_kmsg_bytes, Opt_err
+ Opt_kmsg_bytes
};
-static const match_table_t tokens = {
- {Opt_kmsg_bytes, "kmsg_bytes=%u"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec pstore_param_spec[] = {
+ fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes),
+ {}
};
-static void parse_options(char *options)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
-
- if (!options)
- return;
+struct pstore_context {
+ unsigned int kmsg_bytes;
+};
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
+static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct pstore_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- if (!*p)
- continue;
+ opt = fs_parse(fc, pstore_param_spec, param, &result);
+ /* pstore has historically ignored invalid kmsg_bytes param */
+ if (opt < 0)
+ return 0;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_kmsg_bytes:
- if (!match_int(&args[0], &option))
- pstore_set_kmsg_bytes(option);
- break;
- }
+ switch (opt) {
+ case Opt_kmsg_bytes:
+ ctx->kmsg_bytes = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
+
+ return 0;
}
/*
@@ -265,14 +266,16 @@ static void parse_options(char *options)
static int pstore_show_options(struct seq_file *m, struct dentry *root)
{
if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES)
- seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes);
+ seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes);
return 0;
}
-static int pstore_remount(struct super_block *sb, int *flags, char *data)
+static int pstore_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- parse_options(data);
+ struct pstore_context *ctx = fc->fs_private;
+
+ sync_filesystem(fc->root->d_sb);
+ pstore_set_kmsg_bytes(ctx->kmsg_bytes);
return 0;
}
@@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.evict_inode = pstore_evict_inode,
- .remount_fs = pstore_remount,
.show_options = pstore_show_options,
};
@@ -298,7 +300,7 @@ static struct dentry *psinfo_lock_root(void)
return NULL;
root = pstore_sb->s_root;
- inode_lock(d_inode(root));
+ inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
return root;
}
@@ -316,8 +318,7 @@ int pstore_put_backend_records(struct pstore_info *psi)
list_for_each_entry_safe(pos, tmp, &records_list, list) {
if (pos->record->psi == psi) {
list_del_init(&pos->list);
- d_invalidate(pos->dentry);
- simple_unlink(d_inode(root), pos->dentry);
+ locked_recursive_removal(pos->dentry, NULL);
pos->dentry = NULL;
}
}
@@ -406,8 +407,9 @@ void pstore_get_records(int quiet)
inode_unlock(d_inode(root));
}
-static int pstore_fill_super(struct super_block *sb, void *data, int silent)
+static int pstore_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct pstore_context *ctx = fc->fs_private;
struct inode *inode;
sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -417,7 +419,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &pstore_ops;
sb->s_time_gran = 1;
- parse_options(data);
+ pstore_set_kmsg_bytes(ctx->kmsg_bytes);
inode = pstore_get_inode(sb);
if (inode) {
@@ -438,12 +440,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
return 0;
}
-static struct dentry *pstore_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int pstore_get_tree(struct fs_context *fc)
+{
+ if (fc->root)
+ return pstore_reconfigure(fc);
+
+ return get_tree_single(fc, pstore_fill_super);
+}
+
+static void pstore_free_fc(struct fs_context *fc)
{
- return mount_single(fs_type, flags, data, pstore_fill_super);
+ kfree(fc->fs_private);
}
+static const struct fs_context_operations pstore_context_ops = {
+ .parse_param = pstore_parse_param,
+ .get_tree = pstore_get_tree,
+ .reconfigure = pstore_reconfigure,
+ .free = pstore_free_fc,
+};
+
static void pstore_kill_sb(struct super_block *sb)
{
guard(mutex)(&pstore_sb_lock);
@@ -456,11 +472,33 @@ static void pstore_kill_sb(struct super_block *sb)
INIT_LIST_HEAD(&records_list);
}
+static int pstore_init_fs_context(struct fs_context *fc)
+{
+ struct pstore_context *ctx;
+
+ ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ /*
+ * Global kmsg_bytes is initialized to default, and updated
+ * every time we (re)mount the single-sb filesystem with the
+ * option specified.
+ */
+ ctx->kmsg_bytes = kmsg_bytes;
+
+ fc->fs_private = ctx;
+ fc->ops = &pstore_context_ops;
+
+ return 0;
+}
+
static struct file_system_type pstore_fs_type = {
.owner = THIS_MODULE,
.name = "pstore",
- .mount = pstore_mount,
.kill_sb = pstore_kill_sb,
+ .init_fs_context = pstore_init_fs_context,
+ .parameters = pstore_param_spec,
};
int __init pstore_init_fs(void)
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 801d6c0b170c..a0fc51196910 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -6,7 +6,7 @@
#include <linux/time.h>
#include <linux/pstore.h>
-extern unsigned long kmsg_bytes;
+extern unsigned int kmsg_bytes;
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
@@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {}
extern struct pstore_info *psinfo;
-extern void pstore_set_kmsg_bytes(int);
+extern void pstore_set_kmsg_bytes(unsigned int bytes);
extern void pstore_get_records(int);
extern void pstore_get_backend_records(struct pstore_info *psi,
struct dentry *root, int quiet);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index f56b066ab80c..f8b9c9c73997 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -92,8 +92,8 @@ module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "compression to use");
/* How much of the kernel log to snapshot */
-unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
-module_param(kmsg_bytes, ulong, 0444);
+unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES;
+module_param(kmsg_bytes, uint, 0444);
MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)");
static void *compress_workspace;
@@ -107,9 +107,9 @@ static void *compress_workspace;
static char *big_oops_buf;
static size_t max_compressed_size;
-void pstore_set_kmsg_bytes(int bytes)
+void pstore_set_kmsg_bytes(unsigned int bytes)
{
- kmsg_bytes = bytes;
+ WRITE_ONCE(kmsg_bytes, bytes);
}
/* Tag each group of saved records with a sequence number */
@@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
struct kmsg_dump_detail *detail)
{
struct kmsg_dump_iter iter;
+ unsigned int remaining = READ_ONCE(kmsg_bytes);
unsigned long total = 0;
const char *why;
unsigned int part = 1;
@@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
kmsg_dump_rewind(&iter);
oopscount++;
- while (total < kmsg_bytes) {
+ while (total < remaining) {
char *dst;
size_t dst_size;
int header_size;
@@ -562,7 +563,7 @@ void pstore_unregister(struct pstore_info *psi)
pstore_unregister_kmsg();
/* Stop timer and make sure all work has finished. */
- del_timer_sync(&pstore_timer);
+ timer_delete_sync(&pstore_timer);
flush_work(&pstore_work);
/* Remove all backend records from filesystem tree. */
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 825c5c2e0962..df4a9b348769 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2560,7 +2560,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
+ dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root);
if (IS_ERR(dentry))
return PTR_ERR(dentry);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index b45c7edc3225..b11f5b20b78b 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -41,7 +41,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
const struct file_operations ramfs_file_operations = {
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.fsync = noop_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 7a6d980e614d..77b8ca2757e0 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -28,7 +28,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long len,
unsigned long pgoff,
unsigned long flags);
-static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc);
static unsigned ramfs_mmap_capabilities(struct file *file)
{
@@ -38,7 +38,7 @@ static unsigned ramfs_mmap_capabilities(struct file *file)
const struct file_operations ramfs_file_operations = {
.mmap_capabilities = ramfs_mmap_capabilities,
- .mmap = ramfs_nommu_mmap,
+ .mmap_prepare = ramfs_nommu_mmap_prepare,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
@@ -262,12 +262,12 @@ out:
/*
* set up a mapping for shared memory segments
*/
-static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+static int ramfs_nommu_mmap_prepare(struct vm_area_desc *desc)
{
- if (!is_nommu_shared_mapping(vma->vm_flags))
+ if (!is_nommu_shared_mapping(desc->vm_flags))
return -ENOSYS;
- file_accessed(file);
- vma->vm_ops = &generic_file_vm_ops;
+ file_accessed(desc->file);
+ desc->vm_ops = &generic_file_vm_ops;
return 0;
}
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 8006faaaf0ec..f8874c3b8c1e 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -119,13 +119,13 @@ out:
return error;
}
-static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
- return retval;
+ return ERR_PTR(retval);
}
static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir,
@@ -269,6 +269,7 @@ static int ramfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RAMFS_MAGIC;
sb->s_op = &ramfs_ops;
+ sb->s_d_flags = DCACHE_DONTCACHE;
sb->s_time_gran = 1;
inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
diff --git a/fs/read_write.c b/fs/read_write.c
index a6133241dfb8..c5b6265d984b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -28,7 +28,7 @@
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .mmap = generic_file_readonly_mmap,
+ .mmap_prepare = generic_file_readonly_mmap_prepare,
.splice_read = filemap_splice_read,
};
@@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
if (whence == SEEK_CUR) {
/*
- * f_lock protects against read/modify/write race with
- * other SEEK_CURs. Note that parallel writes and reads
- * behave like SEEK_SET.
+ * If the file requires locking via f_pos_lock we know
+ * that mutual exclusion for SEEK_CUR on the same file
+ * is guaranteed. If the file isn't locked, we take
+ * f_lock to protect against f_pos races with other
+ * SEEK_CURs.
*/
- guard(spinlock)(&file->f_lock);
+ if (file_seek_cur_needs_f_lock(file)) {
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
return vfs_setpos(file, file->f_pos + offset, maxsize);
}
@@ -232,7 +237,7 @@ EXPORT_SYMBOL(generic_llseek_cookie);
* @offset: file offset to seek to
* @whence: type of seek
*
- * This is a generic implemenation of ->llseek useable for all normal local
+ * This is a generic implementation of ->llseek useable for all normal local
* filesystems. It just updates the file offset to the value specified by
* @offset and @whence.
*/
@@ -327,7 +332,9 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
loff_t retval;
- inode_lock(inode);
+ retval = inode_lock_killable(inode);
+ if (retval)
+ return retval;
switch (whence) {
case SEEK_END:
offset += i_size_read(inode);
diff --git a/fs/readdir.c b/fs/readdir.c
index 0038efda417b..7764b8638978 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -252,7 +253,6 @@ struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -266,12 +266,16 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
int prev_reclen;
+ unsigned int flags = d_type;
+
+ BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
+ d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -279,7 +283,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
return false;
}
prev_reclen = buf->prev_reclen;
- if (prev_reclen && signal_pending(current))
+ if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
@@ -296,7 +300,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen,
buf->current_dir = (void __user *)dirent + reclen;
buf->prev_reclen = reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -311,7 +315,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -329,7 +333,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -338,7 +342,6 @@ struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -351,15 +354,19 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
int prev_reclen;
+ unsigned int flags = d_type;
+
+ BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
+ d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
prev_reclen = buf->prev_reclen;
- if (prev_reclen && signal_pending(current))
+ if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *)dirent - prev_reclen;
@@ -376,7 +383,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
@@ -392,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
- .count = count,
+ .ctx.count = count,
.current_dir = dirent
};
int error;
@@ -411,7 +418,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
if (put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
@@ -475,6 +482,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
+ .ctx.count = 1, /* Hint to fs: just one entry. */
.dirent = dirent
};
@@ -499,7 +507,6 @@ struct compat_getdents_callback {
struct dir_context ctx;
struct compat_linux_dirent __user *current_dir;
int prev_reclen;
- int count;
int error;
};
@@ -513,12 +520,16 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
namlen + 2, sizeof(compat_long_t));
int prev_reclen;
+ unsigned int flags = d_type;
+
+ BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK);
+ d_type &= S_DT_MASK;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
return false;
buf->error = -EINVAL; /* only used if we fail.. */
- if (reclen > buf->count)
+ if (reclen > ctx->count)
return false;
d_ino = ino;
if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
@@ -526,7 +537,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
return false;
}
prev_reclen = buf->prev_reclen;
- if (prev_reclen && signal_pending(current))
+ if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current))
return false;
dirent = buf->current_dir;
prev = (void __user *) dirent - prev_reclen;
@@ -542,7 +553,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen
buf->prev_reclen = reclen;
buf->current_dir = (void __user *)dirent + reclen;
- buf->count -= reclen;
+ ctx->count -= reclen;
return true;
efault_end:
user_write_access_end();
@@ -557,8 +568,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
+ .ctx.count = count,
.current_dir = dirent,
- .count = count
};
int error;
@@ -575,7 +586,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
- error = count - buf.count;
+ error = count - buf.ctx.count;
}
return error;
}
diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig
new file mode 100644
index 000000000000..21671301bd8a
--- /dev/null
+++ b/fs/resctrl/Kconfig
@@ -0,0 +1,39 @@
+config RESCTRL_FS
+ bool "CPU Resource Control Filesystem (resctrl)"
+ depends on ARCH_HAS_CPU_RESCTRL
+ select KERNFS
+ select PROC_CPU_RESCTRL if PROC_FS
+ help
+ Some architectures provide hardware facilities to group tasks and
+ monitor and control their usage of memory system resources such as
+ caches and memory bandwidth. Examples of such facilities include
+ Intel's Resource Director Technology (Intel(R) RDT) and AMD's
+ Platform Quality of Service (AMD QoS).
+
+ If your system has the necessary support and you want to be able to
+ assign tasks to groups and manipulate the associated resource
+ monitors and controls from userspace, say Y here to get a mountable
+ 'resctrl' filesystem that lets you do just that.
+
+ If nothing mounts or prods the 'resctrl' filesystem, resource
+ controls and monitors are left in a quiescent, permissive state.
+
+ On architectures where this can be disabled independently, it is
+ safe to say N.
+
+ See <file:Documentation/filesystems/resctrl.rst> for more information.
+
+config RESCTRL_FS_PSEUDO_LOCK
+ bool
+ depends on RESCTRL_FS
+ help
+ Software mechanism to pin data in a cache portion using
+ micro-architecture specific knowledge.
+
+config RESCTRL_RMID_DEPENDS_ON_CLOSID
+ bool
+ depends on RESCTRL_FS
+ help
+ Enabled by the architecture when the RMID values depend on the CLOSID.
+ This causes the CLOSID allocator to search for CLOSID with clean
+ RMID.
diff --git a/fs/resctrl/Makefile b/fs/resctrl/Makefile
new file mode 100644
index 000000000000..e67f34d2236a
--- /dev/null
+++ b/fs/resctrl/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o
+obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o
+
+# To allow define_trace.h's recursive include:
+CFLAGS_monitor.o = -I$(src)
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
new file mode 100644
index 000000000000..d98e0d2de09f
--- /dev/null
+++ b/fs/resctrl/ctrlmondata.c
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/tick.h>
+
+#include "internal.h"
+
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
+typedef int (ctrlval_parser_t)(struct rdt_parse_data *data,
+ struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d);
+
+/*
+ * Check whether MBA bandwidth percentage value is correct. The value is
+ * checked against the minimum and max bandwidth values specified by the
+ * hardware. The allocated bandwidth percentage is rounded to the next
+ * control step available on the hardware.
+ */
+static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r)
+{
+ int ret;
+ u32 bw;
+
+ /*
+ * Only linear delay values is supported for current Intel SKUs.
+ */
+ if (!r->membw.delay_linear && r->membw.arch_needs_linear) {
+ rdt_last_cmd_puts("No support for non-linear MB domains\n");
+ return false;
+ }
+
+ ret = kstrtou32(buf, 10, &bw);
+ if (ret) {
+ rdt_last_cmd_printf("Invalid MB value %s\n", buf);
+ return false;
+ }
+
+ /* Nothing else to do if software controller is enabled. */
+ if (is_mba_sc(r)) {
+ *data = bw;
+ return true;
+ }
+
+ if (bw < r->membw.min_bw || bw > r->membw.max_bw) {
+ rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n",
+ bw, r->membw.min_bw, r->membw.max_bw);
+ return false;
+ }
+
+ *data = roundup(bw, (unsigned long)r->membw.bw_gran);
+ return true;
+}
+
+static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
+{
+ struct resctrl_staged_config *cfg;
+ u32 closid = data->rdtgrp->closid;
+ struct rdt_resource *r = s->res;
+ u32 bw_val;
+
+ cfg = &d->staged_config[s->conf_type];
+ if (cfg->have_new_ctrl) {
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
+ return -EINVAL;
+ }
+
+ if (!bw_validate(data->buf, &bw_val, r))
+ return -EINVAL;
+
+ if (is_mba_sc(r)) {
+ d->mbps_val[closid] = bw_val;
+ return 0;
+ }
+
+ cfg->new_ctrl = bw_val;
+ cfg->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
+ * Check whether a cache bit mask is valid.
+ * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID:
+ * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1
+ * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1
+ *
+ * Haswell does not support a non-contiguous 1s value and additionally
+ * requires at least two bits set.
+ * AMD allows non-contiguous bitmasks.
+ */
+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
+{
+ u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1;
+ unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit, val;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &val);
+ if (ret) {
+ rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf);
+ return false;
+ }
+
+ if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) {
+ rdt_last_cmd_puts("Mask out of range\n");
+ return false;
+ }
+
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+ /* Are non-contiguous bitmasks allowed? */
+ if (!r->cache.arch_has_sparse_bitmasks &&
+ (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) {
+ rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val);
+ return false;
+ }
+
+ if ((zero_bit - first_bit) < r->cache.min_cbm_bits) {
+ rdt_last_cmd_printf("Need at least %d bits in the mask\n",
+ r->cache.min_cbm_bits);
+ return false;
+ }
+
+ *data = val;
+ return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s,
+ struct rdt_ctrl_domain *d)
+{
+ struct rdtgroup *rdtgrp = data->rdtgrp;
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
+ u32 cbm_val;
+
+ cfg = &d->staged_config[s->conf_type];
+ if (cfg->have_new_ctrl) {
+ rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id);
+ return -EINVAL;
+ }
+
+ /*
+ * Cannot set up more than one pseudo-locked region in a cache
+ * hierarchy.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ rdtgroup_pseudo_locked_in_hierarchy(d)) {
+ rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n");
+ return -EINVAL;
+ }
+
+ if (!cbm_validate(data->buf, &cbm_val, r))
+ return -EINVAL;
+
+ if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_SHAREABLE) &&
+ rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
+ rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n");
+ return -EINVAL;
+ }
+
+ /*
+ * The CBM may not overlap with the CBM of another closid if
+ * either is exclusive.
+ */
+ if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) {
+ rdt_last_cmd_puts("Overlaps with exclusive group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) {
+ if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ rdt_last_cmd_puts("Overlaps with other group\n");
+ return -EINVAL;
+ }
+ }
+
+ cfg->new_ctrl = cbm_val;
+ cfg->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ * id=mask
+ * separated by ";". The "id" is in decimal, and must match one of
+ * the "id"s for this resource.
+ */
+static int parse_line(char *line, struct resctrl_schema *s,
+ struct rdtgroup *rdtgrp)
+{
+ enum resctrl_conf_type t = s->conf_type;
+ ctrlval_parser_t *parse_ctrlval = NULL;
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
+ struct rdt_parse_data data;
+ struct rdt_ctrl_domain *d;
+ char *dom = NULL, *id;
+ unsigned long dom_id;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ parse_ctrlval = &parse_cbm;
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ parse_ctrlval = &parse_bw;
+ break;
+ }
+
+ if (WARN_ON_ONCE(!parse_ctrlval))
+ return -EINVAL;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) {
+ rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+ return -EINVAL;
+ }
+
+next:
+ if (!line || line[0] == '\0')
+ return 0;
+ dom = strsep(&line, ";");
+ id = strsep(&dom, "=");
+ if (!dom || kstrtoul(id, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing '=' or non-numeric domain\n");
+ return -EINVAL;
+ }
+ dom = strim(dom);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
+ data.buf = dom;
+ data.rdtgrp = rdtgrp;
+ if (parse_ctrlval(&data, s, d))
+ return -EINVAL;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ cfg = &d->staged_config[t];
+ /*
+ * In pseudo-locking setup mode and just
+ * parsed a valid CBM that should be
+ * pseudo-locked. Only one locked region per
+ * resource group and domain so just do
+ * the required initialization for single
+ * region and return.
+ */
+ rdtgrp->plr->s = s;
+ rdtgrp->plr->d = d;
+ rdtgrp->plr->cbm = cfg->new_ctrl;
+ d->plr = rdtgrp->plr;
+ return 0;
+ }
+ goto next;
+ }
+ }
+ return -EINVAL;
+}
+
+static int rdtgroup_parse_resource(char *resname, char *tok,
+ struct rdtgroup *rdtgrp)
+{
+ struct resctrl_schema *s;
+
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid)
+ return parse_line(tok, s, rdtgrp);
+ }
+ rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname);
+ return -EINVAL;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct resctrl_schema *s;
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ char *tok, *resname;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ /*
+ * No changes to pseudo-locked region allowed. It has to be removed
+ * and re-created instead.
+ */
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Resource group is pseudo-locked\n");
+ goto out;
+ }
+
+ rdt_staged_configs_clear();
+
+ while ((tok = strsep(&buf, "\n")) != NULL) {
+ resname = strim(strsep(&tok, ":"));
+ if (!tok) {
+ rdt_last_cmd_puts("Missing ':'\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (tok[0] == '\0') {
+ rdt_last_cmd_printf("Missing '%s' value\n", resname);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
+ if (ret)
+ goto out;
+ }
+
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
+
+ /*
+ * Writes to mba_sc resources update the software controller,
+ * not the control MSR.
+ */
+ if (is_mba_sc(r))
+ continue;
+
+ ret = resctrl_arch_update_domains(r, rdtgrp->closid);
+ if (ret)
+ goto out;
+ }
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * If pseudo-locking fails we keep the resource group in
+ * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
+ * active and updated for just the domain the pseudo-locked
+ * region was requested for.
+ */
+ ret = rdtgroup_pseudo_lock_create(rdtgrp);
+ }
+
+out:
+ rdt_staged_configs_clear();
+ rdtgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
+{
+ struct rdt_resource *r = schema->res;
+ struct rdt_ctrl_domain *dom;
+ bool sep = false;
+ u32 ctrl_val;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ seq_printf(s, "%*s:", max_name_width, schema->name);
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ if (is_mba_sc(r))
+ ctrl_val = dom->mbps_val[closid];
+ else
+ ctrl_val = resctrl_arch_get_config(r, dom, closid,
+ schema->conf_type);
+
+ seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct resctrl_schema *schema;
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+ u32 closid;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ list_for_each_entry(schema, &resctrl_schema_all, list) {
+ seq_printf(s, "%s:uninitialized\n", schema->name);
+ }
+ } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ seq_printf(s, "%s:%d=%x\n",
+ rdtgrp->plr->s->res->name,
+ rdtgrp->plr->d->hdr.id,
+ rdtgrp->plr->cbm);
+ }
+ } else {
+ closid = rdtgrp->closid;
+ list_for_each_entry(schema, &resctrl_schema_all, list) {
+ if (closid < schema->num_closid)
+ show_doms(s, schema, closid);
+ }
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
+
+static int smp_mon_event_count(void *arg)
+{
+ mon_event_count(arg);
+
+ return 0;
+}
+
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (!strcmp(buf, "mbm_local_bytes")) {
+ if (resctrl_arch_is_mbm_local_enabled())
+ rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else
+ ret = -EINVAL;
+ } else if (!strcmp(buf, "mbm_total_bytes")) {
+ if (resctrl_arch_is_mbm_total_enabled())
+ rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+ else
+ ret = -EINVAL;
+ } else {
+ ret = -EINVAL;
+ }
+
+ if (ret)
+ rdt_last_cmd_printf("Unsupported event id '%s'\n", buf);
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp) {
+ switch (rdtgrp->mba_mbps_event) {
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ seq_puts(s, "mbm_local_bytes\n");
+ break;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ seq_puts(s, "mbm_total_bytes\n");
+ break;
+ default:
+ pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event);
+ ret = -EINVAL;
+ break;
+ }
+ } else {
+ ret = -ENOENT;
+ }
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain_hdr *d;
+ struct list_head *l;
+
+ list_for_each(l, h) {
+ d = list_entry(l, struct rdt_domain_hdr, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
+void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first)
+{
+ int cpu;
+
+ /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ /*
+ * Setup the parameters to pass to mon_event_count() to read the data.
+ */
+ rr->rgrp = rdtgrp;
+ rr->evtid = evtid;
+ rr->r = r;
+ rr->d = d;
+ rr->first = first;
+ rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid);
+ if (IS_ERR(rr->arch_mon_ctx)) {
+ rr->err = -EINVAL;
+ return;
+ }
+
+ cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU);
+
+ /*
+ * cpumask_any_housekeeping() prefers housekeeping CPUs, but
+ * are all the CPUs nohz_full? If yes, pick a CPU to IPI.
+ * MPAM's resctrl_arch_rmid_read() is unable to read the
+ * counters on some platforms if its called in IRQ context.
+ */
+ if (tick_nohz_full_cpu(cpu))
+ smp_call_function_any(cpumask, mon_event_count, rr, 1);
+ else
+ smp_call_on_cpu(cpu, smp_mon_event_count, rr, false);
+
+ resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
+}
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ enum resctrl_res_level resid;
+ enum resctrl_event_id evtid;
+ struct rdt_domain_hdr *hdr;
+ struct rmid_read rr = {0};
+ struct rdt_mon_domain *d;
+ struct rdtgroup *rdtgrp;
+ int domid, cpu, ret = 0;
+ struct rdt_resource *r;
+ struct cacheinfo *ci;
+ struct mon_data *md;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ md = of->kn->priv;
+ if (WARN_ON_ONCE(!md)) {
+ ret = -EIO;
+ goto out;
+ }
+
+ resid = md->rid;
+ domid = md->domid;
+ evtid = md->evtid;
+ r = resctrl_arch_get_resource(resid);
+
+ if (md->sum) {
+ /*
+ * This file requires summing across all domains that share
+ * the L3 cache id that was provided in the "domid" field of the
+ * struct mon_data. Search all domains in the resource for
+ * one that matches this cache id.
+ */
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->ci_id == domid) {
+ rr.ci_id = d->ci_id;
+ cpu = cpumask_any(&d->hdr.cpu_mask);
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci)
+ continue;
+ mon_event_read(&rr, r, NULL, rdtgrp,
+ &ci->shared_cpu_map, evtid, false);
+ goto checkresult;
+ }
+ }
+ ret = -ENOENT;
+ goto out;
+ } else {
+ /*
+ * This file provides data from a single domain. Search
+ * the resource to find the domain with "domid".
+ */
+ hdr = resctrl_find_domain(&r->mon_domains, domid, NULL);
+ if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ d = container_of(hdr, struct rdt_mon_domain, hdr);
+ mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
+ }
+
+checkresult:
+
+ if (rr.err == -EIO)
+ seq_puts(m, "Error\n");
+ else if (rr.err == -EINVAL)
+ seq_puts(m, "Unavailable\n");
+ else
+ seq_printf(m, "%llu\n", rr.val);
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
new file mode 100644
index 000000000000..0a1eedba2b03
--- /dev/null
+++ b/fs/resctrl/internal.h
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _FS_RESCTRL_INTERNAL_H
+#define _FS_RESCTRL_INTERNAL_H
+
+#include <linux/resctrl.h>
+#include <linux/kernfs.h>
+#include <linux/fs_context.h>
+#include <linux/tick.h>
+
+#define CQM_LIMBOCHECK_INTERVAL 1000
+
+/**
+ * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that
+ * aren't marked nohz_full
+ * @mask: The mask to pick a CPU from.
+ * @exclude_cpu:The CPU to avoid picking.
+ *
+ * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping
+ * CPUs that don't use nohz_full, these are preferred. Pass
+ * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs.
+ *
+ * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available.
+ */
+static inline unsigned int
+cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu)
+{
+ unsigned int cpu;
+
+ /* Try to find a CPU that isn't nohz_full to use in preference */
+ if (tick_nohz_full_enabled()) {
+ cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ }
+
+ return cpumask_any_but(mask, exclude_cpu);
+}
+
+struct rdt_fs_context {
+ struct kernfs_fs_context kfc;
+ bool enable_cdpl2;
+ bool enable_cdpl3;
+ bool enable_mba_mbps;
+ bool enable_debug;
+};
+
+static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct rdt_fs_context, kfc);
+}
+
+/**
+ * struct mon_evt - Entry in the event list of a resource
+ * @evtid: event id
+ * @name: name of the event
+ * @configurable: true if the event is configurable
+ * @list: entry in &rdt_resource->evt_list
+ */
+struct mon_evt {
+ enum resctrl_event_id evtid;
+ char *name;
+ bool configurable;
+ struct list_head list;
+};
+
+/**
+ * struct mon_data - Monitoring details for each event file.
+ * @list: Member of the global @mon_data_kn_priv_list list.
+ * @rid: Resource id associated with the event file.
+ * @evtid: Event id associated with the event file.
+ * @sum: Set when event must be summed across multiple
+ * domains.
+ * @domid: When @sum is zero this is the domain to which
+ * the event file belongs. When @sum is one this
+ * is the id of the L3 cache that all domains to be
+ * summed share.
+ *
+ * Pointed to by the kernfs kn->priv field of monitoring event files.
+ * Readers and writers must hold rdtgroup_mutex.
+ */
+struct mon_data {
+ struct list_head list;
+ enum resctrl_res_level rid;
+ enum resctrl_event_id evtid;
+ int domid;
+ bool sum;
+};
+
+/**
+ * struct rmid_read - Data passed across smp_call*() to read event count.
+ * @rgrp: Resource group for which the counter is being read. If it is a parent
+ * resource group then its event count is summed with the count from all
+ * its child resource groups.
+ * @r: Resource describing the properties of the event being read.
+ * @d: Domain that the counter should be read from. If NULL then sum all
+ * domains in @r sharing L3 @ci.id
+ * @evtid: Which monitor event to read.
+ * @first: Initialize MBM counter when true.
+ * @ci_id: Cacheinfo id for L3. Only set when @d is NULL. Used when summing domains.
+ * @err: Error encountered when reading counter.
+ * @val: Returned value of event counter. If @rgrp is a parent resource group,
+ * @val includes the sum of event counts from its child resource groups.
+ * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id,
+ * (summed across child resource groups if @rgrp is a parent resource group).
+ * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only).
+ */
+struct rmid_read {
+ struct rdtgroup *rgrp;
+ struct rdt_resource *r;
+ struct rdt_mon_domain *d;
+ enum resctrl_event_id evtid;
+ bool first;
+ unsigned int ci_id;
+ int err;
+ u64 val;
+ void *arch_mon_ctx;
+};
+
+extern struct list_head resctrl_schema_all;
+
+extern bool resctrl_mounted;
+
+enum rdt_group_type {
+ RDTCTRL_GROUP = 0,
+ RDTMON_GROUP,
+ RDT_NUM_GROUP,
+};
+
+/**
+ * enum rdtgrp_mode - Mode of a RDT resource group
+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
+ * allowed AND the allocations are Cache Pseudo-Locked
+ * @RDT_NUM_MODES: Total number of modes
+ *
+ * The mode of a resource group enables control over the allowed overlap
+ * between allocations associated with different resource groups (classes
+ * of service). User is able to modify the mode of a resource group by
+ * writing to the "mode" resctrl file associated with the resource group.
+ *
+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
+ * writing the appropriate text to the "mode" file. A resource group enters
+ * "pseudo-locked" mode after the schemata is written while the resource
+ * group is in "pseudo-locksetup" mode.
+ */
+enum rdtgrp_mode {
+ RDT_MODE_SHAREABLE = 0,
+ RDT_MODE_EXCLUSIVE,
+ RDT_MODE_PSEUDO_LOCKSETUP,
+ RDT_MODE_PSEUDO_LOCKED,
+
+ /* Must be last */
+ RDT_NUM_MODES,
+};
+
+/**
+ * struct mongroup - store mon group's data in resctrl fs.
+ * @mon_data_kn: kernfs node for the mon_data directory
+ * @parent: parent rdtgrp
+ * @crdtgrp_list: child rdtgroup node list
+ * @rmid: rmid for this rdtgroup
+ */
+struct mongroup {
+ struct kernfs_node *mon_data_kn;
+ struct rdtgroup *parent;
+ struct list_head crdtgrp_list;
+ u32 rmid;
+};
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ * @type: indicates type of this rdtgroup - either
+ * monitor only or ctrl_mon group
+ * @mon: mongroup related data
+ * @mode: mode of resource group
+ * @mba_mbps_event: input monitoring event id when mba_sc is enabled
+ * @plr: pseudo-locked region
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ u32 closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+ enum rdt_group_type type;
+ struct mongroup mon;
+ enum rdtgrp_mode mode;
+ enum resctrl_event_id mba_mbps_event;
+ struct pseudo_lock_region *plr;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* rftype.flags */
+#define RFTYPE_FLAGS_CPUS_LIST 1
+
+/*
+ * Define the file type flags for base and info directories.
+ */
+#define RFTYPE_INFO BIT(0)
+
+#define RFTYPE_BASE BIT(1)
+
+#define RFTYPE_CTRL BIT(4)
+
+#define RFTYPE_MON BIT(5)
+
+#define RFTYPE_TOP BIT(6)
+
+#define RFTYPE_RES_CACHE BIT(8)
+
+#define RFTYPE_RES_MB BIT(9)
+
+#define RFTYPE_DEBUG BIT(10)
+
+#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL)
+
+#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON)
+
+#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP)
+
+#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL)
+
+#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON)
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+extern int max_name_width;
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: File name
+ * @mode: Access mode
+ * @kf_ops: File operations
+ * @flags: File specific RFTYPE_FLAGS_* flags
+ * @fflags: File specific RFTYPE_* flags
+ * @seq_show: Show content of the file
+ * @write: Write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ const struct kernfs_ops *kf_ops;
+ unsigned long flags;
+ unsigned long fflags;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct mbm_state - status for each MBM counter in each domain
+ * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
+ * @prev_bw: The most recent bandwidth in MBps
+ */
+struct mbm_state {
+ u64 prev_bw_bytes;
+ u32 prev_bw;
+};
+
+extern struct mutex rdtgroup_mutex;
+
+static inline const char *rdt_kn_name(const struct kernfs_node *kn)
+{
+ return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex));
+}
+
+extern struct rdtgroup rdtgroup_default;
+
+extern struct dentry *debugfs_resctrl;
+
+extern enum resctrl_event_id mba_mbps_default_event;
+
+void rdt_last_cmd_clear(void);
+
+void rdt_last_cmd_puts(const char *s);
+
+__printf(1, 2)
+void rdt_last_cmd_printf(const char *fmt, ...);
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
+
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask);
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+
+ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+
+int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
+ unsigned long cbm, int closid, bool exclusive);
+
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ unsigned long cbm);
+
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
+
+int rdtgroup_tasks_assigned(struct rdtgroup *r);
+
+int closids_supported(void);
+
+void closid_free(int closid);
+
+int alloc_rmid(u32 closid);
+
+void free_rmid(u32 closid, u32 rmid);
+
+void resctrl_mon_resource_exit(void);
+
+void mon_event_count(void *info);
+
+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
+
+void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
+ struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
+ cpumask_t *cpumask, int evtid, int first);
+
+int resctrl_mon_resource_init(void);
+
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom,
+ unsigned long delay_ms,
+ int exclude_cpu);
+
+void mbm_handle_overflow(struct work_struct *work);
+
+bool is_mba_sc(struct rdt_resource *r);
+
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+ int exclude_cpu);
+
+void cqm_handle_limbo(struct work_struct *work);
+
+bool has_busy_rmid(struct rdt_mon_domain *d);
+
+void __check_limbo(struct rdt_mon_domain *d, bool force_free);
+
+void resctrl_file_fflags_init(const char *config, unsigned long fflags);
+
+void rdt_staged_configs_clear(void);
+
+bool closid_allocated(unsigned int closid);
+
+int resctrl_find_cleanest_closid(void);
+
+#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
+
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
+
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm);
+
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d);
+
+int rdt_pseudo_lock_init(void);
+
+void rdt_pseudo_lock_release(void);
+
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
+
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
+
+#else
+static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+ return false;
+}
+
+static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+ return false;
+}
+
+static inline int rdt_pseudo_lock_init(void) { return 0; }
+static inline void rdt_pseudo_lock_release(void) { }
+static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { }
+#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */
+
+#endif /* _FS_RESCTRL_INTERNAL_H */
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
new file mode 100644
index 000000000000..f5637855c3ac
--- /dev/null
+++ b/fs/resctrl/monitor.c
@@ -0,0 +1,931 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Monitoring code
+ *
+ * Copyright (C) 2017 Intel Corporation
+ *
+ * Author:
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This replaces the cqm.c based on perf but we reuse a lot of
+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) "resctrl: " fmt
+
+#include <linux/cpu.h>
+#include <linux/resctrl.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+
+#include "internal.h"
+
+#define CREATE_TRACE_POINTS
+
+#include "monitor_trace.h"
+
+/**
+ * struct rmid_entry - dirty tracking for all RMID.
+ * @closid: The CLOSID for this entry.
+ * @rmid: The RMID for this entry.
+ * @busy: The number of domains with cached data using this RMID.
+ * @list: Member of the rmid_free_lru list when busy == 0.
+ *
+ * Depending on the architecture the correct monitor is accessed using
+ * both @closid and @rmid, or @rmid only.
+ *
+ * Take the rdtgroup_mutex when accessing.
+ */
+struct rmid_entry {
+ u32 closid;
+ u32 rmid;
+ int busy;
+ struct list_head list;
+};
+
+/*
+ * @rmid_free_lru - A least recently used list of free RMIDs
+ * These RMIDs are guaranteed to have an occupancy less than the
+ * threshold occupancy
+ */
+static LIST_HEAD(rmid_free_lru);
+
+/*
+ * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has.
+ * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined.
+ * Indexed by CLOSID. Protected by rdtgroup_mutex.
+ */
+static u32 *closid_num_dirty_rmid;
+
+/*
+ * @rmid_limbo_count - count of currently unused but (potentially)
+ * dirty RMIDs.
+ * This counts RMIDs that no one is currently using but that
+ * may have a occupancy value > resctrl_rmid_realloc_threshold. User can
+ * change the threshold occupancy value.
+ */
+static unsigned int rmid_limbo_count;
+
+/*
+ * @rmid_entry - The entry in the limbo and free lists.
+ */
+static struct rmid_entry *rmid_ptrs;
+
+/*
+ * This is the threshold cache occupancy in bytes at which we will consider an
+ * RMID available for re-allocation.
+ */
+unsigned int resctrl_rmid_realloc_threshold;
+
+/*
+ * This is the maximum value for the reallocation threshold, in bytes.
+ */
+unsigned int resctrl_rmid_realloc_limit;
+
+/*
+ * x86 and arm64 differ in their handling of monitoring.
+ * x86's RMID are independent numbers, there is only one source of traffic
+ * with an RMID value of '1'.
+ * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of
+ * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID
+ * value is no longer unique.
+ * To account for this, resctrl uses an index. On x86 this is just the RMID,
+ * on arm64 it encodes the CLOSID and RMID. This gives a unique number.
+ *
+ * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code
+ * must accept an attempt to read every index.
+ */
+static inline struct rmid_entry *__rmid_entry(u32 idx)
+{
+ struct rmid_entry *entry;
+ u32 closid, rmid;
+
+ entry = &rmid_ptrs[idx];
+ resctrl_arch_rmid_idx_decode(idx, &closid, &rmid);
+
+ WARN_ON_ONCE(entry->closid != closid);
+ WARN_ON_ONCE(entry->rmid != rmid);
+
+ return entry;
+}
+
+static void limbo_release_entry(struct rmid_entry *entry)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ rmid_limbo_count--;
+ list_add_tail(&entry->list, &rmid_free_lru);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ closid_num_dirty_rmid[entry->closid]--;
+}
+
+/*
+ * Check the RMIDs that are marked as busy for this domain. If the
+ * reported LLC occupancy is below the threshold clear the busy bit and
+ * decrement the count. If the busy count gets to zero on an RMID, we
+ * free the RMID
+ */
+void __check_limbo(struct rdt_mon_domain *d, bool force_free)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ struct rmid_entry *entry;
+ u32 idx, cur_idx = 1;
+ void *arch_mon_ctx;
+ bool rmid_dirty;
+ u64 val = 0;
+
+ arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID);
+ if (IS_ERR(arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(arch_mon_ctx));
+ return;
+ }
+
+ /*
+ * Skip RMID 0 and start from RMID 1 and check all the RMIDs that
+ * are marked as busy for occupancy < threshold. If the occupancy
+ * is less than the threshold decrement the busy counter of the
+ * RMID and move it to the free list when the counter reaches 0.
+ */
+ for (;;) {
+ idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx);
+ if (idx >= idx_limit)
+ break;
+
+ entry = __rmid_entry(idx);
+ if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID, &val,
+ arch_mon_ctx)) {
+ rmid_dirty = true;
+ } else {
+ rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+
+ /*
+ * x86's CLOSID and RMID are independent numbers, so the entry's
+ * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the
+ * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't
+ * used to select the configuration. It is thus necessary to track both
+ * CLOSID and RMID because there may be dependencies between them
+ * on some architectures.
+ */
+ trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val);
+ }
+
+ if (force_free || !rmid_dirty) {
+ clear_bit(idx, d->rmid_busy_llc);
+ if (!--entry->busy)
+ limbo_release_entry(entry);
+ }
+ cur_idx = idx + 1;
+ }
+
+ resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx);
+}
+
+bool has_busy_rmid(struct rdt_mon_domain *d)
+{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+
+ return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit;
+}
+
+static struct rmid_entry *resctrl_find_free_rmid(u32 closid)
+{
+ struct rmid_entry *itr;
+ u32 itr_idx, cmp_idx;
+
+ if (list_empty(&rmid_free_lru))
+ return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC);
+
+ list_for_each_entry(itr, &rmid_free_lru, list) {
+ /*
+ * Get the index of this free RMID, and the index it would need
+ * to be if it were used with this CLOSID.
+ * If the CLOSID is irrelevant on this architecture, the two
+ * index values are always the same on every entry and thus the
+ * very first entry will be returned.
+ */
+ itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid);
+ cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid);
+
+ if (itr_idx == cmp_idx)
+ return itr;
+ }
+
+ return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated
+ * RMID are clean, or the CLOSID that has
+ * the most clean RMID.
+ *
+ * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID
+ * may not be able to allocate clean RMID. To avoid this the allocator will
+ * choose the CLOSID with the most clean RMID.
+ *
+ * When the CLOSID and RMID are independent numbers, the first free CLOSID will
+ * be returned.
+ */
+int resctrl_find_cleanest_closid(void)
+{
+ u32 cleanest_closid = ~0;
+ int i = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ return -EIO;
+
+ for (i = 0; i < closids_supported(); i++) {
+ int num_dirty;
+
+ if (closid_allocated(i))
+ continue;
+
+ num_dirty = closid_num_dirty_rmid[i];
+ if (num_dirty == 0)
+ return i;
+
+ if (cleanest_closid == ~0)
+ cleanest_closid = i;
+
+ if (num_dirty < closid_num_dirty_rmid[cleanest_closid])
+ cleanest_closid = i;
+ }
+
+ if (cleanest_closid == ~0)
+ return -ENOSPC;
+
+ return cleanest_closid;
+}
+
+/*
+ * For MPAM the RMID value is not unique, and has to be considered with
+ * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which
+ * allows all domains to be managed by a single free list.
+ * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler.
+ */
+int alloc_rmid(u32 closid)
+{
+ struct rmid_entry *entry;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ entry = resctrl_find_free_rmid(closid);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
+
+ list_del(&entry->list);
+ return entry->rmid;
+}
+
+static void add_rmid_to_limbo(struct rmid_entry *entry)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_mon_domain *d;
+ u32 idx;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid);
+
+ entry->busy = 0;
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ /*
+ * For the first limbo RMID in the domain,
+ * setup up the limbo worker.
+ */
+ if (!has_busy_rmid(d))
+ cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
+ set_bit(idx, d->rmid_busy_llc);
+ entry->busy++;
+ }
+
+ rmid_limbo_count++;
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID))
+ closid_num_dirty_rmid[entry->closid]++;
+}
+
+void free_rmid(u32 closid, u32 rmid)
+{
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+ struct rmid_entry *entry;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ /*
+ * Do not allow the default rmid to be free'd. Comparing by index
+ * allows architectures that ignore the closid parameter to avoid an
+ * unnecessary check.
+ */
+ if (!resctrl_arch_mon_capable() ||
+ idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID))
+ return;
+
+ entry = __rmid_entry(idx);
+
+ if (resctrl_arch_is_llc_occupancy_enabled())
+ add_rmid_to_limbo(entry);
+ else
+ list_add_tail(&entry->list, &rmid_free_lru);
+}
+
+static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid,
+ u32 rmid, enum resctrl_event_id evtid)
+{
+ u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid);
+
+ switch (evtid) {
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return &d->mbm_total[idx];
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return &d->mbm_local[idx];
+ default:
+ return NULL;
+ }
+}
+
+static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr)
+{
+ int cpu = smp_processor_id();
+ struct rdt_mon_domain *d;
+ struct cacheinfo *ci;
+ struct mbm_state *m;
+ int err, ret;
+ u64 tval = 0;
+
+ if (rr->first) {
+ resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid);
+ m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+ if (m)
+ memset(m, 0, sizeof(struct mbm_state));
+ return 0;
+ }
+
+ if (rr->d) {
+ /* Reading a single domain, must be on a CPU in that domain. */
+ if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask))
+ return -EINVAL;
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (rr->err)
+ return rr->err;
+
+ rr->val += tval;
+
+ return 0;
+ }
+
+ /* Summing domains that share a cache, must be on a CPU for that cache. */
+ ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+ if (!ci || ci->id != rr->ci_id)
+ return -EINVAL;
+
+ /*
+ * Legacy files must report the sum of an event across all
+ * domains that share the same L3 cache instance.
+ * Report success if a read from any domain succeeds, -EINVAL
+ * (translated to "Unavailable" for user space) if reading from
+ * all domains fail for any reason.
+ */
+ ret = -EINVAL;
+ list_for_each_entry(d, &rr->r->mon_domains, hdr.list) {
+ if (d->ci_id != rr->ci_id)
+ continue;
+ err = resctrl_arch_rmid_read(rr->r, d, closid, rmid,
+ rr->evtid, &tval, rr->arch_mon_ctx);
+ if (!err) {
+ rr->val += tval;
+ ret = 0;
+ }
+ }
+
+ if (ret)
+ rr->err = ret;
+
+ return ret;
+}
+
+/*
+ * mbm_bw_count() - Update bw count from values previously read by
+ * __mon_event_count().
+ * @closid: The closid used to identify the cached mbm_state.
+ * @rmid: The rmid used to identify the cached mbm_state.
+ * @rr: The struct rmid_read populated by __mon_event_count().
+ *
+ * Supporting function to calculate the memory bandwidth
+ * and delta bandwidth in MBps. The chunks value previously read by
+ * __mon_event_count() is compared with the chunks value from the previous
+ * invocation. This must be called once per second to maintain values in MBps.
+ */
+static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
+{
+ u64 cur_bw, bytes, cur_bytes;
+ struct mbm_state *m;
+
+ m = get_mbm_state(rr->d, closid, rmid, rr->evtid);
+ if (WARN_ON_ONCE(!m))
+ return;
+
+ cur_bytes = rr->val;
+ bytes = cur_bytes - m->prev_bw_bytes;
+ m->prev_bw_bytes = cur_bytes;
+
+ cur_bw = bytes / SZ_1M;
+
+ m->prev_bw = cur_bw;
+}
+
+/*
+ * This is scheduled by mon_event_read() to read the CQM/MBM counters
+ * on a domain.
+ */
+void mon_event_count(void *info)
+{
+ struct rdtgroup *rdtgrp, *entry;
+ struct rmid_read *rr = info;
+ struct list_head *head;
+ int ret;
+
+ rdtgrp = rr->rgrp;
+
+ ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
+
+ /*
+ * For Ctrl groups read data from child monitor groups and
+ * add them together. Count events which are read successfully.
+ * Discard the rmid_read's reporting errors.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+
+ if (rdtgrp->type == RDTCTRL_GROUP) {
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ if (__mon_event_count(entry->closid, entry->mon.rmid,
+ rr) == 0)
+ ret = 0;
+ }
+ }
+
+ /*
+ * __mon_event_count() calls for newly created monitor groups may
+ * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
+ * Discard error if any of the monitor event reads succeeded.
+ */
+ if (ret == 0)
+ rr->err = 0;
+}
+
+static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
+/*
+ * Feedback loop for MBA software controller (mba_sc)
+ *
+ * mba_sc is a feedback loop where we periodically read MBM counters and
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
+ * that:
+ *
+ * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
+ *
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
+ * fact that resctrl rdtgroups have both monitoring and control.
+ *
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
+ *
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
+ * the L2 <-> L3 traffic.
+ *
+ * Since MBA controls the L2 external bandwidth where as MBM measures the
+ * L3 external bandwidth the following sequence could lead to such a
+ * situation.
+ *
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
+ * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
+ *
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
+ * throttle MSRs already have low percentage values. To avoid
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
+ */
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm)
+{
+ u32 closid, rmid, cur_msr_val, new_msr_val;
+ struct mbm_state *pmbm_data, *cmbm_data;
+ struct rdt_ctrl_domain *dom_mba;
+ enum resctrl_event_id evt_id;
+ struct rdt_resource *r_mba;
+ struct list_head *head;
+ struct rdtgroup *entry;
+ u32 cur_bw, user_bw;
+
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ evt_id = rgrp->mba_mbps_event;
+
+ closid = rgrp->closid;
+ rmid = rgrp->mon.rmid;
+ pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id);
+ if (WARN_ON_ONCE(!pmbm_data))
+ return;
+
+ dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba);
+ if (!dom_mba) {
+ pr_warn_once("Failure to get domain for MBA update\n");
+ return;
+ }
+
+ cur_bw = pmbm_data->prev_bw;
+ user_bw = dom_mba->mbps_val[closid];
+
+ /* MBA resource doesn't support CDP */
+ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
+
+ /*
+ * For Ctrl groups read data from child monitor groups.
+ */
+ head = &rgrp->mon.crdtgrp_list;
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
+ cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id);
+ if (WARN_ON_ONCE(!cmbm_data))
+ return;
+ cur_bw += cmbm_data->prev_bw;
+ }
+
+ /*
+ * Scale up/down the bandwidth linearly for the ctrl group. The
+ * bandwidth step is the bandwidth granularity specified by the
+ * hardware.
+ * Always increase throttling if current bandwidth is above the
+ * target set by user.
+ * But avoid thrashing up and down on every poll by checking
+ * whether a decrease in throttling is likely to push the group
+ * back over target. E.g. if currently throttling to 30% of bandwidth
+ * on a system with 10% granularity steps, check whether moving to
+ * 40% would go past the limit by multiplying current bandwidth by
+ * "(30 + 10) / 30".
+ */
+ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
+ new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
+ } else if (cur_msr_val < MAX_MBA_BW &&
+ (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) {
+ new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
+ } else {
+ return;
+ }
+
+ resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
+}
+
+static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid, enum resctrl_event_id evtid)
+{
+ struct rmid_read rr = {0};
+
+ rr.r = r;
+ rr.d = d;
+ rr.evtid = evtid;
+ rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid);
+ if (IS_ERR(rr.arch_mon_ctx)) {
+ pr_warn_ratelimited("Failed to allocate monitor context: %ld",
+ PTR_ERR(rr.arch_mon_ctx));
+ return;
+ }
+
+ __mon_event_count(closid, rmid, &rr);
+
+ /*
+ * If the software controller is enabled, compute the
+ * bandwidth for this event id.
+ */
+ if (is_mba_sc(NULL))
+ mbm_bw_count(closid, rmid, &rr);
+
+ resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx);
+}
+
+static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d,
+ u32 closid, u32 rmid)
+{
+ /*
+ * This is protected from concurrent reads from user as both
+ * the user and overflow handler hold the global mutex.
+ */
+ if (resctrl_arch_is_mbm_total_enabled())
+ mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ if (resctrl_arch_is_mbm_local_enabled())
+ mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/*
+ * Handler to scan the limbo list and move the RMIDs
+ * to free list whose occupancy < threshold_occupancy.
+ */
+void cqm_handle_limbo(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
+ struct rdt_mon_domain *d;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ d = container_of(work, struct rdt_mon_domain, cqm_limbo.work);
+
+ __check_limbo(d, false);
+
+ if (has_busy_rmid(d)) {
+ d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+ RESCTRL_PICK_ANY_CPU);
+ schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo,
+ delay);
+ }
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+}
+
+/**
+ * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this
+ * domain.
+ * @dom: The domain the limbo handler should run for.
+ * @delay_ms: How far in the future the handler should run.
+ * @exclude_cpu: Which CPU the handler should not run on,
+ * RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+ int exclude_cpu)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ int cpu;
+
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
+ dom->cqm_work_cpu = cpu;
+
+ if (cpu < nr_cpu_ids)
+ schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
+}
+
+void mbm_handle_overflow(struct work_struct *work)
+{
+ unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
+ struct rdtgroup *prgrp, *crgrp;
+ struct rdt_mon_domain *d;
+ struct list_head *head;
+ struct rdt_resource *r;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ /*
+ * If the filesystem has been unmounted this work no longer needs to
+ * run.
+ */
+ if (!resctrl_mounted || !resctrl_arch_mon_capable())
+ goto out_unlock;
+
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ d = container_of(work, struct rdt_mon_domain, mbm_over.work);
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ mbm_update(r, d, prgrp->closid, prgrp->mon.rmid);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list)
+ mbm_update(r, d, crgrp->closid, crgrp->mon.rmid);
+
+ if (is_mba_sc(NULL))
+ update_mba_bw(prgrp, d);
+ }
+
+ /*
+ * Re-check for housekeeping CPUs. This allows the overflow handler to
+ * move off a nohz_full CPU quickly.
+ */
+ d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask,
+ RESCTRL_PICK_ANY_CPU);
+ schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+}
+
+/**
+ * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this
+ * domain.
+ * @dom: The domain the overflow handler should run for.
+ * @delay_ms: How far in the future the handler should run.
+ * @exclude_cpu: Which CPU the handler should not run on,
+ * RESCTRL_PICK_ANY_CPU to pick any CPU.
+ */
+void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms,
+ int exclude_cpu)
+{
+ unsigned long delay = msecs_to_jiffies(delay_ms);
+ int cpu;
+
+ /*
+ * When a domain comes online there is no guarantee the filesystem is
+ * mounted. If not, there is no need to catch counter overflow.
+ */
+ if (!resctrl_mounted || !resctrl_arch_mon_capable())
+ return;
+ cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu);
+ dom->mbm_work_cpu = cpu;
+
+ if (cpu < nr_cpu_ids)
+ schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
+}
+
+static int dom_data_init(struct rdt_resource *r)
+{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ u32 num_closid = resctrl_arch_get_num_closid(r);
+ struct rmid_entry *entry = NULL;
+ int err = 0, i;
+ u32 idx;
+
+ mutex_lock(&rdtgroup_mutex);
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ u32 *tmp;
+
+ /*
+ * If the architecture hasn't provided a sanitised value here,
+ * this may result in larger arrays than necessary. Resctrl will
+ * use a smaller system wide value based on the resources in
+ * use.
+ */
+ tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ closid_num_dirty_rmid = tmp;
+ }
+
+ rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL);
+ if (!rmid_ptrs) {
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ kfree(closid_num_dirty_rmid);
+ closid_num_dirty_rmid = NULL;
+ }
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+
+ for (i = 0; i < idx_limit; i++) {
+ entry = &rmid_ptrs[i];
+ INIT_LIST_HEAD(&entry->list);
+
+ resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid);
+ list_add_tail(&entry->list, &rmid_free_lru);
+ }
+
+ /*
+ * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and
+ * are always allocated. These are used for the rdtgroup_default
+ * control group, which will be setup later in resctrl_init().
+ */
+ idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID);
+ entry = __rmid_entry(idx);
+ list_del(&entry->list);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+static void dom_data_exit(struct rdt_resource *r)
+{
+ mutex_lock(&rdtgroup_mutex);
+
+ if (!r->mon_capable)
+ goto out_unlock;
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) {
+ kfree(closid_num_dirty_rmid);
+ closid_num_dirty_rmid = NULL;
+ }
+
+ kfree(rmid_ptrs);
+ rmid_ptrs = NULL;
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct mon_evt llc_occupancy_event = {
+ .name = "llc_occupancy",
+ .evtid = QOS_L3_OCCUP_EVENT_ID,
+};
+
+static struct mon_evt mbm_total_event = {
+ .name = "mbm_total_bytes",
+ .evtid = QOS_L3_MBM_TOTAL_EVENT_ID,
+};
+
+static struct mon_evt mbm_local_event = {
+ .name = "mbm_local_bytes",
+ .evtid = QOS_L3_MBM_LOCAL_EVENT_ID,
+};
+
+/*
+ * Initialize the event list for the resource.
+ *
+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
+ * because as per the SDM the total and local memory bandwidth
+ * are enumerated as part of L3 monitoring.
+ */
+static void l3_mon_evt_init(struct rdt_resource *r)
+{
+ INIT_LIST_HEAD(&r->evt_list);
+
+ if (resctrl_arch_is_llc_occupancy_enabled())
+ list_add_tail(&llc_occupancy_event.list, &r->evt_list);
+ if (resctrl_arch_is_mbm_total_enabled())
+ list_add_tail(&mbm_total_event.list, &r->evt_list);
+ if (resctrl_arch_is_mbm_local_enabled())
+ list_add_tail(&mbm_local_event.list, &r->evt_list);
+}
+
+/**
+ * resctrl_mon_resource_init() - Initialise global monitoring structures.
+ *
+ * Allocate and initialise global monitor resources that do not belong to a
+ * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists.
+ * Called once during boot after the struct rdt_resource's have been configured
+ * but before the filesystem is mounted.
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+int resctrl_mon_resource_init(void)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ int ret;
+
+ if (!r->mon_capable)
+ return 0;
+
+ ret = dom_data_init(r);
+ if (ret)
+ return ret;
+
+ l3_mon_evt_init(r);
+
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+ mbm_total_event.configurable = true;
+ resctrl_file_fflags_init("mbm_total_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+ if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+ mbm_local_event.configurable = true;
+ resctrl_file_fflags_init("mbm_local_bytes_config",
+ RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+ }
+
+ if (resctrl_arch_is_mbm_local_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+ else if (resctrl_arch_is_mbm_total_enabled())
+ mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+ return 0;
+}
+
+void resctrl_mon_resource_exit(void)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+ dom_data_exit(r);
+}
diff --git a/fs/resctrl/monitor_trace.h b/fs/resctrl/monitor_trace.h
new file mode 100644
index 000000000000..fdf49f22576a
--- /dev/null
+++ b/fs/resctrl/monitor_trace.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _FS_RESCTRL_MONITOR_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mon_llc_occupancy_limbo,
+ TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+ TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+ TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+ __field(u32, mon_hw_id)
+ __field(int, domain_id)
+ __field(u64, llc_occupancy_bytes)),
+ TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+ __entry->mon_hw_id = mon_hw_id;
+ __entry->domain_id = domain_id;
+ __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+ TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+ __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+ __entry->llc_occupancy_bytes)
+ );
+
+#endif /* _FS_RESCTRL_MONITOR_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE monitor_trace
+
+#include <trace/define_trace.h>
diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..87bbc2605de1
--- /dev/null
+++ b/fs/resctrl/pseudo_lock.c
@@ -0,0 +1,1101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/mman.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * Major number assigned to and shared by all devices exposing
+ * pseudo-locked regions.
+ */
+static unsigned int pseudo_lock_major;
+
+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
+
+static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
+{
+ const struct rdtgroup *rdtgrp;
+
+ rdtgrp = dev_get_drvdata(dev);
+ if (mode)
+ *mode = 0600;
+ guard(mutex)(&rdtgroup_mutex);
+ return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
+}
+
+static const struct class pseudo_lock_class = {
+ .name = "pseudo_lock",
+ .devnode = pseudo_lock_devnode,
+};
+
+/**
+ * pseudo_lock_minor_get - Obtain available minor number
+ * @minor: Pointer to where new minor number will be stored
+ *
+ * A bitmask is used to track available minor numbers. Here the next free
+ * minor number is marked as unavailable and returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+ unsigned long first_bit;
+
+ first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+
+ if (first_bit == MINORBITS)
+ return -ENOSPC;
+
+ __clear_bit(first_bit, &pseudo_lock_minor_avail);
+ *minor = first_bit;
+
+ return 0;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to available
+ * @minor: The minor number made available
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+ __set_bit(minor, &pseudo_lock_minor_avail);
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region it belongs.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ * region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+ struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
+
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
+ rdtgrp_match = rdtgrp;
+ break;
+ }
+ }
+ return rdtgrp_match;
+}
+
+/**
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
+ * @list: Entry within the @pm_reqs list for a pseudo-locked region
+ * @req: PM QoS request
+ */
+struct pseudo_lock_pm_req {
+ struct list_head list;
+ struct dev_pm_qos_request req;
+};
+
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+ struct pseudo_lock_pm_req *pm_req, *next;
+
+ list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+ dev_pm_qos_remove_request(&pm_req->req);
+ list_del(&pm_req->list);
+ kfree(pm_req);
+ }
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
+ *
+ * To prevent the cache from being affected by power management entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In this case the latency
+ * requirement needs to prevent entering C2 also.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+ struct pseudo_lock_pm_req *pm_req;
+ int cpu;
+ int ret;
+
+ for_each_cpu(cpu, &plr->d->hdr.cpu_mask) {
+ pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+ if (!pm_req) {
+ rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n");
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+ &pm_req->req,
+ DEV_PM_QOS_RESUME_LATENCY,
+ 30);
+ if (ret < 0) {
+ rdt_last_cmd_printf("Failed to add latency req CPU%d\n",
+ cpu);
+ kfree(pm_req);
+ ret = -1;
+ goto out_err;
+ }
+ list_add(&pm_req->list, &plr->pm_reqs);
+ }
+
+ return 0;
+
+out_err:
+ pseudo_lock_cstates_relax(plr);
+ return ret;
+}
+
+/**
+ * pseudo_lock_region_clear - Reset pseudo-lock region data
+ * @plr: pseudo-lock region
+ *
+ * All content of the pseudo-locked region is reset - any memory allocated
+ * freed.
+ *
+ * Return: void
+ */
+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
+{
+ plr->size = 0;
+ plr->line_size = 0;
+ kfree(plr->kmem);
+ plr->kmem = NULL;
+ plr->s = NULL;
+ if (plr->d)
+ plr->d->plr = NULL;
+ plr->d = NULL;
+ plr->cbm = 0;
+ plr->debugfs_dir = NULL;
+}
+
+/**
+ * pseudo_lock_region_init - Initialize pseudo-lock region information
+ * @plr: pseudo-lock region
+ *
+ * Called after user provided a schemata to be pseudo-locked. From the
+ * schemata the &struct pseudo_lock_region is on entry already initialized
+ * with the resource, domain, and capacity bitmask. Here the information
+ * required for pseudo-locking is deduced from this data and &struct
+ * pseudo_lock_region initialized further. This information includes:
+ * - size in bytes of the region to be pseudo-locked
+ * - cache line size to know the stride with which data needs to be accessed
+ * to be pseudo-locked
+ * - a cpu associated with the cache instance on which the pseudo-locking
+ * flow can be executed
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
+{
+ enum resctrl_scope scope = plr->s->res->ctrl_scope;
+ struct cacheinfo *ci;
+ int ret;
+
+ if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE))
+ return -ENODEV;
+
+ /* Pick the first cpu we find that is associated with the cache. */
+ plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+
+ if (!cpu_online(plr->cpu)) {
+ rdt_last_cmd_printf("CPU %u associated with cache not online\n",
+ plr->cpu);
+ ret = -ENODEV;
+ goto out_region;
+ }
+
+ ci = get_cpu_cacheinfo_level(plr->cpu, scope);
+ if (ci) {
+ plr->line_size = ci->coherency_line_size;
+ plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm);
+ return 0;
+ }
+
+ ret = -1;
+ rdt_last_cmd_puts("Unable to determine cache line size\n");
+out_region:
+ pseudo_lock_region_clear(plr);
+ return ret;
+}
+
+/**
+ * pseudo_lock_init - Initialize a pseudo-lock region
+ * @rdtgrp: resource group to which new pseudo-locked region will belong
+ *
+ * A pseudo-locked region is associated with a resource group. When this
+ * association is created the pseudo-locked region is initialized. The
+ * details of the pseudo-locked region are not known at this time so only
+ * allocation is done and association established.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr;
+
+ plr = kzalloc(sizeof(*plr), GFP_KERNEL);
+ if (!plr)
+ return -ENOMEM;
+
+ init_waitqueue_head(&plr->lock_thread_wq);
+ INIT_LIST_HEAD(&plr->pm_reqs);
+ rdtgrp->plr = plr;
+ return 0;
+}
+
+/**
+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
+ * @plr: pseudo-lock region
+ *
+ * Initialize the details required to set up the pseudo-locked region and
+ * allocate the contiguous memory that will be pseudo-locked to the cache.
+ *
+ * Return: 0 on success, <0 on failure. Descriptive error will be written
+ * to last_cmd_status buffer.
+ */
+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
+{
+ int ret;
+
+ ret = pseudo_lock_region_init(plr);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * We do not yet support contiguous regions larger than
+ * KMALLOC_MAX_SIZE.
+ */
+ if (plr->size > KMALLOC_MAX_SIZE) {
+ rdt_last_cmd_puts("Requested region exceeds maximum size\n");
+ ret = -E2BIG;
+ goto out_region;
+ }
+
+ plr->kmem = kzalloc(plr->size, GFP_KERNEL);
+ if (!plr->kmem) {
+ rdt_last_cmd_puts("Unable to allocate memory\n");
+ ret = -ENOMEM;
+ goto out_region;
+ }
+
+ ret = 0;
+ goto out;
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * pseudo_lock_free - Free a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-locked region belonged
+ *
+ * The pseudo-locked region's resources have already been released, or not
+ * yet created at this point. Now it can be freed and disassociated from the
+ * resource group.
+ *
+ * Return: void
+ */
+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
+{
+ pseudo_lock_region_clear(rdtgrp->plr);
+ kfree(rdtgrp->plr);
+ rdtgrp->plr = NULL;
+}
+
+/**
+ * rdtgroup_monitor_in_progress - Test if monitoring in progress
+ * @rdtgrp: resource group being queried
+ *
+ * Return: 1 if monitor groups have been created for this resource
+ * group, 0 otherwise.
+ */
+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
+{
+ return !list_empty(&rdtgrp->mon.crdtgrp_list);
+}
+
+/**
+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
+ * @rdtgrp: resource group needing access restricted
+ *
+ * A resource group used for cache pseudo-locking cannot have cpus or tasks
+ * assigned to it. This is communicated to the user by restricting access
+ * to all the files that can be used to make such changes.
+ *
+ * Permissions restored with rdtgroup_locksetup_user_restore()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restriction of access an attempt will be made to restore permissions but
+ * the state of the mode of these files will be uncertain when a failure
+ * occurs.
+ */
+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+ if (ret)
+ goto err_tasks;
+
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+ if (ret)
+ goto err_cpus;
+
+ if (resctrl_arch_mon_capable()) {
+ ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
+ if (ret)
+ goto err_cpus_list;
+ }
+
+ ret = 0;
+ goto out;
+
+err_cpus_list:
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+err_cpus:
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+err_tasks:
+ rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_user_restore - Restore user access to group
+ * @rdtgrp: resource group needing access restored
+ *
+ * Restore all file access previously removed using
+ * rdtgroup_locksetup_user_restrict()
+ *
+ * Return: 0 on success, <0 on failure. If a failure occurs during the
+ * restoration of access an attempt will be made to restrict permissions
+ * again but the state of the mode of these files will be uncertain when
+ * a failure occurs.
+ */
+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777);
+ if (ret)
+ goto err_tasks;
+
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777);
+ if (ret)
+ goto err_cpus;
+
+ if (resctrl_arch_mon_capable()) {
+ ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
+ if (ret)
+ goto err_cpus_list;
+ }
+
+ ret = 0;
+ goto out;
+
+err_cpus_list:
+ rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list");
+err_cpus:
+ rdtgroup_kn_mode_restrict(rdtgrp, "cpus");
+err_tasks:
+ rdtgroup_kn_mode_restrict(rdtgrp, "tasks");
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_enter - Resource group enters locksetup mode
+ * @rdtgrp: resource group requested to enter locksetup mode
+ *
+ * A resource group enters locksetup mode to reflect that it would be used
+ * to represent a pseudo-locked region and is in the process of being set
+ * up to do so. A resource group used for a pseudo-locked region would
+ * lose the closid associated with it so we cannot allow it to have any
+ * tasks or cpus assigned nor permit tasks or cpus to be assigned in the
+ * future. Monitoring of a pseudo-locked region is not allowed either.
+ *
+ * The above and more restrictions on a pseudo-locked region are checked
+ * for and enforced before the resource group enters the locksetup mode.
+ *
+ * Returns: 0 if the resource group successfully entered locksetup mode, <0
+ * on failure. On failure the last_cmd_status buffer is updated with text to
+ * communicate details of failure to the user.
+ */
+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ /*
+ * The default resource group can neither be removed nor lose the
+ * default closid associated with it.
+ */
+ if (rdtgrp == &rdtgroup_default) {
+ rdt_last_cmd_puts("Cannot pseudo-lock default group\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Cache Pseudo-locking not supported when CDP is enabled.
+ *
+ * Some things to consider if you would like to enable this
+ * support (using L3 CDP as example):
+ * - When CDP is enabled two separate resources are exposed,
+ * L3DATA and L3CODE, but they are actually on the same cache.
+ * The implication for pseudo-locking is that if a
+ * pseudo-locked region is created on a domain of one
+ * resource (eg. L3CODE), then a pseudo-locked region cannot
+ * be created on that same domain of the other resource
+ * (eg. L3DATA). This is because the creation of a
+ * pseudo-locked region involves a call to wbinvd that will
+ * affect all cache allocations on particular domain.
+ * - Considering the previous, it may be possible to only
+ * expose one of the CDP resources to pseudo-locking and
+ * hide the other. For example, we could consider to only
+ * expose L3DATA and since the L3 cache is unified it is
+ * still possible to place instructions there are execute it.
+ * - If only one region is exposed to pseudo-locking we should
+ * still keep in mind that availability of a portion of cache
+ * for pseudo-locking should take into account both resources.
+ * Similarly, if a pseudo-locked region is created in one
+ * resource, the portion of cache used by it should be made
+ * unavailable to all future allocations from both resources.
+ */
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
+ resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
+ rdt_last_cmd_puts("CDP enabled\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Not knowing the bits to disable prefetching implies that this
+ * platform does not support Cache Pseudo-Locking.
+ */
+ if (resctrl_arch_get_prefetch_disable_bits() == 0) {
+ rdt_last_cmd_puts("Pseudo-locking not supported\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_monitor_in_progress(rdtgrp)) {
+ rdt_last_cmd_puts("Monitoring in progress\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_tasks_assigned(rdtgrp)) {
+ rdt_last_cmd_puts("Tasks assigned to resource group\n");
+ return -EINVAL;
+ }
+
+ if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+ rdt_last_cmd_puts("CPUs assigned to resource group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+ rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
+ return -EIO;
+ }
+
+ ret = pseudo_lock_init(rdtgrp);
+ if (ret) {
+ rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
+ goto out_release;
+ }
+
+ /*
+ * If this system is capable of monitoring a rmid would have been
+ * allocated when the control group was created. This is not needed
+ * anymore when this group would be used for pseudo-locking. This
+ * is safe to call on platforms not capable of monitoring.
+ */
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+ ret = 0;
+ goto out;
+
+out_release:
+ rdtgroup_locksetup_user_restore(rdtgrp);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_locksetup_exit - resource group exist locksetup mode
+ * @rdtgrp: resource group
+ *
+ * When a resource group exits locksetup mode the earlier restrictions are
+ * lifted.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ if (resctrl_arch_mon_capable()) {
+ ret = alloc_rmid(rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("Out of RMIDs\n");
+ return ret;
+ }
+ rdtgrp->mon.rmid = ret;
+ }
+
+ ret = rdtgroup_locksetup_user_restore(rdtgrp);
+ if (ret) {
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+ return ret;
+ }
+
+ pseudo_lock_free(rdtgrp);
+ return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
+ * @d: RDT domain
+ * @cbm: CBM to test
+ *
+ * @d represents a cache instance and @cbm a capacity bitmask that is
+ * considered for it. Determine if @cbm overlaps with any existing
+ * pseudo-locked region on @d.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with pseudo-locked region on @d, false
+ * otherwise.
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+ unsigned int cbm_len;
+ unsigned long cbm_b;
+
+ if (d->plr) {
+ cbm_len = d->plr->s->res->cache.cbm_len;
+ cbm_b = d->plr->cbm;
+ if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
+ return true;
+ }
+ return false;
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * The setup of a pseudo-locked region affects all cache instances within
+ * the hierarchy of the region. It is thus essential to know if any
+ * pseudo-locked regions exist within a cache hierarchy to prevent any
+ * attempts to create new pseudo-locked regions in the same hierarchy.
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ * if it is not possible to test due to memory allocation issue,
+ * false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+ struct rdt_ctrl_domain *d_i;
+ cpumask_var_t cpu_with_psl;
+ struct rdt_resource *r;
+ bool ret = false;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
+ return true;
+
+ /*
+ * First determine which cpus have pseudo-locked regions
+ * associated with them.
+ */
+ for_each_alloc_capable_rdt_resource(r) {
+ list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
+ if (d_i->plr)
+ cpumask_or(cpu_with_psl, cpu_with_psl,
+ &d_i->hdr.cpu_mask);
+ }
+ }
+
+ /*
+ * Next test if new pseudo-locked region would intersect with
+ * existing region.
+ */
+ if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
+ ret = true;
+
+ free_cpumask_var(cpu_with_psl);
+ return ret;
+}
+
+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
+ *
+ * The measurement of latency to access a pseudo-locked region should be
+ * done from a cpu that is associated with that pseudo-locked region.
+ * Determine which cpu is associated with this region and start a thread on
+ * that cpu to perform the measurement, wait for that thread to complete.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int cpu;
+ int ret = -1;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!plr->d) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ plr->thread_done = 0;
+ cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+ if (!cpu_online(cpu)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ plr->cpu = cpu;
+
+ if (sel == 1)
+ thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
+ plr, cpu, "pseudo_lock_measure/%u");
+ else if (sel == 2)
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
+ else if (sel == 3)
+ thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
+ plr, cpu, "pseudo_lock_measure/%u");
+ else
+ goto out;
+
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ goto out;
+ }
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct rdtgroup *rdtgrp = file->private_data;
+ size_t buf_size;
+ char buf[32];
+ int ret;
+ int sel;
+
+ buf_size = min(count, (sizeof(buf) - 1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ buf[buf_size] = '\0';
+ ret = kstrtoint(buf, 10, &sel);
+ if (ret == 0) {
+ if (sel != 1 && sel != 2 && sel != 3)
+ return -EINVAL;
+ ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+ if (ret == 0)
+ ret = count;
+ }
+
+ return ret;
+}
+
+static const struct file_operations pseudo_measure_fops = {
+ .write = pseudo_lock_measure_trigger,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared from all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.
+ */
+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+ struct task_struct *thread;
+ unsigned int new_minor;
+ struct device *dev;
+ char *kn_name __free(kfree) = NULL;
+ int ret;
+
+ ret = pseudo_lock_region_alloc(plr);
+ if (ret < 0)
+ return ret;
+
+ ret = pseudo_lock_cstates_constrain(plr);
+ if (ret < 0) {
+ ret = -EINVAL;
+ goto out_region;
+ }
+ kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL);
+ if (!kn_name) {
+ ret = -ENOMEM;
+ goto out_cstates;
+ }
+
+ plr->thread_done = 0;
+
+ thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr,
+ plr->cpu, "pseudo_lock/%u");
+ if (IS_ERR(thread)) {
+ ret = PTR_ERR(thread);
+ rdt_last_cmd_printf("Locking thread returned error %d\n", ret);
+ goto out_cstates;
+ }
+
+ ret = wait_event_interruptible(plr->lock_thread_wq,
+ plr->thread_done == 1);
+ if (ret < 0) {
+ /*
+ * If the thread does not get on the CPU for whatever
+ * reason and the process which sets up the region is
+ * interrupted then this will leave the thread in runnable
+ * state and once it gets on the CPU it will dereference
+ * the cleared, but not freed, plr struct resulting in an
+ * empty pseudo-locking loop.
+ */
+ rdt_last_cmd_puts("Locking thread interrupted\n");
+ goto out_cstates;
+ }
+
+ ret = pseudo_lock_minor_get(&new_minor);
+ if (ret < 0) {
+ rdt_last_cmd_puts("Unable to obtain a new minor number\n");
+ goto out_cstates;
+ }
+
+ /*
+ * Unlock access but do not release the reference. The
+ * pseudo-locked region will still be here on return.
+ *
+ * The mutex has to be released temporarily to avoid a potential
+ * deadlock with the mm->mmap_lock which is obtained in the
+ * device_create() and debugfs_create_dir() callpath below as well as
+ * before the mmap() callback is called.
+ */
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
+ plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl);
+ if (!IS_ERR_OR_NULL(plr->debugfs_dir))
+ debugfs_create_file("pseudo_lock_measure", 0200,
+ plr->debugfs_dir, rdtgrp,
+ &pseudo_measure_fops);
+ }
+
+ dev = device_create(&pseudo_lock_class, NULL,
+ MKDEV(pseudo_lock_major, new_minor),
+ rdtgrp, "%s", kn_name);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (IS_ERR(dev)) {
+ ret = PTR_ERR(dev);
+ rdt_last_cmd_printf("Failed to create character device: %d\n",
+ ret);
+ goto out_debugfs;
+ }
+
+ /* We released the mutex - check if group was removed while we did so */
+ if (rdtgrp->flags & RDT_DELETED) {
+ ret = -ENODEV;
+ goto out_device;
+ }
+
+ plr->minor = new_minor;
+
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
+ closid_free(rdtgrp->closid);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
+ rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
+
+ ret = 0;
+ goto out;
+
+out_device:
+ device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor));
+out_debugfs:
+ debugfs_remove_recursive(plr->debugfs_dir);
+ pseudo_lock_minor_release(new_minor);
+out_cstates:
+ pseudo_lock_cstates_relax(plr);
+out_region:
+ pseudo_lock_region_clear(plr);
+out:
+ return ret;
+}
+
+/**
+ * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
+ * @rdtgrp: resource group to which the pseudo-locked region belongs
+ *
+ * The removal of a pseudo-locked region can be initiated when the resource
+ * group is removed from user space via a "rmdir" from userspace or the
+ * unmount of the resctrl filesystem. On removal the resource group does
+ * not go back to pseudo-locksetup mode before it is removed, instead it is
+ * removed directly. There is thus asymmetry with the creation where the
+ * &struct pseudo_lock_region is removed here while it was not created in
+ * rdtgroup_pseudo_lock_create().
+ *
+ * Return: void
+ */
+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
+{
+ struct pseudo_lock_region *plr = rdtgrp->plr;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ /*
+ * Default group cannot be a pseudo-locked region so we can
+ * free closid here.
+ */
+ closid_free(rdtgrp->closid);
+ goto free;
+ }
+
+ pseudo_lock_cstates_relax(plr);
+ debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
+ device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
+ pseudo_lock_minor_release(plr->minor);
+
+free:
+ pseudo_lock_free(rdtgrp);
+}
+
+static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
+{
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgrp = region_find_by_minor(iminor(inode));
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ filp->private_data = rdtgrp;
+ atomic_inc(&rdtgrp->waitcount);
+ /* Perform a non-seekable open - llseek is not supported */
+ filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
+{
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+ rdtgrp = filp->private_data;
+ WARN_ON(!rdtgrp);
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+ filp->private_data = NULL;
+ atomic_dec(&rdtgrp->waitcount);
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
+{
+ /* Not supported */
+ return -EINVAL;
+}
+
+static const struct vm_operations_struct pseudo_mmap_ops = {
+ .mremap = pseudo_lock_dev_mremap,
+};
+
+static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ unsigned long vsize = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct pseudo_lock_region *plr;
+ struct rdtgroup *rdtgrp;
+ unsigned long physical;
+ unsigned long psize;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgrp = filp->private_data;
+ WARN_ON(!rdtgrp);
+ if (!rdtgrp) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ plr = rdtgrp->plr;
+
+ if (!plr->d) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENODEV;
+ }
+
+ /*
+ * Task is required to run with affinity to the cpus associated
+ * with the pseudo-locked region. If this is not the case the task
+ * may be scheduled elsewhere and invalidate entries in the
+ * pseudo-locked region.
+ */
+ if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EINVAL;
+ }
+
+ physical = __pa(plr->kmem) >> PAGE_SHIFT;
+ psize = plr->size - off;
+
+ if (off > plr->size) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENOSPC;
+ }
+
+ /*
+ * Ensure changes are carried directly to the memory being mapped,
+ * do not allow copy-on-write mapping.
+ */
+ if (!(vma->vm_flags & VM_SHARED)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EINVAL;
+ }
+
+ if (vsize > psize) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -ENOSPC;
+ }
+
+ memset(plr->kmem + off, 0, vsize);
+
+ if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
+ vsize, vma->vm_page_prot)) {
+ mutex_unlock(&rdtgroup_mutex);
+ return -EAGAIN;
+ }
+ vma->vm_ops = &pseudo_mmap_ops;
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static const struct file_operations pseudo_lock_dev_fops = {
+ .owner = THIS_MODULE,
+ .read = NULL,
+ .write = NULL,
+ .open = pseudo_lock_dev_open,
+ .release = pseudo_lock_dev_release,
+ .mmap = pseudo_lock_dev_mmap,
+};
+
+int rdt_pseudo_lock_init(void)
+{
+ int ret;
+
+ ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
+ if (ret < 0)
+ return ret;
+
+ pseudo_lock_major = ret;
+
+ ret = class_register(&pseudo_lock_class);
+ if (ret) {
+ unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+ return ret;
+ }
+
+ return 0;
+}
+
+void rdt_pseudo_lock_release(void)
+{
+ class_unregister(&pseudo_lock_class);
+ unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+ pseudo_lock_major = 0;
+}
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
new file mode 100644
index 000000000000..77d08229d855
--- /dev/null
+++ b/fs/resctrl/rdtgroup.c
@@ -0,0 +1,4357 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include "internal.h"
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+static struct kernfs_root *rdt_root;
+
+struct rdtgroup rdtgroup_default;
+
+LIST_HEAD(rdt_all_groups);
+
+/* list of entries for the schemata file */
+LIST_HEAD(resctrl_schema_all);
+
+/*
+ * List of struct mon_data containing private data of event files for use by
+ * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
+ */
+static LIST_HEAD(mon_data_kn_priv_list);
+
+/* The filesystem can only be mounted once. */
+bool resctrl_mounted;
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
+/*
+ * Used to store the max resource name width to display the schemata names in
+ * a tabular format.
+ */
+int max_name_width;
+
+static struct seq_buf last_cmd_status;
+
+static char last_cmd_status_buf[512];
+
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
+
+static void rdtgroup_destroy_root(void);
+
+struct dentry *debugfs_resctrl;
+
+/*
+ * Memory bandwidth monitoring event to use for the default CTRL_MON group
+ * and each new CTRL_MON group created by the user. Only relevant when
+ * the filesystem is mounted with the "mba_MBps" option so it does not
+ * matter that it remains uninitialized on systems that do not support
+ * the "mba_MBps" option.
+ */
+enum resctrl_event_id mba_mbps_default_event;
+
+static bool resctrl_debug;
+
+void rdt_last_cmd_clear(void)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_clear(&last_cmd_status);
+}
+
+void rdt_last_cmd_puts(const char *s)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_puts(&last_cmd_status, s);
+}
+
+void rdt_last_cmd_printf(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ lockdep_assert_held(&rdtgroup_mutex);
+ seq_buf_vprintf(&last_cmd_status, fmt, ap);
+ va_end(ap);
+}
+
+void rdt_staged_configs_clear(void)
+{
+ struct rdt_ctrl_domain *dom;
+ struct rdt_resource *r;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for_each_alloc_capable_rdt_resource(r) {
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list)
+ memset(dom->staged_config, 0, sizeof(dom->staged_config));
+ }
+}
+
+static bool resctrl_is_mbm_enabled(void)
+{
+ return (resctrl_arch_is_mbm_total_enabled() ||
+ resctrl_arch_is_mbm_local_enabled());
+}
+
+static bool resctrl_is_mbm_event(int e)
+{
+ return (e >= QOS_L3_MBM_TOTAL_EVENT_ID &&
+ e <= QOS_L3_MBM_LOCAL_EVENT_ID);
+}
+
+/*
+ * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap
+ * of free CLOSIDs.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set current's closid to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+static unsigned long *closid_free_map;
+
+static int closid_free_map_len;
+
+int closids_supported(void)
+{
+ return closid_free_map_len;
+}
+
+static int closid_init(void)
+{
+ struct resctrl_schema *s;
+ u32 rdt_min_closid = ~0;
+
+ /* Monitor only platforms still call closid_init() */
+ if (list_empty(&resctrl_schema_all))
+ return 0;
+
+ /* Compute rdt_min_closid across all resources */
+ list_for_each_entry(s, &resctrl_schema_all, list)
+ rdt_min_closid = min(rdt_min_closid, s->num_closid);
+
+ closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL);
+ if (!closid_free_map)
+ return -ENOMEM;
+ bitmap_fill(closid_free_map, rdt_min_closid);
+
+ /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
+ __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
+ closid_free_map_len = rdt_min_closid;
+
+ return 0;
+}
+
+static void closid_exit(void)
+{
+ bitmap_free(closid_free_map);
+ closid_free_map = NULL;
+}
+
+static int closid_alloc(void)
+{
+ int cleanest_closid;
+ u32 closid;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
+ resctrl_arch_is_llc_occupancy_enabled()) {
+ cleanest_closid = resctrl_find_cleanest_closid();
+ if (cleanest_closid < 0)
+ return cleanest_closid;
+ closid = cleanest_closid;
+ } else {
+ closid = find_first_bit(closid_free_map, closid_free_map_len);
+ if (closid == closid_free_map_len)
+ return -ENOSPC;
+ }
+ __clear_bit(closid, closid_free_map);
+
+ return closid;
+}
+
+void closid_free(int closid)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ __set_bit(closid, closid_free_map);
+}
+
+/**
+ * closid_allocated - test if provided closid is in use
+ * @closid: closid to be tested
+ *
+ * Return: true if @closid is currently associated with a resource group,
+ * false if @closid is free
+ */
+bool closid_allocated(unsigned int closid)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ return !test_bit(closid, closid_free_map);
+}
+
+/**
+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
+ * @closid: closid if the resource group
+ *
+ * Each resource group is associated with a @closid. Here the mode
+ * of a resource group can be queried by searching for it using its closid.
+ *
+ * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
+ */
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+{
+ struct rdtgroup *rdtgrp;
+
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (rdtgrp->closid == closid)
+ return rdtgrp->mode;
+ }
+
+ return RDT_NUM_MODES;
+}
+
+static const char * const rdt_mode_str[] = {
+ [RDT_MODE_SHAREABLE] = "shareable",
+ [RDT_MODE_EXCLUSIVE] = "exclusive",
+ [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup",
+ [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked",
+};
+
+/**
+ * rdtgroup_mode_str - Return the string representation of mode
+ * @mode: the resource group mode as &enum rdtgroup_mode
+ *
+ * Return: string representation of valid mode, "unknown" otherwise
+ */
+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+{
+ if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
+ return "unknown";
+
+ return rdt_mode_str[mode];
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ 0, rft->kf_ops, rft, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->seq_show)
+ return rft->seq_show(of, m, arg);
+ return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->write)
+ return rft->write(of, buf, nbytes, off);
+
+ return -EINVAL;
+}
+
+static const struct kernfs_ops rdtgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = rdtgroup_file_write,
+ .seq_show = rdtgroup_seqfile_show,
+};
+
+static const struct kernfs_ops kf_mondata_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .seq_show = rdtgroup_mondata_show,
+};
+
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+ struct rftype *rft = of->kn->priv;
+
+ return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct cpumask *mask;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ mask = &rdtgrp->plr->d->hdr.cpu_mask;
+ seq_printf(s, is_cpu_list(of) ?
+ "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(mask));
+ }
+ } else {
+ seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+ cpumask_pr_args(&rdtgrp->cpu_mask));
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids/rmids must have been set up before calling this function.
+ * @r may be NULL.
+ */
+static void
+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
+{
+ struct resctrl_cpu_defaults defaults, *p = NULL;
+
+ if (r) {
+ defaults.closid = r->closid;
+ defaults.rmid = r->mon.rmid;
+ p = &defaults;
+ }
+
+ on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1);
+}
+
+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus belong to parent ctrl group */
+ cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
+ if (!cpumask_empty(tmpmask)) {
+ rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n");
+ return -EINVAL;
+ }
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (!cpumask_empty(tmpmask)) {
+ /* Give any dropped cpus to parent rdtgroup */
+ cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, prgrp);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu rmid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (!cpumask_empty(tmpmask)) {
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ if (crgrp == rdtgrp)
+ continue;
+ cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
+ tmpmask);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ return 0;
+}
+
+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
+{
+ struct rdtgroup *crgrp;
+
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
+ /* update the child mon group masks as well*/
+ list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
+ cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
+}
+
+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
+ cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
+{
+ struct rdtgroup *r, *crgrp;
+ struct list_head *head;
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (!cpumask_empty(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default) {
+ rdt_last_cmd_puts("Can't drop CPUs from default group\n");
+ return -EINVAL;
+ }
+
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ update_closid_rmid(tmpmask, &rdtgroup_default);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group and
+ * the prev group's child groups that owned them
+ * and update per-cpu closid/rmid.
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (!cpumask_empty(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
+ if (!cpumask_empty(tmpmask1))
+ cpumask_rdtgrp_clear(r, tmpmask1);
+ }
+ update_closid_rmid(tmpmask, rdtgrp);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+ /*
+ * Clear child mon group masks since there is a new parent mask
+ * now and update the rmid for the cpus the child lost.
+ */
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
+ update_closid_rmid(tmpmask, rdtgrp);
+ cpumask_clear(&crgrp->cpu_mask);
+ }
+
+ return 0;
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ cpumask_var_t tmpmask, newmask, tmpmask1;
+ struct rdtgroup *rdtgrp;
+ int ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ return -ENOMEM;
+ }
+ if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ return -ENOMEM;
+ }
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ rdt_last_cmd_clear();
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Pseudo-locking in progress\n");
+ goto unlock;
+ }
+
+ if (is_cpu_list(of))
+ ret = cpulist_parse(buf, newmask);
+ else
+ ret = cpumask_parse(buf, newmask);
+
+ if (ret) {
+ rdt_last_cmd_puts("Bad CPU list/mask\n");
+ goto unlock;
+ }
+
+ /* check that user didn't specify any offline cpus */
+ cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+ if (!cpumask_empty(tmpmask)) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Can only assign online CPUs\n");
+ goto unlock;
+ }
+
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
+ else if (rdtgrp->type == RDTMON_GROUP)
+ ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
+ else
+ ret = -EINVAL;
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ free_cpumask_var(tmpmask1);
+
+ return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_remove - the helper to remove resource group safely
+ * @rdtgrp: resource group to remove
+ *
+ * On resource group creation via a mkdir, an extra kernfs_node reference is
+ * taken to ensure that the rdtgroup structure remains accessible for the
+ * rdtgroup_kn_unlock() calls where it is removed.
+ *
+ * Drop the extra reference here, then free the rdtgroup structure.
+ *
+ * Return: void
+ */
+static void rdtgroup_remove(struct rdtgroup *rdtgrp)
+{
+ kernfs_put(rdtgrp->kn);
+ kfree(rdtgrp);
+}
+
+static void _update_task_closid_rmid(void *task)
+{
+ /*
+ * If the task is still current on this CPU, update PQR_ASSOC MSR.
+ * Otherwise, the MSR is updated when the task is scheduled in.
+ */
+ if (task == current)
+ resctrl_arch_sched_in(task);
+}
+
+static void update_task_closid_rmid(struct task_struct *t)
+{
+ if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
+ smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
+ else
+ _update_task_closid_rmid(t);
+}
+
+static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
+{
+ u32 closid, rmid = rdtgrp->mon.rmid;
+
+ if (rdtgrp->type == RDTCTRL_GROUP)
+ closid = rdtgrp->closid;
+ else if (rdtgrp->type == RDTMON_GROUP)
+ closid = rdtgrp->mon.parent->closid;
+ else
+ return false;
+
+ return resctrl_arch_match_closid(tsk, closid) &&
+ resctrl_arch_match_rmid(tsk, closid, rmid);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+ struct rdtgroup *rdtgrp)
+{
+ /* If the task is already in rdtgrp, no need to move the task. */
+ if (task_in_rdtgroup(tsk, rdtgrp))
+ return 0;
+
+ /*
+ * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+ * updated by them.
+ *
+ * For ctrl_mon groups, move both closid and rmid.
+ * For monitor groups, can move the tasks only from
+ * their parent CTRL group.
+ */
+ if (rdtgrp->type == RDTMON_GROUP &&
+ !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
+ rdt_last_cmd_puts("Can't move task to different control group\n");
+ return -EINVAL;
+ }
+
+ if (rdtgrp->type == RDTMON_GROUP)
+ resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
+ rdtgrp->mon.rmid);
+ else
+ resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
+ rdtgrp->mon.rmid);
+
+ /*
+ * Ensure the task's closid and rmid are written before determining if
+ * the task is current that will decide if it will be interrupted.
+ * This pairs with the full barrier between the rq->curr update and
+ * resctrl_arch_sched_in() during context switch.
+ */
+ smp_mb();
+
+ /*
+ * By now, the task's closid and rmid are set. If the task is current
+ * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+ * group go into effect. If the task is not current, the MSR will be
+ * updated when the task is scheduled in.
+ */
+ update_task_closid_rmid(tsk);
+
+ return 0;
+}
+
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
+ resctrl_arch_match_closid(t, r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+ return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
+ resctrl_arch_match_rmid(t, r->mon.parent->closid,
+ r->mon.rmid));
+}
+
+/**
+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
+ * @r: Resource group
+ *
+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
+ */
+int rdtgroup_tasks_assigned(struct rdtgroup *r)
+{
+ struct task_struct *p, *t;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+ ret = 1;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+ struct kernfs_open_file *of)
+{
+ const struct cred *tcred = get_task_cred(task);
+ const struct cred *cred = current_cred();
+ int ret = 0;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid)) {
+ rdt_last_cmd_printf("No permission to move task %d\n", task->pid);
+ ret = -EPERM;
+ }
+
+ put_cred(tcred);
+ return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+ struct kernfs_open_file *of)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ rcu_read_unlock();
+ rdt_last_cmd_printf("No task %d\n", pid);
+ return -ESRCH;
+ }
+ } else {
+ tsk = current;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = rdtgroup_task_write_permission(tsk, of);
+ if (!ret)
+ ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ char *pid_str;
+ int ret = 0;
+ pid_t pid;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+ rdt_last_cmd_clear();
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Pseudo-locking in progress\n");
+ goto unlock;
+ }
+
+ while (buf && buf[0] != '\0' && buf[0] != '\n') {
+ pid_str = strim(strsep(&buf, ","));
+
+ if (kstrtoint(pid_str, 0, &pid)) {
+ rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (pid < 0) {
+ rdt_last_cmd_printf("Invalid pid %d\n", pid);
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+ if (ret) {
+ rdt_last_cmd_printf("Error while processing task %d\n", pid);
+ break;
+ }
+ }
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+ struct task_struct *p, *t;
+ pid_t pid;
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+ pid = task_pid_vnr(t);
+ if (pid)
+ seq_printf(s, "%d\n", pid);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ show_rdt_tasks(rdtgrp, s);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static int rdtgroup_closid_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ seq_printf(s, "%u\n", rdtgrp->closid);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static int rdtgroup_rmid_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ seq_printf(s, "%u\n", rdtgrp->mon.rmid);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+#ifdef CONFIG_PROC_CPU_RESCTRL
+/*
+ * A task can only be part of one resctrl control group and of one monitor
+ * group which is associated to that control group.
+ *
+ * 1) res:
+ * mon:
+ *
+ * resctrl is not available.
+ *
+ * 2) res:/
+ * mon:
+ *
+ * Task is part of the root resctrl control group, and it is not associated
+ * to any monitor group.
+ *
+ * 3) res:/
+ * mon:mon0
+ *
+ * Task is part of the root resctrl control group and monitor group mon0.
+ *
+ * 4) res:group0
+ * mon:
+ *
+ * Task is part of resctrl control group group0, and it is not associated
+ * to any monitor group.
+ *
+ * 5) res:group0
+ * mon:mon1
+ *
+ * Task is part of resctrl control group group0 and monitor group mon1.
+ */
+int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ struct rdtgroup *rdtg;
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Return empty if resctrl has not been mounted. */
+ if (!resctrl_mounted) {
+ seq_puts(s, "res:\nmon:\n");
+ goto unlock;
+ }
+
+ list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
+ struct rdtgroup *crg;
+
+ /*
+ * Task information is only relevant for shareable
+ * and exclusive groups.
+ */
+ if (rdtg->mode != RDT_MODE_SHAREABLE &&
+ rdtg->mode != RDT_MODE_EXCLUSIVE)
+ continue;
+
+ if (!resctrl_arch_match_closid(tsk, rdtg->closid))
+ continue;
+
+ seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
+ rdt_kn_name(rdtg->kn));
+ seq_puts(s, "mon:");
+ list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
+ mon.crdtgrp_list) {
+ if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
+ crg->mon.rmid))
+ continue;
+ seq_printf(s, "%s", rdt_kn_name(crg->kn));
+ break;
+ }
+ seq_putc(s, '\n');
+ goto unlock;
+ }
+ /*
+ * The above search should succeed. Otherwise return
+ * with an error.
+ */
+ ret = -ENOENT;
+unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+#endif
+
+static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ int len;
+
+ mutex_lock(&rdtgroup_mutex);
+ len = seq_buf_used(&last_cmd_status);
+ if (len)
+ seq_printf(seq, "%.*s", len, last_cmd_status_buf);
+ else
+ seq_puts(seq, "ok\n");
+ mutex_unlock(&rdtgroup_mutex);
+ return 0;
+}
+
+static void *rdt_kn_parent_priv(struct kernfs_node *kn)
+{
+ /*
+ * The parent pointer is only valid within RCU section since it can be
+ * replaced.
+ */
+ guard(rcu)();
+ return rcu_dereference(kn->__parent)->priv;
+}
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+
+ seq_printf(seq, "%u\n", s->num_closid);
+ return 0;
+}
+
+static int rdt_default_ctrl_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r));
+ return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->cache.min_cbm_bits);
+ return 0;
+}
+
+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%x\n", r->cache.shareable_bits);
+ return 0;
+}
+
+/*
+ * rdt_bit_usage_show - Display current usage of resources
+ *
+ * A domain is a shared resource that can now be allocated differently. Here
+ * we display the current regions of the domain as an annotated bitmask.
+ * For each domain of this resource its allocation bitmask
+ * is annotated as below to indicate the current usage of the corresponding bit:
+ * 0 - currently unused
+ * X - currently available for sharing and used by software and hardware
+ * H - currently used by hardware only but available for software use
+ * S - currently used and shareable by software only
+ * E - currently used exclusively by one resource group
+ * P - currently pseudo-locked by one resource group
+ */
+static int rdt_bit_usage_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ /*
+ * Use unsigned long even though only 32 bits are used to ensure
+ * test_bit() is used safely.
+ */
+ unsigned long sw_shareable = 0, hw_shareable = 0;
+ unsigned long exclusive = 0, pseudo_locked = 0;
+ struct rdt_resource *r = s->res;
+ struct rdt_ctrl_domain *dom;
+ int i, hwb, swb, excl, psl;
+ enum rdtgrp_mode mode;
+ bool sep = false;
+ u32 ctrl_val;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+ hw_shareable = r->cache.shareable_bits;
+ list_for_each_entry(dom, &r->ctrl_domains, hdr.list) {
+ if (sep)
+ seq_putc(seq, ';');
+ sw_shareable = 0;
+ exclusive = 0;
+ seq_printf(seq, "%d=", dom->hdr.id);
+ for (i = 0; i < closids_supported(); i++) {
+ if (!closid_allocated(i))
+ continue;
+ ctrl_val = resctrl_arch_get_config(r, dom, i,
+ s->conf_type);
+ mode = rdtgroup_mode_by_closid(i);
+ switch (mode) {
+ case RDT_MODE_SHAREABLE:
+ sw_shareable |= ctrl_val;
+ break;
+ case RDT_MODE_EXCLUSIVE:
+ exclusive |= ctrl_val;
+ break;
+ case RDT_MODE_PSEUDO_LOCKSETUP:
+ /*
+ * RDT_MODE_PSEUDO_LOCKSETUP is possible
+ * here but not included since the CBM
+ * associated with this CLOSID in this mode
+ * is not initialized and no task or cpu can be
+ * assigned this CLOSID.
+ */
+ break;
+ case RDT_MODE_PSEUDO_LOCKED:
+ case RDT_NUM_MODES:
+ WARN(1,
+ "invalid mode for closid %d\n", i);
+ break;
+ }
+ }
+ for (i = r->cache.cbm_len - 1; i >= 0; i--) {
+ pseudo_locked = dom->plr ? dom->plr->cbm : 0;
+ hwb = test_bit(i, &hw_shareable);
+ swb = test_bit(i, &sw_shareable);
+ excl = test_bit(i, &exclusive);
+ psl = test_bit(i, &pseudo_locked);
+ if (hwb && swb)
+ seq_putc(seq, 'X');
+ else if (hwb && !swb)
+ seq_putc(seq, 'H');
+ else if (!hwb && swb)
+ seq_putc(seq, 'S');
+ else if (excl)
+ seq_putc(seq, 'E');
+ else if (psl)
+ seq_putc(seq, 'P');
+ else /* Unused bits remain */
+ seq_putc(seq, '0');
+ }
+ sep = true;
+ }
+ seq_putc(seq, '\n');
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return 0;
+}
+
+static int rdt_min_bw_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->membw.min_bw);
+ return 0;
+}
+
+static int rdt_num_rmids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+
+ seq_printf(seq, "%d\n", r->num_rmid);
+
+ return 0;
+}
+
+static int rdt_mon_features_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ struct mon_evt *mevt;
+
+ list_for_each_entry(mevt, &r->evt_list, list) {
+ seq_printf(seq, "%s\n", mevt->name);
+ if (mevt->configurable)
+ seq_printf(seq, "%s_config\n", mevt->name);
+ }
+
+ return 0;
+}
+
+static int rdt_bw_gran_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->membw.bw_gran);
+ return 0;
+}
+
+static int rdt_delay_linear_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->membw.delay_linear);
+ return 0;
+}
+
+static int max_threshold_occ_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
+
+ return 0;
+}
+
+static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ switch (r->membw.throttle_mode) {
+ case THREAD_THROTTLE_PER_THREAD:
+ seq_puts(seq, "per-thread\n");
+ return 0;
+ case THREAD_THROTTLE_MAX:
+ seq_puts(seq, "max\n");
+ return 0;
+ case THREAD_THROTTLE_UNDEFINED:
+ seq_puts(seq, "undefined\n");
+ return 0;
+ }
+
+ WARN_ON_ONCE(1);
+
+ return 0;
+}
+
+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ unsigned int bytes;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ if (bytes > resctrl_rmid_realloc_limit)
+ return -EINVAL;
+
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
+
+ return nbytes;
+}
+
+/*
+ * rdtgroup_mode_show - Display mode of this resource group
+ */
+static int rdtgroup_mode_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
+
+ rdtgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type)
+{
+ switch (my_type) {
+ case CDP_CODE:
+ return CDP_DATA;
+ case CDP_DATA:
+ return CDP_CODE;
+ default:
+ case CDP_NONE:
+ return CDP_NONE;
+ }
+}
+
+static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct resctrl_schema *s = rdt_kn_parent_priv(of->kn);
+ struct rdt_resource *r = s->res;
+
+ seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks);
+
+ return 0;
+}
+
+/**
+ * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
+ * @r: Resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @type: CDP type of @r.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Checks if provided @cbm intended to be used for @closid on domain
+ * @d overlaps with any other closids or other hardware usage associated
+ * with this domain. If @exclusive is true then only overlaps with
+ * resource groups in exclusive mode will be considered. If @exclusive
+ * is false then overlaps with any resource group or hardware entities
+ * will be considered.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: false if CBM does not overlap, true if it does.
+ */
+static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ unsigned long cbm, int closid,
+ enum resctrl_conf_type type, bool exclusive)
+{
+ enum rdtgrp_mode mode;
+ unsigned long ctrl_b;
+ int i;
+
+ /* Check for any overlap with regions used by hardware directly */
+ if (!exclusive) {
+ ctrl_b = r->cache.shareable_bits;
+ if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
+ return true;
+ }
+
+ /* Check for overlap with other resource groups */
+ for (i = 0; i < closids_supported(); i++) {
+ ctrl_b = resctrl_arch_get_config(r, d, i, type);
+ mode = rdtgroup_mode_by_closid(i);
+ if (closid_allocated(i) && i != closid &&
+ mode != RDT_MODE_PSEUDO_LOCKSETUP) {
+ if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
+ if (exclusive) {
+ if (mode == RDT_MODE_EXCLUSIVE)
+ return true;
+ continue;
+ }
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/**
+ * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware
+ * @s: Schema for the resource to which domain instance @d belongs.
+ * @d: The domain instance for which @closid is being tested.
+ * @cbm: Capacity bitmask being tested.
+ * @closid: Intended closid for @cbm.
+ * @exclusive: Only check if overlaps with exclusive resource groups
+ *
+ * Resources that can be allocated using a CBM can use the CBM to control
+ * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test
+ * for overlap. Overlap test is not limited to the specific resource for
+ * which the CBM is intended though - when dealing with CDP resources that
+ * share the underlying hardware the overlap check should be performed on
+ * the CDP resource sharing the hardware also.
+ *
+ * Refer to description of __rdtgroup_cbm_overlaps() for the details of the
+ * overlap test.
+ *
+ * Return: true if CBM overlap detected, false if there is no overlap
+ */
+bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d,
+ unsigned long cbm, int closid, bool exclusive)
+{
+ enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
+ struct rdt_resource *r = s->res;
+
+ if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type,
+ exclusive))
+ return true;
+
+ if (!resctrl_arch_get_cdp_enabled(r->rid))
+ return false;
+ return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive);
+}
+
+/**
+ * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
+ * @rdtgrp: Resource group identified through its closid.
+ *
+ * An exclusive resource group implies that there should be no sharing of
+ * its allocated resources. At the time this group is considered to be
+ * exclusive this test can determine if its current schemata supports this
+ * setting by testing for overlap with all other resource groups.
+ *
+ * Return: true if resource group can be exclusive, false if there is overlap
+ * with allocations of other resource groups and thus this resource group
+ * cannot be exclusive.
+ */
+static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
+{
+ int closid = rdtgrp->closid;
+ struct rdt_ctrl_domain *d;
+ struct resctrl_schema *s;
+ struct rdt_resource *r;
+ bool has_cache = false;
+ u32 ctrl;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
+ if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)
+ continue;
+ has_cache = true;
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ ctrl = resctrl_arch_get_config(r, d, closid,
+ s->conf_type);
+ if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) {
+ rdt_last_cmd_puts("Schemata overlaps\n");
+ return false;
+ }
+ }
+ }
+
+ if (!has_cache) {
+ rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * rdtgroup_mode_write - Modify the resource group's mode
+ */
+static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ enum rdtgrp_mode mode;
+ int ret = 0;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ rdt_last_cmd_clear();
+
+ mode = rdtgrp->mode;
+
+ if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
+ (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
+ (!strcmp(buf, "pseudo-locksetup") &&
+ mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
+ (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
+ goto out;
+
+ if (mode == RDT_MODE_PSEUDO_LOCKED) {
+ rdt_last_cmd_puts("Cannot change pseudo-locked group\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!strcmp(buf, "shareable")) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+ } else if (!strcmp(buf, "exclusive")) {
+ if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ ret = rdtgroup_locksetup_exit(rdtgrp);
+ if (ret)
+ goto out;
+ }
+ rdtgrp->mode = RDT_MODE_EXCLUSIVE;
+ } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) &&
+ !strcmp(buf, "pseudo-locksetup")) {
+ ret = rdtgroup_locksetup_enter(rdtgrp);
+ if (ret)
+ goto out;
+ rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
+ } else {
+ rdt_last_cmd_puts("Unknown or unsupported mode\n");
+ ret = -EINVAL;
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+}
+
+/**
+ * rdtgroup_cbm_to_size - Translate CBM to size in bytes
+ * @r: RDT resource to which @d belongs.
+ * @d: RDT domain instance.
+ * @cbm: bitmask for which the size should be computed.
+ *
+ * The bitmask provided associated with the RDT domain instance @d will be
+ * translated into how many bytes it represents. The size in bytes is
+ * computed by first dividing the total cache size by the CBM length to
+ * determine how many bytes each bit in the bitmask represents. The result
+ * is multiplied with the number of bits set in the bitmask.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used to make the
+ * bitmap functions work correctly.
+ */
+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
+ struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+ unsigned int size = 0;
+ struct cacheinfo *ci;
+ int num_b;
+
+ if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE))
+ return size;
+
+ num_b = bitmap_weight(&cbm, r->cache.cbm_len);
+ ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope);
+ if (ci)
+ size = ci->size / r->cache.cbm_len * num_b;
+
+ return size;
+}
+
+bool is_mba_sc(struct rdt_resource *r)
+{
+ if (!r)
+ r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+
+ /*
+ * The software controller support is only applicable to MBA resource.
+ * Make sure to check for resource type.
+ */
+ if (r->rid != RDT_RESOURCE_MBA)
+ return false;
+
+ return r->membw.mba_sc;
+}
+
+/*
+ * rdtgroup_size_show - Display size in bytes of allocated regions
+ *
+ * The "size" file mirrors the layout of the "schemata" file, printing the
+ * size in bytes of each region instead of the capacity bitmask.
+ */
+static int rdtgroup_size_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct resctrl_schema *schema;
+ enum resctrl_conf_type type;
+ struct rdt_ctrl_domain *d;
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ unsigned int size;
+ int ret = 0;
+ u32 closid;
+ bool sep;
+ u32 ctrl;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ if (!rdtgrp->plr->d) {
+ rdt_last_cmd_clear();
+ rdt_last_cmd_puts("Cache domain offline\n");
+ ret = -ENODEV;
+ } else {
+ seq_printf(s, "%*s:", max_name_width,
+ rdtgrp->plr->s->name);
+ size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res,
+ rdtgrp->plr->d,
+ rdtgrp->plr->cbm);
+ seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size);
+ }
+ goto out;
+ }
+
+ closid = rdtgrp->closid;
+
+ list_for_each_entry(schema, &resctrl_schema_all, list) {
+ r = schema->res;
+ type = schema->conf_type;
+ sep = false;
+ seq_printf(s, "%*s:", max_name_width, schema->name);
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ if (sep)
+ seq_putc(s, ';');
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
+ size = 0;
+ } else {
+ if (is_mba_sc(r))
+ ctrl = d->mbps_val[closid];
+ else
+ ctrl = resctrl_arch_get_config(r, d,
+ closid,
+ type);
+ if (r->rid == RDT_RESOURCE_MBA ||
+ r->rid == RDT_RESOURCE_SMBA)
+ size = ctrl;
+ else
+ size = rdtgroup_cbm_to_size(r, d, ctrl);
+ }
+ seq_printf(s, "%d=%u", d->hdr.id, size);
+ sep = true;
+ }
+ seq_putc(s, '\n');
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+static void mondata_config_read(struct resctrl_mon_config_info *mon_info)
+{
+ smp_call_function_any(&mon_info->d->hdr.cpu_mask,
+ resctrl_arch_mon_event_config_read, mon_info, 1);
+}
+
+static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid)
+{
+ struct resctrl_mon_config_info mon_info;
+ struct rdt_mon_domain *dom;
+ bool sep = false;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ if (sep)
+ seq_puts(s, ";");
+
+ memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info));
+ mon_info.r = r;
+ mon_info.d = dom;
+ mon_info.evtid = evtid;
+ mondata_config_read(&mon_info);
+
+ seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return 0;
+}
+
+static int mbm_total_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+
+ mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ return 0;
+}
+
+static int mbm_local_bytes_config_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+
+ mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ return 0;
+}
+
+static void mbm_config_write_domain(struct rdt_resource *r,
+ struct rdt_mon_domain *d, u32 evtid, u32 val)
+{
+ struct resctrl_mon_config_info mon_info = {0};
+
+ /*
+ * Read the current config value first. If both are the same then
+ * no need to write it again.
+ */
+ mon_info.r = r;
+ mon_info.d = d;
+ mon_info.evtid = evtid;
+ mondata_config_read(&mon_info);
+ if (mon_info.mon_config == val)
+ return;
+
+ mon_info.mon_config = val;
+
+ /*
+ * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the
+ * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE
+ * are scoped at the domain level. Writing any of these MSRs
+ * on one CPU is observed by all the CPUs in the domain.
+ */
+ smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write,
+ &mon_info, 1);
+
+ /*
+ * When an Event Configuration is changed, the bandwidth counters
+ * for all RMIDs and Events will be cleared by the hardware. The
+ * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+ * every RMID on the next read to any event for every RMID.
+ * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+ * cleared while it is tracked by the hardware. Clear the
+ * mbm_local and mbm_total counts for all the RMIDs.
+ */
+ resctrl_arch_reset_rmid_all(r, d);
+}
+
+static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid)
+{
+ char *dom_str = NULL, *id_str;
+ unsigned long dom_id, val;
+ struct rdt_mon_domain *d;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+next:
+ if (!tok || tok[0] == '\0')
+ return 0;
+
+ /* Start processing the strings for each domain */
+ dom_str = strim(strsep(&tok, ";"));
+ id_str = strsep(&dom_str, "=");
+
+ if (!id_str || kstrtoul(id_str, 10, &dom_id)) {
+ rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n");
+ return -EINVAL;
+ }
+
+ if (!dom_str || kstrtoul(dom_str, 16, &val)) {
+ rdt_last_cmd_puts("Non-numeric event configuration value\n");
+ return -EINVAL;
+ }
+
+ /* Value from user cannot be more than the supported set of events */
+ if ((val & r->mbm_cfg_mask) != val) {
+ rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n",
+ r->mbm_cfg_mask);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ if (d->hdr.id == dom_id) {
+ mbm_config_write_domain(r, d, evtid, val);
+ goto next;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ int ret;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ buf[nbytes - 1] = '\0';
+
+ ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID);
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret ?: nbytes;
+}
+
+static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ struct rdt_resource *r = rdt_kn_parent_priv(of->kn);
+ int ret;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ buf[nbytes - 1] = '\0';
+
+ ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID);
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ return ret ?: nbytes;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_common_files[] = {
+ {
+ .name = "last_cmd_status",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_last_cmd_status_show,
+ .fflags = RFTYPE_TOP_INFO,
+ },
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ .fflags = RFTYPE_CTRL_INFO,
+ },
+ {
+ .name = "mon_features",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_mon_features_show,
+ .fflags = RFTYPE_MON_INFO,
+ },
+ {
+ .name = "num_rmids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_rmids_show,
+ .fflags = RFTYPE_MON_INFO,
+ },
+ {
+ .name = "cbm_mask",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_default_ctrl_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "min_cbm_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_cbm_bits_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "shareable_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_shareable_bits_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "bit_usage",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bit_usage_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "min_bandwidth",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_bw_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "bandwidth_gran",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_bw_gran_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ {
+ .name = "delay_linear",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_delay_linear_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB,
+ },
+ /*
+ * Platform specific which (if any) capabilities are provided by
+ * thread_throttle_mode. Defer "fflags" initialization to platform
+ * discovery.
+ */
+ {
+ .name = "thread_throttle_mode",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_thread_throttle_mode_show,
+ },
+ {
+ .name = "max_threshold_occupancy",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = max_threshold_occ_write,
+ .seq_show = max_threshold_occ_show,
+ .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "mbm_total_bytes_config",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_total_bytes_config_show,
+ .write = mbm_total_bytes_config_write,
+ },
+ {
+ .name = "mbm_local_bytes_config",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = mbm_local_bytes_config_show,
+ .write = mbm_local_bytes_config_write,
+ },
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "cpus_list",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ .flags = RFTYPE_FLAGS_CPUS_LIST,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ .fflags = RFTYPE_BASE,
+ },
+ {
+ .name = "mon_hw_id",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_rmid_show,
+ .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ .fflags = RFTYPE_CTRL_BASE,
+ },
+ {
+ .name = "mba_MBps_event",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mba_mbps_event_write,
+ .seq_show = rdtgroup_mba_mbps_event_show,
+ },
+ {
+ .name = "mode",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_mode_write,
+ .seq_show = rdtgroup_mode_show,
+ .fflags = RFTYPE_CTRL_BASE,
+ },
+ {
+ .name = "size",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_size_show,
+ .fflags = RFTYPE_CTRL_BASE,
+ },
+ {
+ .name = "sparse_masks",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_has_sparse_bitmasks_show,
+ .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE,
+ },
+ {
+ .name = "ctrl_hw_id",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdtgroup_closid_show,
+ .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG,
+ },
+};
+
+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
+{
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (resctrl_debug)
+ fflags |= RFTYPE_DEBUG;
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts) {
+ if ((fflags & rft->fflags) == rft->fflags)
+ kernfs_remove_by_name(kn, rft->name);
+ }
+ return ret;
+}
+
+static struct rftype *rdtgroup_get_rftype_by_name(const char *name)
+{
+ struct rftype *rfts, *rft;
+ int len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if (!strcmp(rft->name, name))
+ return rft;
+ }
+
+ return NULL;
+}
+
+static void thread_throttle_mode_init(void)
+{
+ enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ struct rdt_resource *r_mba, *r_smba;
+
+ r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ if (r_mba->alloc_capable &&
+ r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_mba->membw.throttle_mode;
+
+ r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA);
+ if (r_smba->alloc_capable &&
+ r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED)
+ throttle_mode = r_smba->membw.throttle_mode;
+
+ if (throttle_mode == THREAD_THROTTLE_UNDEFINED)
+ return;
+
+ resctrl_file_fflags_init("thread_throttle_mode",
+ RFTYPE_CTRL_INFO | RFTYPE_RES_MB);
+}
+
+void resctrl_file_fflags_init(const char *config, unsigned long fflags)
+{
+ struct rftype *rft;
+
+ rft = rdtgroup_get_rftype_by_name(config);
+ if (rft)
+ rft->fflags = fflags;
+}
+
+/**
+ * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ *
+ * The permissions of named resctrl file, directory, or link are modified
+ * to not allow read, write, or execute by any user.
+ *
+ * WARNING: This function is intended to communicate to the user that the
+ * resctrl file has been locked down - that it is not relevant to the
+ * particular state the system finds itself in. It should not be relied
+ * on to protect from user access because after the file's permissions
+ * are restricted the user can still change the permissions using chmod
+ * from the command line.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
+{
+ struct iattr iattr = {.ia_valid = ATTR_MODE,};
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ kn = kernfs_find_and_get_ns(r->kn, name, NULL);
+ if (!kn)
+ return -ENOENT;
+
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ iattr.ia_mode = S_IFDIR;
+ break;
+ case KERNFS_FILE:
+ iattr.ia_mode = S_IFREG;
+ break;
+ case KERNFS_LINK:
+ iattr.ia_mode = S_IFLNK;
+ break;
+ }
+
+ ret = kernfs_setattr(kn, &iattr);
+ kernfs_put(kn);
+ return ret;
+}
+
+/**
+ * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
+ * @r: The resource group with which the file is associated.
+ * @name: Name of the file
+ * @mask: Mask of permissions that should be restored
+ *
+ * Restore the permissions of the named file. If @name is a directory the
+ * permissions of its parent will be used.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
+ umode_t mask)
+{
+ struct iattr iattr = {.ia_valid = ATTR_MODE,};
+ struct kernfs_node *kn, *parent;
+ struct rftype *rfts, *rft;
+ int ret, len;
+
+ rfts = res_common_files;
+ len = ARRAY_SIZE(res_common_files);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ if (!strcmp(rft->name, name))
+ iattr.ia_mode = rft->mode & mask;
+ }
+
+ kn = kernfs_find_and_get_ns(r->kn, name, NULL);
+ if (!kn)
+ return -ENOENT;
+
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ parent = kernfs_get_parent(kn);
+ if (parent) {
+ iattr.ia_mode |= parent->mode;
+ kernfs_put(parent);
+ }
+ iattr.ia_mode |= S_IFDIR;
+ break;
+ case KERNFS_FILE:
+ iattr.ia_mode |= S_IFREG;
+ break;
+ case KERNFS_LINK:
+ iattr.ia_mode |= S_IFLNK;
+ break;
+ }
+
+ ret = kernfs_setattr(kn, &iattr);
+ kernfs_put(kn);
+ return ret;
+}
+
+static int rdtgroup_mkdir_info_resdir(void *priv, char *name,
+ unsigned long fflags)
+{
+ struct kernfs_node *kn_subdir;
+ int ret;
+
+ kn_subdir = kernfs_create_dir(kn_info, name,
+ kn_info->mode, priv);
+ if (IS_ERR(kn_subdir))
+ return PTR_ERR(kn_subdir);
+
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ return ret;
+
+ ret = rdtgroup_add_files(kn_subdir, fflags);
+ if (!ret)
+ kernfs_activate(kn_subdir);
+
+ return ret;
+}
+
+static unsigned long fflags_from_resource(struct rdt_resource *r)
+{
+ switch (r->rid) {
+ case RDT_RESOURCE_L3:
+ case RDT_RESOURCE_L2:
+ return RFTYPE_RES_CACHE;
+ case RDT_RESOURCE_MBA:
+ case RDT_RESOURCE_SMBA:
+ return RFTYPE_RES_MB;
+ }
+
+ return WARN_ON_ONCE(1);
+}
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+ struct resctrl_schema *s;
+ struct rdt_resource *r;
+ unsigned long fflags;
+ char name[32];
+ int ret;
+
+ /* create the directory */
+ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+ if (IS_ERR(kn_info))
+ return PTR_ERR(kn_info);
+
+ ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO);
+ if (ret)
+ goto out_destroy;
+
+ /* loop over enabled controls, these are all alloc_capable */
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
+ fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO;
+ ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags);
+ if (ret)
+ goto out_destroy;
+ }
+
+ for_each_mon_capable_rdt_resource(r) {
+ fflags = fflags_from_resource(r) | RFTYPE_MON_INFO;
+ sprintf(name, "%s_MON", r->name);
+ ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
+ if (ret)
+ goto out_destroy;
+ }
+
+ ret = rdtgroup_kn_set_ugid(kn_info);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn_info);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn_info);
+ return ret;
+}
+
+static int
+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
+ char *name, struct kernfs_node **dest_kn)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ /* create the directory */
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+static inline bool is_mba_linear(void)
+{
+ return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear;
+}
+
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ u32 num_closid = resctrl_arch_get_num_closid(r);
+ int cpu = cpumask_any(&d->hdr.cpu_mask);
+ int i;
+
+ d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!d->mbps_val)
+ return -ENOMEM;
+
+ for (i = 0; i < num_closid; i++)
+ d->mbps_val[i] = MBA_MAX_MBPS;
+
+ return 0;
+}
+
+static void mba_sc_domain_destroy(struct rdt_resource *r,
+ struct rdt_ctrl_domain *d)
+{
+ kfree(d->mbps_val);
+ d->mbps_val = NULL;
+}
+
+/*
+ * MBA software controller is supported only if
+ * MBM is supported and MBA is in linear scale,
+ * and the MBM monitor scope is the same as MBA
+ * control scope.
+ */
+static bool supports_mba_mbps(void)
+{
+ struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+
+ return (resctrl_is_mbm_enabled() &&
+ r->alloc_capable && is_mba_linear() &&
+ r->ctrl_scope == rmbm->mon_scope);
+}
+
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ */
+static int set_mba_sc(bool mba_sc)
+{
+ struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA);
+ u32 num_closid = resctrl_arch_get_num_closid(r);
+ struct rdt_ctrl_domain *d;
+ unsigned long fflags;
+ int i;
+
+ if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
+ return -EINVAL;
+
+ r->membw.mba_sc = mba_sc;
+
+ rdtgroup_default.mba_mbps_event = mba_mbps_default_event;
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ for (i = 0; i < num_closid; i++)
+ d->mbps_val[i] = MBA_MAX_MBPS;
+ }
+
+ fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0;
+ resctrl_file_fflags_init("mba_MBps_event", fflags);
+
+ return 0;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ /*
+ * All the resource directories use "kn->priv"
+ * to point to the "struct rdtgroup" for the
+ * resource. "info" and its subdirectories don't
+ * have rdtgroup structures, so return NULL here.
+ */
+ if (kn == kn_info ||
+ rcu_access_pointer(kn->__parent) == kn_info)
+ return NULL;
+ else
+ return kn->priv;
+ } else {
+ return rdt_kn_parent_priv(kn);
+ }
+}
+
+static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
+{
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+}
+
+static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn)
+{
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+ kernfs_unbreak_active_protection(kn);
+ rdtgroup_remove(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return NULL;
+
+ rdtgroup_kn_get(rdtgrp, kn);
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Was this group deleted while we waited? */
+ if (rdtgrp->flags & RDT_DELETED)
+ return NULL;
+
+ return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return;
+
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+
+ rdtgroup_kn_put(rdtgrp, kn);
+}
+
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **mon_data_kn);
+
+static void rdt_disable_ctx(void)
+{
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+ set_mba_sc(false);
+
+ resctrl_debug = false;
+}
+
+static int rdt_enable_ctx(struct rdt_fs_context *ctx)
+{
+ int ret = 0;
+
+ if (ctx->enable_cdpl2) {
+ ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true);
+ if (ret)
+ goto out_done;
+ }
+
+ if (ctx->enable_cdpl3) {
+ ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true);
+ if (ret)
+ goto out_cdpl2;
+ }
+
+ if (ctx->enable_mba_mbps) {
+ ret = set_mba_sc(true);
+ if (ret)
+ goto out_cdpl3;
+ }
+
+ if (ctx->enable_debug)
+ resctrl_debug = true;
+
+ return 0;
+
+out_cdpl3:
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false);
+out_cdpl2:
+ resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false);
+out_done:
+ return ret;
+}
+
+static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type)
+{
+ struct resctrl_schema *s;
+ const char *suffix = "";
+ int ret, cl;
+
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ s->res = r;
+ s->num_closid = resctrl_arch_get_num_closid(r);
+ if (resctrl_arch_get_cdp_enabled(r->rid))
+ s->num_closid /= 2;
+
+ s->conf_type = type;
+ switch (type) {
+ case CDP_CODE:
+ suffix = "CODE";
+ break;
+ case CDP_DATA:
+ suffix = "DATA";
+ break;
+ case CDP_NONE:
+ suffix = "";
+ break;
+ }
+
+ ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix);
+ if (ret >= sizeof(s->name)) {
+ kfree(s);
+ return -EINVAL;
+ }
+
+ cl = strlen(s->name);
+
+ /*
+ * If CDP is supported by this resource, but not enabled,
+ * include the suffix. This ensures the tabular format of the
+ * schemata file does not change between mounts of the filesystem.
+ */
+ if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid))
+ cl += 4;
+
+ if (cl > max_name_width)
+ max_name_width = cl;
+
+ switch (r->schema_fmt) {
+ case RESCTRL_SCHEMA_BITMAP:
+ s->fmt_str = "%d=%x";
+ break;
+ case RESCTRL_SCHEMA_RANGE:
+ s->fmt_str = "%d=%u";
+ break;
+ }
+
+ if (WARN_ON_ONCE(!s->fmt_str)) {
+ kfree(s);
+ return -EINVAL;
+ }
+
+ INIT_LIST_HEAD(&s->list);
+ list_add(&s->list, &resctrl_schema_all);
+
+ return 0;
+}
+
+static int schemata_list_create(void)
+{
+ struct rdt_resource *r;
+ int ret = 0;
+
+ for_each_alloc_capable_rdt_resource(r) {
+ if (resctrl_arch_get_cdp_enabled(r->rid)) {
+ ret = schemata_list_add(r, CDP_CODE);
+ if (ret)
+ break;
+
+ ret = schemata_list_add(r, CDP_DATA);
+ } else {
+ ret = schemata_list_add(r, CDP_NONE);
+ }
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static void schemata_list_destroy(void)
+{
+ struct resctrl_schema *s, *tmp;
+
+ list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
+static int rdt_get_tree(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+ unsigned long flags = RFTYPE_CTRL_BASE;
+ struct rdt_mon_domain *dom;
+ struct rdt_resource *r;
+ int ret;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+ /*
+ * resctrl file system can only be mounted once.
+ */
+ if (resctrl_mounted) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = rdtgroup_setup_root(ctx);
+ if (ret)
+ goto out;
+
+ ret = rdt_enable_ctx(ctx);
+ if (ret)
+ goto out_root;
+
+ ret = schemata_list_create();
+ if (ret) {
+ schemata_list_destroy();
+ goto out_ctx;
+ }
+
+ ret = closid_init();
+ if (ret)
+ goto out_schemata_free;
+
+ if (resctrl_arch_mon_capable())
+ flags |= RFTYPE_MON;
+
+ ret = rdtgroup_add_files(rdtgroup_default.kn, flags);
+ if (ret)
+ goto out_closid_exit;
+
+ kernfs_activate(rdtgroup_default.kn);
+
+ ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+ if (ret < 0)
+ goto out_closid_exit;
+
+ if (resctrl_arch_mon_capable()) {
+ ret = mongroup_create_dir(rdtgroup_default.kn,
+ &rdtgroup_default, "mon_groups",
+ &kn_mongrp);
+ if (ret < 0)
+ goto out_info;
+
+ ret = mkdir_mondata_all(rdtgroup_default.kn,
+ &rdtgroup_default, &kn_mondata);
+ if (ret < 0)
+ goto out_mongrp;
+ rdtgroup_default.mon.mon_data_kn = kn_mondata;
+ }
+
+ ret = rdt_pseudo_lock_init();
+ if (ret)
+ goto out_mondata;
+
+ ret = kernfs_get_tree(fc);
+ if (ret < 0)
+ goto out_psl;
+
+ if (resctrl_arch_alloc_capable())
+ resctrl_arch_enable_alloc();
+ if (resctrl_arch_mon_capable())
+ resctrl_arch_enable_mon();
+
+ if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable())
+ resctrl_mounted = true;
+
+ if (resctrl_is_mbm_enabled()) {
+ r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ list_for_each_entry(dom, &r->mon_domains, hdr.list)
+ mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
+ }
+
+ goto out;
+
+out_psl:
+ rdt_pseudo_lock_release();
+out_mondata:
+ if (resctrl_arch_mon_capable())
+ kernfs_remove(kn_mondata);
+out_mongrp:
+ if (resctrl_arch_mon_capable())
+ kernfs_remove(kn_mongrp);
+out_info:
+ kernfs_remove(kn_info);
+out_closid_exit:
+ closid_exit();
+out_schemata_free:
+ schemata_list_destroy();
+out_ctx:
+ rdt_disable_ctx();
+out_root:
+ rdtgroup_destroy_root();
+out:
+ rdt_last_cmd_clear();
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+ return ret;
+}
+
+enum rdt_param {
+ Opt_cdp,
+ Opt_cdpl2,
+ Opt_mba_mbps,
+ Opt_debug,
+ nr__rdt_params
+};
+
+static const struct fs_parameter_spec rdt_fs_parameters[] = {
+ fsparam_flag("cdp", Opt_cdp),
+ fsparam_flag("cdpl2", Opt_cdpl2),
+ fsparam_flag("mba_MBps", Opt_mba_mbps),
+ fsparam_flag("debug", Opt_debug),
+ {}
+};
+
+static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+ struct fs_parse_result result;
+ const char *msg;
+ int opt;
+
+ opt = fs_parse(fc, rdt_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_cdp:
+ ctx->enable_cdpl3 = true;
+ return 0;
+ case Opt_cdpl2:
+ ctx->enable_cdpl2 = true;
+ return 0;
+ case Opt_mba_mbps:
+ msg = "mba_MBps requires MBM and linear scale MBA at L3 scope";
+ if (!supports_mba_mbps())
+ return invalfc(fc, msg);
+ ctx->enable_mba_mbps = true;
+ return 0;
+ case Opt_debug:
+ ctx->enable_debug = true;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void rdt_fs_context_free(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx = rdt_fc2context(fc);
+
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations rdt_fs_context_ops = {
+ .free = rdt_fs_context_free,
+ .parse_param = rdt_parse_param,
+ .get_tree = rdt_get_tree,
+};
+
+static int rdt_init_fs_context(struct fs_context *fc)
+{
+ struct rdt_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
+ fc->fs_private = &ctx->kfc;
+ fc->ops = &rdt_fs_context_ops;
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(&init_user_ns);
+ fc->global = true;
+ return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+ struct cpumask *mask)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ if (!from || is_closid_match(t, from) ||
+ is_rmid_match(t, from)) {
+ resctrl_arch_set_closid_rmid(t, to->closid,
+ to->mon.rmid);
+
+ /*
+ * Order the closid/rmid stores above before the loads
+ * in task_curr(). This pairs with the full barrier
+ * between the rq->curr update and
+ * resctrl_arch_sched_in() during context switch.
+ */
+ smp_mb();
+
+ /*
+ * If the task is on a CPU, set the CPU in the mask.
+ * The detection is inaccurate as tasks might move or
+ * schedule before the smp function call takes place.
+ * In such a case the function call is pointless, but
+ * there is no other side effect.
+ */
+ if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
+ cpumask_set_cpu(task_cpu(t), mask);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
+{
+ struct rdtgroup *sentry, *stmp;
+ struct list_head *head;
+
+ head = &rdtgrp->mon.crdtgrp_list;
+ list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
+ free_rmid(sentry->closid, sentry->mon.rmid);
+ list_del(&sentry->mon.crdtgrp_list);
+
+ if (atomic_read(&sentry->waitcount) != 0)
+ sentry->flags = RDT_DELETED;
+ else
+ rdtgroup_remove(sentry);
+ }
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+ struct rdtgroup *rdtgrp, *tmp;
+
+ /* Move all tasks to the default resource group */
+ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+ list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Free any child rmids */
+ free_all_child_rdtgrp(rdtgrp);
+
+ /* Remove each rdtgroup other than root */
+ if (rdtgrp == &rdtgroup_default)
+ continue;
+
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
+ rdtgroup_pseudo_lock_remove(rdtgrp);
+
+ /*
+ * Give any CPUs back to the default group. We cannot copy
+ * cpu_online_mask because a CPU might have executed the
+ * offline callback already, but is still marked online.
+ */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+ kernfs_remove(rdtgrp->kn);
+ list_del(&rdtgrp->rdtgroup_list);
+
+ if (atomic_read(&rdtgrp->waitcount) != 0)
+ rdtgrp->flags = RDT_DELETED;
+ else
+ rdtgroup_remove(rdtgrp);
+ }
+ /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+ update_closid_rmid(cpu_online_mask, &rdtgroup_default);
+
+ kernfs_remove(kn_info);
+ kernfs_remove(kn_mongrp);
+ kernfs_remove(kn_mondata);
+}
+
+/**
+ * mon_get_kn_priv() - Get the mon_data priv data for this event.
+ *
+ * The same values are used across the mon_data directories of all control and
+ * monitor groups for the same event in the same domain. Keep a list of
+ * allocated structures and re-use an existing one with the same values for
+ * @rid, @domid, etc.
+ *
+ * @rid: The resource id for the event file being created.
+ * @domid: The domain id for the event file being created.
+ * @mevt: The type of event file being created.
+ * @do_sum: Whether SNC summing monitors are being created.
+ */
+static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid,
+ struct mon_evt *mevt,
+ bool do_sum)
+{
+ struct mon_data *priv;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ list_for_each_entry(priv, &mon_data_kn_priv_list, list) {
+ if (priv->rid == rid && priv->domid == domid &&
+ priv->sum == do_sum && priv->evtid == mevt->evtid)
+ return priv;
+ }
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return NULL;
+
+ priv->rid = rid;
+ priv->domid = domid;
+ priv->sum = do_sum;
+ priv->evtid = mevt->evtid;
+ list_add_tail(&priv->list, &mon_data_kn_priv_list);
+
+ return priv;
+}
+
+/**
+ * mon_put_kn_priv() - Free all allocated mon_data structures.
+ *
+ * Called when resctrl file system is unmounted.
+ */
+static void mon_put_kn_priv(void)
+{
+ struct mon_data *priv, *tmp;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) {
+ list_del(&priv->list);
+ kfree(priv);
+ }
+}
+
+static void resctrl_fs_teardown(void)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ /* Cleared by rdtgroup_destroy_root() */
+ if (!rdtgroup_default.kn)
+ return;
+
+ rmdir_all_sub();
+ mon_put_kn_priv();
+ rdt_pseudo_lock_release();
+ rdtgroup_default.mode = RDT_MODE_SHAREABLE;
+ closid_exit();
+ schemata_list_destroy();
+ rdtgroup_destroy_root();
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+ struct rdt_resource *r;
+
+ cpus_read_lock();
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_disable_ctx();
+
+ /* Put everything back to default values. */
+ for_each_alloc_capable_rdt_resource(r)
+ resctrl_arch_reset_all_ctrls(r);
+
+ resctrl_fs_teardown();
+ if (resctrl_arch_alloc_capable())
+ resctrl_arch_disable_alloc();
+ if (resctrl_arch_mon_capable())
+ resctrl_arch_disable_mon();
+ resctrl_mounted = false;
+ kernfs_kill_sb(sb);
+ mutex_unlock(&rdtgroup_mutex);
+ cpus_read_unlock();
+}
+
+static struct file_system_type rdt_fs_type = {
+ .name = "resctrl",
+ .init_fs_context = rdt_init_fs_context,
+ .parameters = rdt_fs_parameters,
+ .kill_sb = rdt_kill_sb,
+};
+
+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
+ void *priv)
+{
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ kn = __kernfs_create_file(parent_kn, name, 0444,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0,
+ &kf_mondata_ops, priv, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return ret;
+}
+
+static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname)
+{
+ struct kernfs_node *kn;
+
+ kn = kernfs_find_and_get(pkn, name);
+ if (!kn)
+ return;
+ kernfs_put(kn);
+
+ if (kn->dir.subdirs <= 1)
+ kernfs_remove(kn);
+ else
+ kernfs_remove_by_name(kn, subname);
+}
+
+/*
+ * Remove all subdirectories of mon_data of ctrl_mon groups
+ * and monitor groups for the given domain.
+ * Remove files and directories containing "sum" of domain data
+ * when last domain being summed is removed.
+ */
+static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_mon_domain *d)
+{
+ struct rdtgroup *prgrp, *crgrp;
+ char subname[32];
+ bool snc_mode;
+ char name[32];
+
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
+ if (snc_mode)
+ sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id);
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname);
+
+ list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
+ mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname);
+ }
+}
+
+static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp,
+ bool do_sum)
+{
+ struct rmid_read rr = {0};
+ struct mon_data *priv;
+ struct mon_evt *mevt;
+ int ret, domid;
+
+ if (WARN_ON(list_empty(&r->evt_list)))
+ return -EPERM;
+
+ list_for_each_entry(mevt, &r->evt_list, list) {
+ domid = do_sum ? d->ci_id : d->hdr.id;
+ priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum);
+ if (WARN_ON_ONCE(!priv))
+ return -EINVAL;
+
+ ret = mon_addfile(kn, mevt->name, priv);
+ if (ret)
+ return ret;
+
+ if (!do_sum && resctrl_is_mbm_event(mevt->evtid))
+ mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true);
+ }
+
+ return 0;
+}
+
+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
+ struct rdt_mon_domain *d,
+ struct rdt_resource *r, struct rdtgroup *prgrp)
+{
+ struct kernfs_node *kn, *ckn;
+ char name[32];
+ bool snc_mode;
+ int ret = 0;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ snc_mode = r->mon_scope == RESCTRL_L3_NODE;
+ sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci_id : d->hdr.id);
+ kn = kernfs_find_and_get(parent_kn, name);
+ if (kn) {
+ /*
+ * rdtgroup_mutex will prevent this directory from being
+ * removed. No need to keep this hold.
+ */
+ kernfs_put(kn);
+ } else {
+ kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+ ret = mon_add_all_files(kn, d, r, prgrp, snc_mode);
+ if (ret)
+ goto out_destroy;
+ }
+
+ if (snc_mode) {
+ sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id);
+ ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp);
+ if (IS_ERR(ckn)) {
+ ret = -EINVAL;
+ goto out_destroy;
+ }
+
+ ret = rdtgroup_kn_set_ugid(ckn);
+ if (ret)
+ goto out_destroy;
+
+ ret = mon_add_all_files(ckn, d, r, prgrp, false);
+ if (ret)
+ goto out_destroy;
+ }
+
+ kernfs_activate(kn);
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+/*
+ * Add all subdirectories of mon_data for "ctrl_mon" groups
+ * and "monitor" groups with given domain id.
+ */
+static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_mon_domain *d)
+{
+ struct kernfs_node *parent_kn;
+ struct rdtgroup *prgrp, *crgrp;
+ struct list_head *head;
+
+ list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
+ parent_kn = prgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, prgrp);
+
+ head = &prgrp->mon.crdtgrp_list;
+ list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
+ parent_kn = crgrp->mon.mon_data_kn;
+ mkdir_mondata_subdir(parent_kn, d, r, crgrp);
+ }
+ }
+}
+
+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
+ struct rdt_resource *r,
+ struct rdtgroup *prgrp)
+{
+ struct rdt_mon_domain *dom;
+ int ret;
+
+ /* Walking r->domains, ensure it can't race with cpuhp */
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(dom, &r->mon_domains, hdr.list) {
+ ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/*
+ * This creates a directory mon_data which contains the monitored data.
+ *
+ * mon_data has one directory for each domain which are named
+ * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
+ * with L3 domain looks as below:
+ * ./mon_data:
+ * mon_L3_00
+ * mon_L3_01
+ * mon_L3_02
+ * ...
+ *
+ * Each domain directory has one file per event:
+ * ./mon_L3_00/:
+ * llc_occupancy
+ *
+ */
+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
+ struct rdtgroup *prgrp,
+ struct kernfs_node **dest_kn)
+{
+ struct rdt_resource *r;
+ struct kernfs_node *kn;
+ int ret;
+
+ /*
+ * Create the mon_data directory first.
+ */
+ ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
+ if (ret)
+ return ret;
+
+ if (dest_kn)
+ *dest_kn = kn;
+
+ /*
+ * Create the subdirectories for each domain. Note that all events
+ * in a domain like L3 are grouped into a resource whose domain is L3
+ */
+ for_each_mon_capable_rdt_resource(r) {
+ ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
+ if (ret)
+ goto out_destroy;
+ }
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn);
+ return ret;
+}
+
+/**
+ * cbm_ensure_valid - Enforce validity on provided CBM
+ * @_val: Candidate CBM
+ * @r: RDT resource to which the CBM belongs
+ *
+ * The provided CBM represents all cache portions available for use. This
+ * may be represented by a bitmap that does not consist of contiguous ones
+ * and thus be an invalid CBM.
+ * Here the provided CBM is forced to be a valid CBM by only considering
+ * the first set of contiguous bits as valid and clearing all bits.
+ * The intention here is to provide a valid default CBM with which a new
+ * resource group is initialized. The user can follow this with a
+ * modification to the CBM if the default does not satisfy the
+ * requirements.
+ */
+static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r)
+{
+ unsigned int cbm_len = r->cache.cbm_len;
+ unsigned long first_bit, zero_bit;
+ unsigned long val = _val;
+
+ if (!val)
+ return 0;
+
+ first_bit = find_first_bit(&val, cbm_len);
+ zero_bit = find_next_zero_bit(&val, cbm_len, first_bit);
+
+ /* Clear any remaining bits to ensure contiguous region */
+ bitmap_clear(&val, zero_bit, cbm_len - zero_bit);
+ return (u32)val;
+}
+
+/*
+ * Initialize cache resources per RDT domain
+ *
+ * Set the RDT domain up to start off with all usable allocations. That is,
+ * all shareable and unused bits. All-zero CBM is invalid.
+ */
+static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s,
+ u32 closid)
+{
+ enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type);
+ enum resctrl_conf_type t = s->conf_type;
+ struct resctrl_staged_config *cfg;
+ struct rdt_resource *r = s->res;
+ u32 used_b = 0, unused_b = 0;
+ unsigned long tmp_cbm;
+ enum rdtgrp_mode mode;
+ u32 peer_ctl, ctrl_val;
+ int i;
+
+ cfg = &d->staged_config[t];
+ cfg->have_new_ctrl = false;
+ cfg->new_ctrl = r->cache.shareable_bits;
+ used_b = r->cache.shareable_bits;
+ for (i = 0; i < closids_supported(); i++) {
+ if (closid_allocated(i) && i != closid) {
+ mode = rdtgroup_mode_by_closid(i);
+ if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
+ /*
+ * ctrl values for locksetup aren't relevant
+ * until the schemata is written, and the mode
+ * becomes RDT_MODE_PSEUDO_LOCKED.
+ */
+ continue;
+ /*
+ * If CDP is active include peer domain's
+ * usage to ensure there is no overlap
+ * with an exclusive group.
+ */
+ if (resctrl_arch_get_cdp_enabled(r->rid))
+ peer_ctl = resctrl_arch_get_config(r, d, i,
+ peer_type);
+ else
+ peer_ctl = 0;
+ ctrl_val = resctrl_arch_get_config(r, d, i,
+ s->conf_type);
+ used_b |= ctrl_val | peer_ctl;
+ if (mode == RDT_MODE_SHAREABLE)
+ cfg->new_ctrl |= ctrl_val | peer_ctl;
+ }
+ }
+ if (d->plr && d->plr->cbm > 0)
+ used_b |= d->plr->cbm;
+ unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
+ unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
+ cfg->new_ctrl |= unused_b;
+ /*
+ * Force the initial CBM to be valid, user can
+ * modify the CBM based on system availability.
+ */
+ cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r);
+ /*
+ * Assign the u32 CBM to an unsigned long to ensure that
+ * bitmap_weight() does not access out-of-bound memory.
+ */
+ tmp_cbm = cfg->new_ctrl;
+ if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) {
+ rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id);
+ return -ENOSPC;
+ }
+ cfg->have_new_ctrl = true;
+
+ return 0;
+}
+
+/*
+ * Initialize cache resources with default values.
+ *
+ * A new RDT group is being created on an allocation capable (CAT)
+ * supporting system. Set this group up to start off with all usable
+ * allocations.
+ *
+ * If there are no more shareable bits available on any domain then
+ * the entire allocation will fail.
+ */
+static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
+{
+ struct rdt_ctrl_domain *d;
+ int ret;
+
+ list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) {
+ ret = __init_one_rdt_domain(d, s, closid);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Initialize MBA resource with default values. */
+static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
+{
+ struct resctrl_staged_config *cfg;
+ struct rdt_ctrl_domain *d;
+
+ list_for_each_entry(d, &r->ctrl_domains, hdr.list) {
+ if (is_mba_sc(r)) {
+ d->mbps_val[closid] = MBA_MAX_MBPS;
+ continue;
+ }
+
+ cfg = &d->staged_config[CDP_NONE];
+ cfg->new_ctrl = resctrl_get_default_ctrl(r);
+ cfg->have_new_ctrl = true;
+ }
+}
+
+/* Initialize the RDT group's allocations. */
+static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
+{
+ struct resctrl_schema *s;
+ struct rdt_resource *r;
+ int ret = 0;
+
+ rdt_staged_configs_clear();
+
+ list_for_each_entry(s, &resctrl_schema_all, list) {
+ r = s->res;
+ if (r->rid == RDT_RESOURCE_MBA ||
+ r->rid == RDT_RESOURCE_SMBA) {
+ rdtgroup_init_mba(r, rdtgrp->closid);
+ if (is_mba_sc(r))
+ continue;
+ } else {
+ ret = rdtgroup_init_cat(s, rdtgrp->closid);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = resctrl_arch_update_domains(r, rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("Failed to initialize allocations\n");
+ goto out;
+ }
+ }
+
+ rdtgrp->mode = RDT_MODE_SHAREABLE;
+
+out:
+ rdt_staged_configs_clear();
+ return ret;
+}
+
+static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp)
+{
+ int ret;
+
+ if (!resctrl_arch_mon_capable())
+ return 0;
+
+ ret = alloc_rmid(rdtgrp->closid);
+ if (ret < 0) {
+ rdt_last_cmd_puts("Out of RMIDs\n");
+ return ret;
+ }
+ rdtgrp->mon.rmid = ret;
+
+ ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp)
+{
+ if (resctrl_arch_mon_capable())
+ free_rmid(rgrp->closid, rgrp->mon.rmid);
+}
+
+/*
+ * We allow creating mon groups only with in a directory called "mon_groups"
+ * which is present in every ctrl_mon group. Check if this is a valid
+ * "mon_groups" directory.
+ *
+ * 1. The directory should be named "mon_groups".
+ * 2. The mon group itself should "not" be named "mon_groups".
+ * This makes sure "mon_groups" directory always has a ctrl_mon group
+ * as parent.
+ */
+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
+{
+ return (!strcmp(rdt_kn_name(kn), "mon_groups") &&
+ strcmp(name, "mon_groups"));
+}
+
+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
+ const char *name, umode_t mode,
+ enum rdt_group_type rtype, struct rdtgroup **r)
+{
+ struct rdtgroup *prdtgrp, *rdtgrp;
+ unsigned long files = 0;
+ struct kernfs_node *kn;
+ int ret;
+
+ prdtgrp = rdtgroup_kn_lock_live(parent_kn);
+ if (!prdtgrp) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ rdt_last_cmd_clear();
+
+ /*
+ * Check that the parent directory for a monitor group is a "mon_groups"
+ * directory.
+ */
+ if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (rtype == RDTMON_GROUP &&
+ (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
+ ret = -EINVAL;
+ rdt_last_cmd_puts("Pseudo-locking in progress\n");
+ goto out_unlock;
+ }
+
+ /* allocate the rdtgroup. */
+ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+ if (!rdtgrp) {
+ ret = -ENOSPC;
+ rdt_last_cmd_puts("Kernel out of memory\n");
+ goto out_unlock;
+ }
+ *r = rdtgrp;
+ rdtgrp->mon.parent = prdtgrp;
+ rdtgrp->type = rtype;
+ INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
+
+ /* kernfs creates the directory for rdtgrp */
+ kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ rdt_last_cmd_puts("kernfs create error\n");
+ goto out_free_rgrp;
+ }
+ rdtgrp->kn = kn;
+
+ /*
+ * kernfs_remove() will drop the reference count on "kn" which
+ * will free it. But we still need it to stick around for the
+ * rdtgroup_kn_unlock(kn) call. Take one extra reference here,
+ * which will be dropped by kernfs_put() in rdtgroup_remove().
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs perm error\n");
+ goto out_destroy;
+ }
+
+ if (rtype == RDTCTRL_GROUP) {
+ files = RFTYPE_BASE | RFTYPE_CTRL;
+ if (resctrl_arch_mon_capable())
+ files |= RFTYPE_MON;
+ } else {
+ files = RFTYPE_BASE | RFTYPE_MON;
+ }
+
+ ret = rdtgroup_add_files(kn, files);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs fill error\n");
+ goto out_destroy;
+ }
+
+ /*
+ * The caller unlocks the parent_kn upon success.
+ */
+ return 0;
+
+out_destroy:
+ kernfs_put(rdtgrp->kn);
+ kernfs_remove(rdtgrp->kn);
+out_free_rgrp:
+ kfree(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
+{
+ kernfs_remove(rgrp->kn);
+ rdtgroup_remove(rgrp);
+}
+
+/*
+ * Create a monitor group under "mon_groups" directory of a control
+ * and monitor group(ctrl_mon). This is a resource group
+ * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
+ */
+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
+ const char *name, umode_t mode)
+{
+ struct rdtgroup *rdtgrp, *prgrp;
+ int ret;
+
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
+ if (ret)
+ return ret;
+
+ prgrp = rdtgrp->mon.parent;
+ rdtgrp->closid = prgrp->closid;
+
+ ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+ if (ret) {
+ mkdir_rdt_prepare_clean(rdtgrp);
+ goto out_unlock;
+ }
+
+ kernfs_activate(rdtgrp->kn);
+
+ /*
+ * Add the rdtgrp to the list of rdtgrps the parent
+ * ctrl_mon group has to track.
+ */
+ list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
+
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+/*
+ * These are rdtgroups created under the root directory. Can be used
+ * to allocate and monitor resources.
+ */
+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
+ const char *name, umode_t mode)
+{
+ struct rdtgroup *rdtgrp;
+ struct kernfs_node *kn;
+ u32 closid;
+ int ret;
+
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
+ if (ret)
+ return ret;
+
+ kn = rdtgrp->kn;
+ ret = closid_alloc();
+ if (ret < 0) {
+ rdt_last_cmd_puts("Out of CLOSIDs\n");
+ goto out_common_fail;
+ }
+ closid = ret;
+ ret = 0;
+
+ rdtgrp->closid = closid;
+
+ ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp);
+ if (ret)
+ goto out_closid_free;
+
+ kernfs_activate(rdtgrp->kn);
+
+ ret = rdtgroup_init_alloc(rdtgrp);
+ if (ret < 0)
+ goto out_rmid_free;
+
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ if (resctrl_arch_mon_capable()) {
+ /*
+ * Create an empty mon_groups directory to hold the subset
+ * of tasks and cpus to monitor.
+ */
+ ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
+ if (ret) {
+ rdt_last_cmd_puts("kernfs subdir error\n");
+ goto out_del_list;
+ }
+ if (is_mba_sc(NULL))
+ rdtgrp->mba_mbps_event = mba_mbps_default_event;
+ }
+
+ goto out_unlock;
+
+out_del_list:
+ list_del(&rdtgrp->rdtgroup_list);
+out_rmid_free:
+ mkdir_rdt_prepare_rmid_free(rdtgrp);
+out_closid_free:
+ closid_free(closid);
+out_common_fail:
+ mkdir_rdt_prepare_clean(rdtgrp);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ /*
+ * If the parent directory is the root directory and RDT
+ * allocation is supported, add a control and monitoring
+ * subdirectory
+ */
+ if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn)
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
+
+ /* Else, attempt to add a monitoring subdirectory. */
+ if (resctrl_arch_mon_capable())
+ return rdtgroup_mkdir_mon(parent_kn, name, mode);
+
+ return -EPERM;
+}
+
+static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+ u32 closid, rmid;
+ int cpu;
+
+ /* Give any tasks back to the parent group */
+ rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
+
+ /*
+ * Update per cpu closid/rmid of the moved CPUs first.
+ * Note: the closid will not change, but the arch code still needs it.
+ */
+ closid = prdtgrp->closid;
+ rmid = prdtgrp->mon.rmid;
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
+
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ update_closid_rmid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+ /*
+ * Remove the rdtgrp from the parent ctrl_mon group's list
+ */
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_del(&rdtgrp->mon.crdtgrp_list);
+
+ kernfs_remove(rdtgrp->kn);
+
+ return 0;
+}
+
+static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp)
+{
+ rdtgrp->flags = RDT_DELETED;
+ list_del(&rdtgrp->rdtgroup_list);
+
+ kernfs_remove(rdtgrp->kn);
+ return 0;
+}
+
+static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
+{
+ u32 closid, rmid;
+ int cpu;
+
+ /* Give any tasks back to the default group */
+ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+ /* Give any CPUs back to the default group */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ /* Update per cpu closid and rmid of the moved CPUs first */
+ closid = rdtgroup_default.closid;
+ rmid = rdtgroup_default.mon.rmid;
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid);
+
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ update_closid_rmid(tmpmask, NULL);
+
+ free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+ closid_free(rdtgrp->closid);
+
+ rdtgroup_ctrl_remove(rdtgrp);
+
+ /*
+ * Free all the child monitor group rmids.
+ */
+ free_all_child_rdtgrp(rdtgrp);
+
+ return 0;
+}
+
+static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn)
+{
+ /*
+ * Valid within the RCU section it was obtained or while rdtgroup_mutex
+ * is held.
+ */
+ return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex));
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent_kn;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret = 0;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+ parent_kn = rdt_kn_parent(kn);
+
+ /*
+ * If the rdtgroup is a ctrl_mon group and parent directory
+ * is the root directory, remove the ctrl_mon group.
+ *
+ * If the rdtgroup is a mon group and parent directory
+ * is a valid "mon_groups" directory, remove the mon group.
+ */
+ if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
+ rdtgrp != &rdtgroup_default) {
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
+ rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+ ret = rdtgroup_ctrl_remove(rdtgrp);
+ } else {
+ ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask);
+ }
+ } else if (rdtgrp->type == RDTMON_GROUP &&
+ is_mon_groups(parent_kn, rdt_kn_name(kn))) {
+ ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask);
+ } else {
+ ret = -EPERM;
+ }
+
+out:
+ rdtgroup_kn_unlock(kn);
+ free_cpumask_var(tmpmask);
+ return ret;
+}
+
+/**
+ * mongrp_reparent() - replace parent CTRL_MON group of a MON group
+ * @rdtgrp: the MON group whose parent should be replaced
+ * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp
+ * @cpus: cpumask provided by the caller for use during this call
+ *
+ * Replaces the parent CTRL_MON group for a MON group, resulting in all member
+ * tasks' CLOSID immediately changing to that of the new parent group.
+ * Monitoring data for the group is unaffected by this operation.
+ */
+static void mongrp_reparent(struct rdtgroup *rdtgrp,
+ struct rdtgroup *new_prdtgrp,
+ cpumask_var_t cpus)
+{
+ struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
+
+ WARN_ON(rdtgrp->type != RDTMON_GROUP);
+ WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP);
+
+ /* Nothing to do when simply renaming a MON group. */
+ if (prdtgrp == new_prdtgrp)
+ return;
+
+ WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
+ list_move_tail(&rdtgrp->mon.crdtgrp_list,
+ &new_prdtgrp->mon.crdtgrp_list);
+
+ rdtgrp->mon.parent = new_prdtgrp;
+ rdtgrp->closid = new_prdtgrp->closid;
+
+ /* Propagate updated closid to all tasks in this group. */
+ rdt_move_group_tasks(rdtgrp, rdtgrp, cpus);
+
+ update_closid_rmid(cpus, NULL);
+}
+
+static int rdtgroup_rename(struct kernfs_node *kn,
+ struct kernfs_node *new_parent, const char *new_name)
+{
+ struct kernfs_node *kn_parent;
+ struct rdtgroup *new_prdtgrp;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+ int ret;
+
+ rdtgrp = kernfs_to_rdtgroup(kn);
+ new_prdtgrp = kernfs_to_rdtgroup(new_parent);
+ if (!rdtgrp || !new_prdtgrp)
+ return -ENOENT;
+
+ /* Release both kernfs active_refs before obtaining rdtgroup mutex. */
+ rdtgroup_kn_get(rdtgrp, kn);
+ rdtgroup_kn_get(new_prdtgrp, new_parent);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdt_last_cmd_clear();
+
+ /*
+ * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if
+ * either kernfs_node is a file.
+ */
+ if (kernfs_type(kn) != KERNFS_DIR ||
+ kernfs_type(new_parent) != KERNFS_DIR) {
+ rdt_last_cmd_puts("Source and destination must be directories");
+ ret = -EPERM;
+ goto out;
+ }
+
+ if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ kn_parent = rdt_kn_parent(kn);
+ if (rdtgrp->type != RDTMON_GROUP || !kn_parent ||
+ !is_mon_groups(kn_parent, rdt_kn_name(kn))) {
+ rdt_last_cmd_puts("Source must be a MON group\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (!is_mon_groups(new_parent, new_name)) {
+ rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * If the MON group is monitoring CPUs, the CPUs must be assigned to the
+ * current parent CTRL_MON group and therefore cannot be assigned to
+ * the new parent, making the move illegal.
+ */
+ if (!cpumask_empty(&rdtgrp->cpu_mask) &&
+ rdtgrp->mon.parent != new_prdtgrp) {
+ rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n");
+ ret = -EPERM;
+ goto out;
+ }
+
+ /*
+ * Allocate the cpumask for use in mongrp_reparent() to avoid the
+ * possibility of failing to allocate it after kernfs_rename() has
+ * succeeded.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Perform all input validation and allocations needed to ensure
+ * mongrp_reparent() will succeed before calling kernfs_rename(),
+ * otherwise it would be necessary to revert this call if
+ * mongrp_reparent() failed.
+ */
+ ret = kernfs_rename(kn, new_parent, new_name);
+ if (!ret)
+ mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask);
+
+ free_cpumask_var(tmpmask);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+ rdtgroup_kn_put(rdtgrp, kn);
+ rdtgroup_kn_put(new_prdtgrp, new_parent);
+ return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3))
+ seq_puts(seq, ",cdp");
+
+ if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2))
+ seq_puts(seq, ",cdpl2");
+
+ if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA)))
+ seq_puts(seq, ",mba_MBps");
+
+ if (resctrl_debug)
+ seq_puts(seq, ",debug");
+
+ return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+ .mkdir = rdtgroup_mkdir,
+ .rmdir = rdtgroup_rmdir,
+ .rename = rdtgroup_rename,
+ .show_options = rdtgroup_show_options,
+};
+
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx)
+{
+ rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED |
+ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+ &rdtgroup_default);
+ if (IS_ERR(rdt_root))
+ return PTR_ERR(rdt_root);
+
+ ctx->kfc.root = rdt_root;
+ rdtgroup_default.kn = kernfs_root_to_node(rdt_root);
+
+ return 0;
+}
+
+static void rdtgroup_destroy_root(void)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ kernfs_destroy_root(rdt_root);
+ rdtgroup_default.kn = NULL;
+}
+
+static void rdtgroup_setup_default(void)
+{
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID;
+ rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID;
+ rdtgroup_default.type = RDTCTRL_GROUP;
+ INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
+
+ list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static void domain_destroy_mon_state(struct rdt_mon_domain *d)
+{
+ bitmap_free(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ kfree(d->mbm_local);
+}
+
+void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ mutex_lock(&rdtgroup_mutex);
+
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+ mba_sc_domain_destroy(r, d);
+
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ mutex_lock(&rdtgroup_mutex);
+
+ /*
+ * If resctrl is mounted, remove all the
+ * per domain monitor data directories.
+ */
+ if (resctrl_mounted && resctrl_arch_mon_capable())
+ rmdir_mondata_subdir_allrdtgrp(r, d);
+
+ if (resctrl_is_mbm_enabled())
+ cancel_delayed_work(&d->mbm_over);
+ if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) {
+ /*
+ * When a package is going down, forcefully
+ * decrement rmid->ebusy. There is no way to know
+ * that the L3 was flushed and hence may lead to
+ * incorrect counts in rare scenarios, but leaving
+ * the RMID as busy creates RMID leaks if the
+ * package never comes back.
+ */
+ __check_limbo(d, true);
+ cancel_delayed_work(&d->cqm_limbo);
+ }
+
+ domain_destroy_mon_state(d);
+
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+/**
+ * domain_setup_mon_state() - Initialise domain monitoring structures.
+ * @r: The resource for the newly online domain.
+ * @d: The newly online domain.
+ *
+ * Allocate monitor resources that belong to this domain.
+ * Called when the first CPU of a domain comes online, regardless of whether
+ * the filesystem is mounted.
+ * During boot this may be called before global allocations have been made by
+ * resctrl_mon_resource_init().
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ u32 idx_limit = resctrl_arch_system_num_rmid_idx();
+ size_t tsize;
+
+ if (resctrl_arch_is_llc_occupancy_enabled()) {
+ d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL);
+ if (!d->rmid_busy_llc)
+ return -ENOMEM;
+ }
+ if (resctrl_arch_is_mbm_total_enabled()) {
+ tsize = sizeof(*d->mbm_total);
+ d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL);
+ if (!d->mbm_total) {
+ bitmap_free(d->rmid_busy_llc);
+ return -ENOMEM;
+ }
+ }
+ if (resctrl_arch_is_mbm_local_enabled()) {
+ tsize = sizeof(*d->mbm_local);
+ d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL);
+ if (!d->mbm_local) {
+ bitmap_free(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d)
+{
+ int err = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) {
+ /* RDT_RESOURCE_MBA is never mon_capable */
+ err = mba_sc_domain_allocate(r, d);
+ }
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d)
+{
+ int err;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ err = domain_setup_mon_state(r, d);
+ if (err)
+ goto out_unlock;
+
+ if (resctrl_is_mbm_enabled()) {
+ INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL,
+ RESCTRL_PICK_ANY_CPU);
+ }
+
+ if (resctrl_arch_is_llc_occupancy_enabled())
+ INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+
+ /*
+ * If the filesystem is not mounted then only the default resource group
+ * exists. Creation of its directories is deferred until mount time
+ * by rdt_get_tree() calling mkdir_mondata_all().
+ * If resctrl is mounted, add per domain monitor data directories.
+ */
+ if (resctrl_mounted && resctrl_arch_mon_capable())
+ mkdir_mondata_subdir_allrdtgrp(r, d);
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return err;
+}
+
+void resctrl_online_cpu(unsigned int cpu)
+{
+ mutex_lock(&rdtgroup_mutex);
+ /* The CPU is set in default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
+{
+ struct rdtgroup *cr;
+
+ list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask))
+ break;
+ }
+}
+
+static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu,
+ struct rdt_resource *r)
+{
+ struct rdt_mon_domain *d;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry(d, &r->mon_domains, hdr.list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask))
+ return d;
+ }
+
+ return NULL;
+}
+
+void resctrl_offline_cpu(unsigned int cpu)
+{
+ struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+ struct rdt_mon_domain *d;
+ struct rdtgroup *rdtgrp;
+
+ mutex_lock(&rdtgroup_mutex);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
+ clear_childcpus(rdtgrp, cpu);
+ break;
+ }
+ }
+
+ if (!l3->mon_capable)
+ goto out_unlock;
+
+ d = get_mon_domain_from_cpu(cpu, l3);
+ if (d) {
+ if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) {
+ cancel_delayed_work(&d->mbm_over);
+ mbm_setup_overflow_handler(d, 0, cpu);
+ }
+ if (resctrl_arch_is_llc_occupancy_enabled() &&
+ cpu == d->cqm_work_cpu && has_busy_rmid(d)) {
+ cancel_delayed_work(&d->cqm_limbo);
+ cqm_setup_limbo_handler(d, 0, cpu);
+ }
+ }
+
+out_unlock:
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+/*
+ * resctrl_init - resctrl filesystem initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register resctrl filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int resctrl_init(void)
+{
+ int ret = 0;
+
+ seq_buf_init(&last_cmd_status, last_cmd_status_buf,
+ sizeof(last_cmd_status_buf));
+
+ rdtgroup_setup_default();
+
+ thread_throttle_mode_init();
+
+ ret = resctrl_mon_resource_init();
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret) {
+ resctrl_mon_resource_exit();
+ return ret;
+ }
+
+ ret = register_filesystem(&rdt_fs_type);
+ if (ret)
+ goto cleanup_mountpoint;
+
+ /*
+ * Adding the resctrl debugfs directory here may not be ideal since
+ * it would let the resctrl debugfs directory appear on the debugfs
+ * filesystem before the resctrl filesystem is mounted.
+ * It may also be ok since that would enable debugging of RDT before
+ * resctrl is mounted.
+ * The reason why the debugfs directory is created here and not in
+ * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and
+ * during the debugfs directory creation also &sb->s_type->i_mutex_key
+ * (the lockdep class of inode->i_rwsem). Other filesystem
+ * interactions (eg. SyS_getdents) have the lock ordering:
+ * &sb->s_type->i_mutex_key --> &mm->mmap_lock
+ * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
+ * is taken, thus creating dependency:
+ * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
+ * issues considering the other two lock dependencies.
+ * By creating the debugfs directory here we avoid a dependency
+ * that may cause deadlock (even though file operations cannot
+ * occur until the filesystem is mounted, but I do not know how to
+ * tell lockdep that).
+ */
+ debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
+
+ return 0;
+
+cleanup_mountpoint:
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+ resctrl_mon_resource_exit();
+
+ return ret;
+}
+
+static bool resctrl_online_domains_exist(void)
+{
+ struct rdt_resource *r;
+
+ /*
+ * Only walk capable resources to allow resctrl_arch_get_resource()
+ * to return dummy 'not capable' resources.
+ */
+ for_each_alloc_capable_rdt_resource(r) {
+ if (!list_empty(&r->ctrl_domains))
+ return true;
+ }
+
+ for_each_mon_capable_rdt_resource(r) {
+ if (!list_empty(&r->mon_domains))
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * resctrl_exit() - Remove the resctrl filesystem and free resources.
+ *
+ * Called by the architecture code in response to a fatal error.
+ * Removes resctrl files and structures from kernfs to prevent further
+ * configuration.
+ *
+ * When called by the architecture code, all CPUs and resctrl domains must be
+ * offline. This ensures the limbo and overflow handlers are not scheduled to
+ * run, meaning the data structures they access can be freed by
+ * resctrl_mon_resource_exit().
+ *
+ * After resctrl_exit() returns, the architecture code should return an
+ * error from all resctrl_arch_ functions that can do this.
+ * resctrl_arch_get_resource() must continue to return struct rdt_resources
+ * with the correct rid field to ensure the filesystem can be unmounted.
+ */
+void resctrl_exit(void)
+{
+ cpus_read_lock();
+ WARN_ON_ONCE(resctrl_online_domains_exist());
+
+ mutex_lock(&rdtgroup_mutex);
+ resctrl_fs_teardown();
+ mutex_unlock(&rdtgroup_mutex);
+
+ cpus_read_unlock();
+
+ debugfs_remove_recursive(debugfs_resctrl);
+ debugfs_resctrl = NULL;
+ unregister_filesystem(&rdt_fs_type);
+
+ /*
+ * Do not remove the sysfs mount point added by resctrl_init() so that
+ * it can be used to umount resctrl.
+ */
+
+ resctrl_mon_resource_exit();
+}
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 4520ca413867..4b77c6dc4418 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -61,9 +61,9 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
* permit a R/O mapping to be made directly through onto an MTD device if
* possible
*/
-static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
+static int romfs_mmap_prepare(struct vm_area_desc *desc)
{
- return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
+ return is_nommu_shared_mapping(desc->vm_flags) ? 0 : -ENOSYS;
}
static unsigned romfs_mmap_capabilities(struct file *file)
@@ -79,7 +79,7 @@ const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.splice_read = filemap_splice_read,
- .mmap = romfs_mmap,
+ .mmap_prepare = romfs_mmap_prepare,
.get_unmapped_area = romfs_get_unmapped_area,
.mmap_capabilities = romfs_mmap_capabilities,
};
diff --git a/fs/select.c b/fs/select.c
index 7da531b1cf6b..082cf60c7e23 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -192,7 +192,7 @@ static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *k
* and is paired with smp_store_mb() in poll_schedule_timeout.
*/
smp_wmb();
- pwq->triggered = 1;
+ WRITE_ONCE(pwq->triggered, 1);
/*
* Perform the default wake up operation using a dummy
@@ -237,7 +237,7 @@ static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
int rc = -EINTR;
set_current_state(state);
- if (!pwq->triggered)
+ if (!READ_ONCE(pwq->triggered))
rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
@@ -630,7 +630,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
ret = -EINVAL;
- if (n < 0)
+ if (unlikely(n < 0))
goto out_nofds;
/* max_fds can increase, so grab it once to avoid race */
@@ -857,7 +857,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
int fd = pollfd->fd;
__poll_t mask, filter;
- if (fd < 0)
+ if (unlikely(fd < 0))
return 0;
CLASS(fd, f)(fd);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d1a5f43ce466..d469782f97f4 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
return ufd;
}
- file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx,
- O_RDWR | (flags & O_NONBLOCK));
+ file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops,
+ ctx, O_RDWR | (flags & O_NONBLOCK),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
-
fd_install(ufd, file);
} else {
CLASS(fd, f)(ufd);
diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile
index 22023e30915b..4c97b31a25c2 100644
--- a/fs/smb/client/Makefile
+++ b/fs/smb/client/Makefile
@@ -32,6 +32,6 @@ cifs-$(CONFIG_CIFS_SMB_DIRECT) += smbdirect.o
cifs-$(CONFIG_CIFS_ROOT) += cifsroot.o
-cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o
+cifs-$(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) += smb1ops.o cifssmb.o cifstransport.o
cifs-$(CONFIG_CIFS_COMPRESSION) += compress.o compress/lz77.o
diff --git a/fs/smb/client/asn1.c b/fs/smb/client/asn1.c
index b5724ef9f182..214a44509e7b 100644
--- a/fs/smb/client/asn1.c
+++ b/fs/smb/client/asn1.c
@@ -52,6 +52,8 @@ int cifs_neg_token_init_mech_type(void *context, size_t hdrlen,
server->sec_kerberos = true;
else if (oid == OID_ntlmssp)
server->sec_ntlmssp = true;
+ else if (oid == OID_IAKerb)
+ server->sec_iakerb = true;
else {
char buf[50];
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index fe738623cf1b..b69daeb1301b 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -29,7 +29,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
{
struct cached_fid *cfid;
- spin_lock(&cfids->cfid_list_lock);
list_for_each_entry(cfid, &cfids->entries, entry) {
if (!strcmp(cfid->path, path)) {
/*
@@ -38,25 +37,20 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
* being deleted due to a lease break.
*/
if (!cfid->time || !cfid->has_lease) {
- spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
kref_get(&cfid->refcount);
- spin_unlock(&cfids->cfid_list_lock);
return cfid;
}
}
if (lookup_only) {
- spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
if (cfids->num_entries >= max_cached_dirs) {
- spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
cfid = init_cached_dir(path);
if (cfid == NULL) {
- spin_unlock(&cfids->cfid_list_lock);
return NULL;
}
cfid->cfids = cfids;
@@ -74,7 +68,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids,
*/
cfid->has_lease = true;
- spin_unlock(&cfids->cfid_list_lock);
return cfid;
}
@@ -109,7 +102,8 @@ path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path)
while (*s && *s != sep)
s++;
- child = lookup_positive_unlocked(p, dentry, s - p);
+ child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p),
+ dentry);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -161,6 +155,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct cached_fids *cfids;
const char *npath;
int retries = 0, cur_sleep = 1;
+ __le32 lease_flags = 0;
if (cifs_sb->root == NULL)
return -ENOENT;
@@ -187,8 +182,10 @@ replay_again:
if (!utf16_path)
return -ENOMEM;
+ spin_lock(&cfids->cfid_list_lock);
cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs);
if (cfid == NULL) {
+ spin_unlock(&cfids->cfid_list_lock);
kfree(utf16_path);
return -ENOENT;
}
@@ -197,8 +194,8 @@ replay_again:
* Otherwise, it is either a new entry or laundromat worker removed it
* from @cfids->entries. Caller will put last reference if the latter.
*/
- spin_lock(&cfids->cfid_list_lock);
if (cfid->has_lease && cfid->time) {
+ cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
*ret_cfid = cfid;
kfree(utf16_path);
@@ -206,8 +203,10 @@ replay_again:
}
spin_unlock(&cfids->cfid_list_lock);
+ pfid = &cfid->fid;
+
/*
- * Skip any prefix paths in @path as lookup_positive_unlocked() ends up
+ * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up
* calling ->lookup() which already adds those through
* build_path_from_dentry(). Also, do it earlier as we might reconnect
* below when trying to send compounded request and then potentially
@@ -227,6 +226,25 @@ replay_again:
rc = -ENOENT;
goto out;
}
+ if (dentry->d_parent && server->dialect >= SMB30_PROT_ID) {
+ struct cached_fid *parent_cfid;
+
+ spin_lock(&cfids->cfid_list_lock);
+ list_for_each_entry(parent_cfid, &cfids->entries, entry) {
+ if (parent_cfid->dentry == dentry->d_parent) {
+ cifs_dbg(FYI, "found a parent cached file handle\n");
+ if (parent_cfid->has_lease && parent_cfid->time) {
+ lease_flags
+ |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
+ memcpy(pfid->parent_lease_key,
+ parent_cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ }
+ break;
+ }
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+ }
}
cfid->dentry = dentry;
cfid->tcon = tcon;
@@ -241,7 +259,6 @@ replay_again:
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- pfid = &cfid->fid;
server->ops->new_lease_key(pfid);
memset(rqst, 0, sizeof(rqst));
@@ -261,6 +278,7 @@ replay_again:
FILE_READ_EA,
.disposition = FILE_OPEN,
.fid = pfid,
+ .lease_flags = lease_flags,
.replay = !!(retries),
};
@@ -346,6 +364,7 @@ replay_again:
cfid->file_all_info_is_valid = true;
cfid->time = jiffies;
+ cfid->last_access_time = jiffies;
spin_unlock(&cfids->cfid_list_lock);
/* At this point the directory handle is fully cached */
rc = 0;
@@ -492,8 +511,17 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry(cfid, &cfids->entries, entry) {
tmp_list = kmalloc(sizeof(*tmp_list), GFP_ATOMIC);
- if (tmp_list == NULL)
- break;
+ if (tmp_list == NULL) {
+ /*
+ * If the malloc() fails, we won't drop all
+ * dentries, and unmounting is likely to trigger
+ * a 'Dentry still in use' error.
+ */
+ cifs_tcon_dbg(VFS, "Out of memory while dropping dentries\n");
+ spin_unlock(&cfids->cfid_list_lock);
+ spin_unlock(&cifs_sb->tlink_tree_lock);
+ goto done;
+ }
spin_lock(&cfid->fid_lock);
tmp_list->dentry = cfid->dentry;
cfid->dentry = NULL;
@@ -505,6 +533,7 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb)
}
spin_unlock(&cifs_sb->tlink_tree_lock);
+done:
list_for_each_entry_safe(tmp_list, q, &entry, entry) {
list_del(&tmp_list->entry);
dput(tmp_list->dentry);
@@ -590,7 +619,7 @@ static void cached_dir_put_work(struct work_struct *work)
queue_work(serverclose_wq, &cfid->close_work);
}
-int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
+bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16])
{
struct cached_fids *cfids = tcon->cfids;
struct cached_fid *cfid;
@@ -703,8 +732,8 @@ static void cfids_laundromat_worker(struct work_struct *work)
spin_lock(&cfids->cfid_list_lock);
list_for_each_entry_safe(cfid, q, &cfids->entries, entry) {
- if (cfid->time &&
- time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) {
+ if (cfid->last_access_time &&
+ time_after(jiffies, cfid->last_access_time + HZ * dir_cache_timeout)) {
cfid->on_list = false;
list_move(&cfid->entry, &entry);
cfids->num_entries--;
diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h
index 1dfe79d947a6..46b5a2fdf15b 100644
--- a/fs/smb/client/cached_dir.h
+++ b/fs/smb/client/cached_dir.h
@@ -14,19 +14,18 @@ struct cached_dirent {
char *name;
int namelen;
loff_t pos;
-
struct cifs_fattr fattr;
};
struct cached_dirents {
bool is_valid:1;
bool is_failed:1;
- struct dir_context *ctx; /*
- * Only used to make sure we only take entries
- * from a single context. Never dereferenced.
- */
+ struct file *file; /*
+ * Used to associate the cache with a single
+ * open file instance.
+ */
struct mutex de_mutex;
- int pos; /* Expected ctx->pos */
+ loff_t pos; /* Expected ctx->pos */
struct list_head entries;
};
@@ -39,6 +38,7 @@ struct cached_fid {
bool on_list:1;
bool file_all_info_is_valid:1;
unsigned long time; /* jiffies of when lease was taken */
+ unsigned long last_access_time; /* jiffies of when last accessed */
struct kref refcount;
struct cifs_fid fid;
spinlock_t fid_lock;
@@ -80,6 +80,6 @@ extern void drop_cached_dir_by_name(const unsigned int xid,
struct cifs_sb_info *cifs_sb);
extern void close_all_cached_dirs(struct cifs_sb_info *cifs_sb);
extern void invalidate_all_cached_dirs(struct cifs_tcon *tcon);
-extern int cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
+extern bool cached_dir_lease_break(struct cifs_tcon *tcon, __u8 lease_key[16]);
#endif /* _CACHED_DIR_H */
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index e03c890de0a0..beb4f18f05ef 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -26,6 +26,7 @@
#include "smbdirect.h"
#endif
#include "cifs_swn.h"
+#include "cached_dir.h"
void
cifs_dump_mem(char *label, void *data, int length)
@@ -59,7 +60,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
return;
cifs_dbg(VFS, "Dump pending requests:\n");
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
cifs_dbg(VFS, "State: %d Cmd: %d Pid: %d Cbdata: %p Mid %llu\n",
mid_entry->mid_state,
@@ -82,7 +83,7 @@ void cifs_dump_mids(struct TCP_Server_Info *server)
mid_entry->resp_buf, 62);
}
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
#endif /* CONFIG_CIFS_DEBUG2 */
}
@@ -280,6 +281,54 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
return 0;
}
+static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v)
+{
+ struct list_head *stmp, *tmp, *tmp1;
+ struct TCP_Server_Info *server;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+ struct cached_fids *cfids;
+ struct cached_fid *cfid;
+ LIST_HEAD(entry);
+
+ seq_puts(m, "# Version:1\n");
+ seq_puts(m, "# Format:\n");
+ seq_puts(m, "# <tree id> <sess id> <persistent fid> <path>\n");
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each(stmp, &cifs_tcp_ses_list) {
+ server = list_entry(stmp, struct TCP_Server_Info,
+ tcp_ses_list);
+ list_for_each(tmp, &server->smb_ses_list) {
+ ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+ list_for_each(tmp1, &ses->tcon_list) {
+ tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+ cfids = tcon->cfids;
+ spin_lock(&cfids->cfid_list_lock); /* check lock ordering */
+ seq_printf(m, "Num entries: %d\n", cfids->num_entries);
+ list_for_each_entry(cfid, &cfids->entries, entry) {
+ seq_printf(m, "0x%x 0x%llx 0x%llx %s",
+ tcon->tid,
+ ses->Suid,
+ cfid->fid.persistent_fid,
+ cfid->path);
+ if (cfid->file_all_info_is_valid)
+ seq_printf(m, "\tvalid file info");
+ if (cfid->dirents.is_valid)
+ seq_printf(m, ", valid dirents");
+ seq_printf(m, "\n");
+ }
+ spin_unlock(&cfids->cfid_list_lock);
+
+
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+ seq_putc(m, '\n');
+ return 0;
+}
+
static __always_inline const char *compression_alg_str(__le16 alg)
{
switch (alg) {
@@ -362,6 +411,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
c = 0;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
+#ifdef CONFIG_CIFS_SMB_DIRECT
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
+#endif
+
/* channel info will be printed as a part of sessions below */
if (SERVER_IS_CHAN(server))
continue;
@@ -383,25 +437,27 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\nSMBDirect transport not available");
goto skip_rdma;
}
+ sc = &server->smbd_conn->socket;
+ sp = &sc->parameters;
seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
"transport status: %x",
server->smbd_conn->protocol,
- server->smbd_conn->transport_status);
+ server->smbd_conn->socket.status);
seq_printf(m, "\nConn receive_credit_max: %x "
"send_credit_target: %x max_send_size: %x",
- server->smbd_conn->receive_credit_max,
- server->smbd_conn->send_credit_target,
- server->smbd_conn->max_send_size);
+ sp->recv_credit_max,
+ sp->send_credit_target,
+ sp->max_send_size);
seq_printf(m, "\nConn max_fragmented_recv_size: %x "
"max_fragmented_send_size: %x max_receive_size:%x",
- server->smbd_conn->max_fragmented_recv_size,
- server->smbd_conn->max_fragmented_send_size,
- server->smbd_conn->max_receive_size);
+ sp->max_fragmented_recv_size,
+ sp->max_fragmented_send_size,
+ sp->max_recv_size);
seq_printf(m, "\nConn keep_alive_interval: %x "
"max_readwrite_size: %x rdma_readwrite_threshold: %x",
- server->smbd_conn->keep_alive_interval,
- server->smbd_conn->max_readwrite_size,
+ sp->keepalive_interval_msec * 1000,
+ sp->max_read_write_size,
server->smbd_conn->rdma_readwrite_threshold);
seq_printf(m, "\nDebug count_get_receive_buffer: %x "
"count_put_receive_buffer: %x count_send_empty: %x",
@@ -411,15 +467,13 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\nRead Queue count_reassembly_queue: %x "
"count_enqueue_reassembly_queue: %x "
"count_dequeue_reassembly_queue: %x "
- "fragment_reassembly_remaining: %x "
"reassembly_data_length: %x "
"reassembly_queue_length: %x",
server->smbd_conn->count_reassembly_queue,
server->smbd_conn->count_enqueue_reassembly_queue,
server->smbd_conn->count_dequeue_reassembly_queue,
- server->smbd_conn->fragment_reassembly_remaining,
- server->smbd_conn->reassembly_data_length,
- server->smbd_conn->reassembly_queue_length);
+ sc->recv_io.reassembly.data_length,
+ sc->recv_io.reassembly.queue_length);
seq_printf(m, "\nCurrent Credits send_credits: %x "
"receive_credits: %x receive_credit_target: %x",
atomic_read(&server->smbd_conn->send_credits),
@@ -427,10 +481,8 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
server->smbd_conn->receive_credit_target);
seq_printf(m, "\nPending send_pending: %x ",
atomic_read(&server->smbd_conn->send_pending));
- seq_printf(m, "\nReceive buffers count_receive_queue: %x "
- "count_empty_packet_queue: %x",
- server->smbd_conn->count_receive_queue,
- server->smbd_conn->count_empty_packet_queue);
+ seq_printf(m, "\nReceive buffers count_receive_queue: %x ",
+ server->smbd_conn->count_receive_queue);
seq_printf(m, "\nMR responder_resources: %x "
"max_frmr_depth: %x mr_type: %x",
server->smbd_conn->responder_resources,
@@ -618,7 +670,7 @@ skip_rdma:
seq_printf(m, "\n\tServer ConnectionId: 0x%llx",
chan_server->conn_id);
- spin_lock(&chan_server->mid_lock);
+ spin_lock(&chan_server->mid_queue_lock);
list_for_each_entry(mid_entry, &chan_server->pending_mid_q, qhead) {
seq_printf(m, "\n\t\tState: %d com: %d pid: %d cbdata: %p mid %llu",
mid_entry->mid_state,
@@ -627,7 +679,7 @@ skip_rdma:
mid_entry->callback_data,
mid_entry->mid);
}
- spin_unlock(&chan_server->mid_lock);
+ spin_unlock(&chan_server->mid_queue_lock);
}
spin_unlock(&ses->chan_lock);
seq_puts(m, "\n--\n");
@@ -858,6 +910,9 @@ cifs_proc_init(void)
proc_create_single("open_files", 0400, proc_fs_cifs,
cifs_debug_files_proc_show);
+ proc_create_single("open_dirs", 0400, proc_fs_cifs,
+ cifs_debug_dirs_proc_show);
+
proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops);
proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops);
proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops);
@@ -902,6 +957,7 @@ cifs_proc_clean(void)
remove_proc_entry("DebugData", proc_fs_cifs);
remove_proc_entry("open_files", proc_fs_cifs);
+ remove_proc_entry("open_dirs", proc_fs_cifs);
remove_proc_entry("cifsFYI", proc_fs_cifs);
remove_proc_entry("traceSMB", proc_fs_cifs);
remove_proc_entry("Stats", proc_fs_cifs);
@@ -1100,7 +1156,7 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
if ((count < 1) || (count > 11))
return -EINVAL;
- memset(flags_string, 0, 12);
+ memset(flags_string, 0, sizeof(flags_string));
if (copy_from_user(flags_string, buffer, count))
return -EFAULT;
diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h
index 651759192280..5e8d163cb5f8 100644
--- a/fs/smb/client/cifs_fs_sb.h
+++ b/fs/smb/client/cifs_fs_sb.h
@@ -49,6 +49,7 @@
struct cifs_sb_info {
struct rb_root tlink_tree;
+ struct list_head tcon_sb_link;
spinlock_t tlink_tree_lock;
struct tcon_link *master_tlink;
struct nls_table *local_nls;
diff --git a/fs/smb/client/cifs_ioctl.h b/fs/smb/client/cifs_ioctl.h
index 26327442e383..b51ce64fcccf 100644
--- a/fs/smb/client/cifs_ioctl.h
+++ b/fs/smb/client/cifs_ioctl.h
@@ -61,7 +61,7 @@ struct smb_query_info {
struct smb3_key_debug_info {
__u64 Suid;
__u16 cipher_type;
- __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */
+ __u8 auth_key[SMB2_NTLMV2_SESSKEY_SIZE];
__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
} __packed;
diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c
index 28f568b5fc27..43b86fa4d695 100644
--- a/fs/smb/client/cifs_spnego.c
+++ b/fs/smb/client/cifs_spnego.c
@@ -124,53 +124,44 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo,
dp = description;
/* start with version and hostname portion of UNC string */
spnego_key = ERR_PTR(-EINVAL);
- sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
- hostname);
- dp = description + strlen(description);
+ dp += sprintf(dp, "ver=0x%x;host=%s;", CIFS_SPNEGO_UPCALL_VERSION,
+ hostname);
/* add the server address */
if (server->dstaddr.ss_family == AF_INET)
- sprintf(dp, "ip4=%pI4", &sa->sin_addr);
+ dp += sprintf(dp, "ip4=%pI4", &sa->sin_addr);
else if (server->dstaddr.ss_family == AF_INET6)
- sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
+ dp += sprintf(dp, "ip6=%pI6", &sa6->sin6_addr);
else
goto out;
- dp = description + strlen(description);
-
- /* for now, only sec=krb5 and sec=mskrb5 are valid */
+ /* for now, only sec=krb5 and sec=mskrb5 and iakerb are valid */
if (server->sec_kerberos)
- sprintf(dp, ";sec=krb5");
+ dp += sprintf(dp, ";sec=krb5");
else if (server->sec_mskerberos)
- sprintf(dp, ";sec=mskrb5");
+ dp += sprintf(dp, ";sec=mskrb5");
+ else if (server->sec_iakerb)
+ dp += sprintf(dp, ";sec=iakerb");
else {
cifs_dbg(VFS, "unknown or missing server auth type, use krb5\n");
- sprintf(dp, ";sec=krb5");
+ dp += sprintf(dp, ";sec=krb5");
}
- dp = description + strlen(description);
- sprintf(dp, ";uid=0x%x",
- from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
+ dp += sprintf(dp, ";uid=0x%x",
+ from_kuid_munged(&init_user_ns, sesInfo->linux_uid));
- dp = description + strlen(description);
- sprintf(dp, ";creduid=0x%x",
+ dp += sprintf(dp, ";creduid=0x%x",
from_kuid_munged(&init_user_ns, sesInfo->cred_uid));
- if (sesInfo->user_name) {
- dp = description + strlen(description);
- sprintf(dp, ";user=%s", sesInfo->user_name);
- }
+ if (sesInfo->user_name)
+ dp += sprintf(dp, ";user=%s", sesInfo->user_name);
- dp = description + strlen(description);
- sprintf(dp, ";pid=0x%x", current->pid);
+ dp += sprintf(dp, ";pid=0x%x", current->pid);
- if (sesInfo->upcall_target == UPTARGET_MOUNT) {
- dp = description + strlen(description);
- sprintf(dp, ";upcall_target=mount");
- } else {
- dp = description + strlen(description);
- sprintf(dp, ";upcall_target=app");
- }
+ if (sesInfo->upcall_target == UPTARGET_MOUNT)
+ dp += sprintf(dp, ";upcall_target=mount");
+ else
+ dp += sprintf(dp, ";upcall_target=app");
cifs_dbg(FYI, "key description = %s\n", description);
saved_cred = override_creds(spnego_cred);
diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c
index ba79aa2107cc..63b3b1290bed 100644
--- a/fs/smb/client/cifsacl.c
+++ b/fs/smb/client/cifsacl.c
@@ -763,7 +763,7 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl,
struct cifs_fattr *fattr, bool mode_from_special_sid)
{
int i;
- int num_aces = 0;
+ u16 num_aces = 0;
int acl_size;
char *acl_base;
struct smb_ace **ppace;
@@ -778,14 +778,15 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl,
}
/* validate that we do not go past end of acl */
- if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
+ if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) ||
+ end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) {
cifs_dbg(VFS, "ACL too small to parse DACL\n");
return;
}
cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n",
le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
- le32_to_cpu(pdacl->num_aces));
+ le16_to_cpu(pdacl->num_aces));
/* reset rwx permissions for user/group/other.
Also, if num_aces is 0 i.e. DACL has no ACEs,
@@ -795,19 +796,38 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl,
acl_base = (char *)pdacl;
acl_size = sizeof(struct smb_acl);
- num_aces = le32_to_cpu(pdacl->num_aces);
+ num_aces = le16_to_cpu(pdacl->num_aces);
if (num_aces > 0) {
umode_t denied_mode = 0;
- if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
+ if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) /
+ (offsetof(struct smb_ace, sid) +
+ offsetof(struct smb_sid, sub_auth) + sizeof(__le16)))
return;
+
ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *),
GFP_KERNEL);
if (!ppace)
return;
for (i = 0; i < num_aces; ++i) {
+ if (end_of_acl - acl_base < acl_size)
+ break;
+
ppace[i] = (struct smb_ace *) (acl_base + acl_size);
+ acl_base = (char *)ppace[i];
+ acl_size = offsetof(struct smb_ace, sid) +
+ offsetof(struct smb_sid, sub_auth);
+
+ if (end_of_acl - acl_base < acl_size ||
+ ppace[i]->sid.num_subauth == 0 ||
+ ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES ||
+ (end_of_acl - acl_base <
+ acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) ||
+ (le16_to_cpu(ppace[i]->size) <
+ acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth))
+ break;
+
#ifdef CONFIG_CIFS_DEBUG2
dump_ace(ppace[i], end_of_acl);
#endif
@@ -851,7 +871,6 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl,
(void *)ppace[i],
sizeof(struct smb_ace)); */
- acl_base = (char *)ppace[i];
acl_size = le16_to_cpu(ppace[i]->size);
}
@@ -937,12 +956,12 @@ unsigned int setup_special_user_owner_ACE(struct smb_ace *pntace)
static void populate_new_aces(char *nacl_base,
struct smb_sid *pownersid,
struct smb_sid *pgrpsid,
- __u64 *pnmode, u32 *pnum_aces, u16 *pnsize,
+ __u64 *pnmode, u16 *pnum_aces, u16 *pnsize,
bool modefromsid,
bool posix)
{
__u64 nmode;
- u32 num_aces = 0;
+ u16 num_aces = 0;
u16 nsize = 0;
__u64 user_mode;
__u64 group_mode;
@@ -1050,7 +1069,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p
u16 size = 0;
struct smb_ace *pntace = NULL;
char *acl_base = NULL;
- u32 src_num_aces = 0;
+ u16 src_num_aces = 0;
u16 nsize = 0;
struct smb_ace *pnntace = NULL;
char *nacl_base = NULL;
@@ -1058,7 +1077,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p
acl_base = (char *)pdacl;
size = sizeof(struct smb_acl);
- src_num_aces = le32_to_cpu(pdacl->num_aces);
+ src_num_aces = le16_to_cpu(pdacl->num_aces);
nacl_base = (char *)pndacl;
nsize = sizeof(struct smb_acl);
@@ -1090,11 +1109,11 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl,
u16 size = 0;
struct smb_ace *pntace = NULL;
char *acl_base = NULL;
- u32 src_num_aces = 0;
+ u16 src_num_aces = 0;
u16 nsize = 0;
struct smb_ace *pnntace = NULL;
char *nacl_base = NULL;
- u32 num_aces = 0;
+ u16 num_aces = 0;
bool new_aces_set = false;
/* Assuming that pndacl and pnmode are never NULL */
@@ -1112,7 +1131,7 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl,
acl_base = (char *)pdacl;
size = sizeof(struct smb_acl);
- src_num_aces = le32_to_cpu(pdacl->num_aces);
+ src_num_aces = le16_to_cpu(pdacl->num_aces);
/* Retain old ACEs which we can retain */
for (i = 0; i < src_num_aces; ++i) {
@@ -1158,7 +1177,7 @@ next_ace:
}
finalize_dacl:
- pndacl->num_aces = cpu_to_le32(num_aces);
+ pndacl->num_aces = cpu_to_le16(num_aces);
pndacl->size = cpu_to_le16(nsize);
return 0;
@@ -1293,7 +1312,7 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd,
dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION);
ndacl_ptr->size = cpu_to_le16(0);
- ndacl_ptr->num_aces = cpu_to_le32(0);
+ ndacl_ptr->num_aces = cpu_to_le16(0);
rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr,
pnmode, mode_from_sid, posix);
@@ -1395,7 +1414,7 @@ chown_chgrp_exit:
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen,
- u32 __maybe_unused unused)
+ u32 info)
{
struct smb_ntsd *pntsd = NULL;
unsigned int xid;
@@ -1407,7 +1426,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
xid = get_xid();
rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
- pacllen);
+ pacllen, info);
free_xid(xid);
cifs_put_tlink(tlink);
@@ -1419,7 +1438,7 @@ struct smb_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
}
static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
- const char *path, u32 *pacllen)
+ const char *path, u32 *pacllen, u32 info)
{
struct smb_ntsd *pntsd = NULL;
int oplock = 0;
@@ -1446,9 +1465,12 @@ static struct smb_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
.fid = &fid,
};
+ if (info & SACL_SECINFO)
+ oparms.desired_access |= SYSTEM_SECURITY;
+
rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (!rc) {
- rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+ rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen, info);
CIFSSMBClose(xid, tcon, fid.netfid);
}
@@ -1472,7 +1494,7 @@ struct smb_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
if (inode)
open_file = find_readable_file(CIFS_I(inode), true);
if (!open_file)
- return get_cifs_acl_by_path(cifs_sb, path, pacllen);
+ return get_cifs_acl_by_path(cifs_sb, path, pacllen, info);
pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen, info);
cifsFileInfo_put(open_file);
@@ -1485,7 +1507,7 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen,
{
int oplock = 0;
unsigned int xid;
- int rc, access_flags;
+ int rc, access_flags = 0;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -1498,10 +1520,12 @@ int set_cifs_acl(struct smb_ntsd *pnntsd, __u32 acllen,
tcon = tlink_tcon(tlink);
xid = get_xid();
- if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
- access_flags = WRITE_OWNER;
- else
- access_flags = WRITE_DAC;
+ if (aclflag & CIFS_ACL_OWNER || aclflag & CIFS_ACL_GROUP)
+ access_flags |= WRITE_OWNER;
+ if (aclflag & CIFS_ACL_SACL)
+ access_flags |= SYSTEM_SECURITY;
+ if (aclflag & CIFS_ACL_DACL)
+ access_flags |= WRITE_DAC;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -1541,7 +1565,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
int rc = 0;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
struct smb_version_operations *ops;
- const u32 info = 0;
+ const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
@@ -1595,7 +1619,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
struct tcon_link *tlink;
struct smb_version_operations *ops;
bool mode_from_sid, id_from_sid;
- const u32 info = 0;
+ const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
bool posix;
tlink = cifs_sb_tlink(cifs_sb);
@@ -1648,7 +1672,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode,
dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
if (mode_from_sid)
nsecdesclen +=
- le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace);
+ le16_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace);
else /* cifsacl */
nsecdesclen += le16_to_cpu(dacl_ptr->size);
}
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index e69968e88fe7..3cc686246908 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -343,7 +343,7 @@ static struct ntlmssp2_name *find_next_av(struct cifs_ses *ses,
len = AV_LEN(av);
if (AV_TYPE(av) == NTLMSSP_AV_EOL)
return NULL;
- if (!len || (u8 *)av + sizeof(*av) + len > end)
+ if ((u8 *)av + sizeof(*av) + len > end)
return NULL;
return av;
}
@@ -363,7 +363,7 @@ static int find_av_name(struct cifs_ses *ses, u16 type, char **name, u16 maxlen)
av_for_each_entry(ses, av) {
len = AV_LEN(av);
- if (AV_TYPE(av) != type)
+ if (AV_TYPE(av) != type || !len)
continue;
if (!IS_ALIGNED(len, sizeof(__le16))) {
cifs_dbg(VFS | ONCE, "%s: bad length(%u) for type %u\n",
@@ -532,17 +532,67 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_
return rc;
}
+/*
+ * Set up NTLMv2 response blob with SPN (cifs/<hostname>) appended to the
+ * existing list of AV pairs.
+ */
+static int set_auth_key_response(struct cifs_ses *ses)
+{
+ size_t baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
+ size_t len, spnlen, tilen = 0, num_avs = 2 /* SPN + EOL */;
+ struct TCP_Server_Info *server = ses->server;
+ char *spn __free(kfree) = NULL;
+ struct ntlmssp2_name *av;
+ char *rsp = NULL;
+ int rc;
+
+ spnlen = strlen(server->hostname);
+ len = sizeof("cifs/") + spnlen;
+ spn = kmalloc(len, GFP_KERNEL);
+ if (!spn) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ spnlen = scnprintf(spn, len, "cifs/%.*s",
+ (int)spnlen, server->hostname);
+
+ av_for_each_entry(ses, av)
+ tilen += sizeof(*av) + AV_LEN(av);
+
+ len = baselen + tilen + spnlen * sizeof(__le16) + num_avs * sizeof(*av);
+ rsp = kmalloc(len, GFP_KERNEL);
+ if (!rsp) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(rsp + baselen, ses->auth_key.response, tilen);
+ av = (void *)(rsp + baselen + tilen);
+ av->type = cpu_to_le16(NTLMSSP_AV_TARGET_NAME);
+ av->length = cpu_to_le16(spnlen * sizeof(__le16));
+ cifs_strtoUTF16((__le16 *)av->data, spn, spnlen, ses->local_nls);
+ av = (void *)((__u8 *)av + sizeof(*av) + AV_LEN(av));
+ av->type = cpu_to_le16(NTLMSSP_AV_EOL);
+ av->length = 0;
+
+ rc = 0;
+ ses->auth_key.len = len;
+out:
+ ses->auth_key.response = rsp;
+ return rc;
+}
+
int
setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
{
struct shash_desc *hmacmd5 = NULL;
- int rc;
- int baselen;
- unsigned int tilen;
+ unsigned char *tiblob = NULL; /* target info blob */
struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
- unsigned char *tiblob = NULL; /* target info blob */
__le64 rsp_timestamp;
+ __u64 cc;
+ int rc;
if (nls_cp == NULL) {
cifs_dbg(VFS, "%s called with nls_cp==NULL\n", __func__);
@@ -588,32 +638,25 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
* (as Windows 7 does)
*/
rsp_timestamp = find_timestamp(ses);
+ get_random_bytes(&cc, sizeof(cc));
- baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
- tilen = ses->auth_key.len;
- tiblob = ses->auth_key.response;
+ cifs_server_lock(ses->server);
- ses->auth_key.response = kmalloc(baselen + tilen, GFP_KERNEL);
- if (!ses->auth_key.response) {
- rc = -ENOMEM;
+ tiblob = ses->auth_key.response;
+ rc = set_auth_key_response(ses);
+ if (rc) {
ses->auth_key.len = 0;
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
- ses->auth_key.len += baselen;
ntlmv2 = (struct ntlmv2_resp *)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
ntlmv2->blob_signature = cpu_to_le32(0x00000101);
ntlmv2->reserved = 0;
ntlmv2->time = rsp_timestamp;
-
- get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
+ ntlmv2->client_chal = cc;
ntlmv2->reserved2 = 0;
- memcpy(ses->auth_key.response + baselen, tiblob, tilen);
-
- cifs_server_lock(ses->server);
-
rc = cifs_alloc_hash("hmac(md5)", &hmacmd5);
if (rc) {
cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc);
@@ -704,18 +747,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server)
cifs_free_hash(&server->secmech.md5);
cifs_free_hash(&server->secmech.sha512);
- if (!SERVER_IS_CHAN(server)) {
- if (server->secmech.enc) {
- crypto_free_aead(server->secmech.enc);
- server->secmech.enc = NULL;
- }
-
- if (server->secmech.dec) {
- crypto_free_aead(server->secmech.dec);
- server->secmech.dec = NULL;
- }
- } else {
+ if (server->secmech.enc) {
+ crypto_free_aead(server->secmech.enc);
server->secmech.enc = NULL;
+ }
+ if (server->secmech.dec) {
+ crypto_free_aead(server->secmech.dec);
server->secmech.dec = NULL;
}
}
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index b800c9f585d8..3bd85ab2deb1 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -70,7 +70,6 @@ bool require_gcm_256; /* false by default */
bool enable_negotiate_signing; /* false by default */
unsigned int global_secflags = CIFSSEC_DEF;
/* unsigned int ntlmv2_support = 0; */
-unsigned int sign_CIFS_PDUs = 1;
/*
* Global transaction id (XID) information
@@ -78,7 +77,7 @@ unsigned int sign_CIFS_PDUs = 1;
unsigned int GlobalCurrentXid; /* protected by GlobalMid_Lock */
unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Lock */
unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Lock */
-spinlock_t GlobalMid_Lock; /* protects above & list operations on midQ entries */
+DEFINE_SPINLOCK(GlobalMid_Lock); /* protects above & list operations on midQ entries */
/*
* Global counters, updated atomically
@@ -98,7 +97,7 @@ atomic_t total_buf_alloc_count;
atomic_t total_small_buf_alloc_count;
#endif/* STATS2 */
struct list_head cifs_tcp_ses_list;
-spinlock_t cifs_tcp_ses_lock;
+DEFINE_SPINLOCK(cifs_tcp_ses_lock);
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
module_param(CIFSMaxBufSize, uint, 0444);
@@ -261,9 +260,9 @@ cifs_read_super(struct super_block *sb)
}
if (tcon->nocase)
- sb->s_d_op = &cifs_ci_dentry_ops;
+ set_default_d_op(sb, &cifs_ci_dentry_ops);
else
- sb->s_d_op = &cifs_dentry_ops;
+ set_default_d_op(sb, &cifs_dentry_ops);
sb->s_root = d_make_root(inode);
if (!sb->s_root) {
@@ -637,6 +636,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_sb->ctx->dir_mode);
if (cifs_sb->ctx->iocharset)
seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset);
+ if (tcon->ses->unicode == 0)
+ seq_puts(s, ",nounicode");
+ else if (tcon->ses->unicode == 1)
+ seq_puts(s, ",unicode");
if (tcon->seal)
seq_puts(s, ",seal");
else if (tcon->ses->server->ignore_signature)
@@ -715,6 +718,12 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_sb->ctx->backupgid));
seq_show_option(s, "reparse",
cifs_reparse_type_str(cifs_sb->ctx->reparse_type));
+ if (cifs_sb->ctx->nonativesocket)
+ seq_puts(s, ",nonativesocket");
+ else
+ seq_puts(s, ",nativesocket");
+ seq_show_option(s, "symlink",
+ cifs_symlink_type_str(cifs_symlink_type(cifs_sb)));
seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize);
seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize);
@@ -919,7 +928,8 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
while (*s && *s != sep)
s++;
- child = lookup_positive_unlocked(p, dentry, s - p);
+ child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p),
+ dentry);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -1515,7 +1525,7 @@ const struct file_operations cifs_file_ops = {
.flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1535,7 +1545,7 @@ const struct file_operations cifs_file_strict_ops = {
.flock = cifs_flock,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_strict_mmap,
+ .mmap_prepare = cifs_file_strict_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1555,7 +1565,7 @@ const struct file_operations cifs_file_direct_ops = {
.flock = cifs_flock,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = copy_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
@@ -1573,7 +1583,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.release = cifs_close,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1591,7 +1601,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.release = cifs_close,
.fsync = cifs_strict_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_strict_mmap,
+ .mmap_prepare = cifs_file_strict_mmap_prepare,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
.llseek = cifs_llseek,
@@ -1609,7 +1619,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.release = cifs_close,
.fsync = cifs_fsync,
.flush = cifs_flush,
- .mmap = cifs_file_mmap,
+ .mmap_prepare = cifs_file_mmap_prepare,
.splice_read = copy_splice_read,
.splice_write = iter_file_splice_write,
.unlocked_ioctl = cifs_ioctl,
@@ -1853,8 +1863,6 @@ init_cifs(void)
GlobalCurrentXid = 0;
GlobalTotalActiveXid = 0;
GlobalMaxActiveXid = 0;
- spin_lock_init(&cifs_tcp_ses_lock);
- spin_lock_init(&GlobalMid_Lock);
cifs_lock_secret = get_random_u32();
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index a762dbbbd959..3ce7c614ccc0 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -59,8 +59,8 @@ extern int cifs_unlink(struct inode *dir, struct dentry *dentry);
extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
umode_t, dev_t);
-extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
- umode_t);
+extern struct dentry *cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+ umode_t);
extern int cifs_rmdir(struct inode *, struct dentry *);
extern int cifs_rename2(struct mnt_idmap *, struct inode *,
struct dentry *, struct inode *, struct dentry *,
@@ -103,8 +103,8 @@ extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_flush(struct file *, fl_owner_t id);
-extern int cifs_file_mmap(struct file *file, struct vm_area_struct *vma);
-extern int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma);
+int cifs_file_mmap_prepare(struct vm_area_desc *desc);
+int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc);
extern const struct file_operations cifs_dir_ops;
extern int cifs_readdir(struct file *file, struct dir_context *ctx);
@@ -135,7 +135,6 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid,
extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern void cifs_setsize(struct inode *inode, loff_t offset);
-extern int cifs_truncate_page(struct address_space *mapping, loff_t from);
struct smb3_fs_context;
extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type,
@@ -146,6 +145,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 52
-#define CIFS_VERSION "2.52"
+#define SMB3_PRODUCT_BUILD 56
+#define CIFS_VERSION "2.56"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 49ffc040f736..1e64a4fb6af0 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -151,6 +151,7 @@ enum securityEnum {
NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */
RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */
Kerberos, /* Kerberos via SPNEGO */
+ IAKerb, /* Kerberos proxy */
};
enum upcall_target_enum {
@@ -160,6 +161,7 @@ enum upcall_target_enum {
};
enum cifs_reparse_type {
+ CIFS_REPARSE_TYPE_NONE,
CIFS_REPARSE_TYPE_NFS,
CIFS_REPARSE_TYPE_WSL,
CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS,
@@ -168,6 +170,8 @@ enum cifs_reparse_type {
static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type)
{
switch (type) {
+ case CIFS_REPARSE_TYPE_NONE:
+ return "none";
case CIFS_REPARSE_TYPE_NFS:
return "nfs";
case CIFS_REPARSE_TYPE_WSL:
@@ -177,6 +181,39 @@ static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type)
}
}
+enum cifs_symlink_type {
+ CIFS_SYMLINK_TYPE_DEFAULT,
+ CIFS_SYMLINK_TYPE_NONE,
+ CIFS_SYMLINK_TYPE_NATIVE,
+ CIFS_SYMLINK_TYPE_UNIX,
+ CIFS_SYMLINK_TYPE_MFSYMLINKS,
+ CIFS_SYMLINK_TYPE_SFU,
+ CIFS_SYMLINK_TYPE_NFS,
+ CIFS_SYMLINK_TYPE_WSL,
+};
+
+static inline const char *cifs_symlink_type_str(enum cifs_symlink_type type)
+{
+ switch (type) {
+ case CIFS_SYMLINK_TYPE_NONE:
+ return "none";
+ case CIFS_SYMLINK_TYPE_NATIVE:
+ return "native";
+ case CIFS_SYMLINK_TYPE_UNIX:
+ return "unix";
+ case CIFS_SYMLINK_TYPE_MFSYMLINKS:
+ return "mfsymlinks";
+ case CIFS_SYMLINK_TYPE_SFU:
+ return "sfu";
+ case CIFS_SYMLINK_TYPE_NFS:
+ return "nfs";
+ case CIFS_SYMLINK_TYPE_WSL:
+ return "wsl";
+ default:
+ return "unknown";
+ }
+}
+
struct session_key {
unsigned int len;
char *response;
@@ -215,10 +252,8 @@ struct cifs_cred {
struct cifs_open_info_data {
bool adjust_tz;
- union {
- bool reparse_point;
- bool symlink;
- };
+ bool reparse_point;
+ bool contains_posix_file_info;
struct {
/* ioctl response buffer */
struct {
@@ -226,10 +261,7 @@ struct cifs_open_info_data {
struct kvec iov;
} io;
__u32 tag;
- union {
- struct reparse_data_buffer *buf;
- struct reparse_posix_data *posix;
- };
+ struct reparse_data_buffer *buf;
} reparse;
struct {
__u8 eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE];
@@ -326,7 +358,7 @@ struct smb_version_operations {
int (*handle_cancelled_mid)(struct mid_q_entry *, struct TCP_Server_Info *);
void (*downgrade_oplock)(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache);
+ __u16 epoch, bool *purge_cache);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -521,12 +553,12 @@ struct smb_version_operations {
/* if we can do cache read operations */
bool (*is_read_op)(__u32);
/* set oplock level for the inode */
- void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int,
- bool *);
+ void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch,
+ bool *purge_cache);
/* create lease context buffer for CREATE request */
- char * (*create_lease_buf)(u8 *lease_key, u8 oplock);
+ char * (*create_lease_buf)(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 le_flags);
/* parse lease context buffer and return oplock/epoch info */
- __u8 (*parse_lease_buf)(void *buf, unsigned int *epoch, char *lkey);
+ __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey);
ssize_t (*copychunk_range)(const unsigned int,
struct cifsFileInfo *src_file,
struct cifsFileInfo *target_file,
@@ -593,16 +625,16 @@ struct smb_version_operations {
bool (*is_status_io_timeout)(char *buf);
/* Check for STATUS_NETWORK_NAME_DELETED */
bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv);
- int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data);
- int (*create_reparse_symlink)(const unsigned int xid,
- struct inode *inode,
- struct dentry *dentry,
- struct cifs_tcon *tcon,
- const char *full_path,
- const char *symname);
+ struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov,
+ u32 *plen);
+ struct inode * (*create_reparse_inode)(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ bool directory,
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov);
};
struct smb_version_values {
@@ -621,6 +653,7 @@ struct smb_version_values {
unsigned int cap_unix;
unsigned int cap_nt_find;
unsigned int cap_large_files;
+ unsigned int cap_unicode;
__u16 signing_enabled;
__u16 signing_required;
size_t create_lease_size;
@@ -678,9 +711,12 @@ inc_rfc1001_len(void *buf, int count)
struct TCP_Server_Info {
struct list_head tcp_ses_list;
struct list_head smb_ses_list;
+ struct list_head rlist; /* reconnect list */
spinlock_t srv_lock; /* protect anything here that is not protected */
__u64 conn_id; /* connection identifier (useful for debugging) */
int srv_count; /* reference counter */
+ int rfc1001_sessinit; /* whether to estasblish netbios session */
+ bool with_rfc1001; /* if netbios session is used */
/* 15 character server name + 0x20 16th byte indicating type = srv */
char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
struct smb_version_operations *ops;
@@ -696,7 +732,8 @@ struct TCP_Server_Info {
#endif
wait_queue_head_t response_q;
wait_queue_head_t request_q; /* if more than maxmpx to srvr must block*/
- spinlock_t mid_lock; /* protect mid queue and it's entries */
+ spinlock_t mid_queue_lock; /* protect mid queue */
+ spinlock_t mid_counter_lock;
struct list_head pending_mid_q;
bool noblocksnd; /* use blocking sendmsg */
bool noautotune; /* do not autotune send buf sizes */
@@ -734,14 +771,16 @@ struct TCP_Server_Info {
/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
unsigned int capabilities; /* selective disabling of caps by smb sess */
int timeAdj; /* Adjust for difference in server time zone in sec */
- __u64 CurrentMid; /* multiplex id - rotating counter, protected by GlobalMid_Lock */
+ __u64 current_mid; /* multiplex id - rotating counter, protected by mid_counter_lock */
char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
/* 16th byte of RFC1001 workstation name is always null */
char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
__u32 sequence_number; /* for signing, protected by srv_mutex */
__u32 reconnect_instance; /* incremented on each reconnect */
+ __le32 session_key_id; /* retrieved from negotiate response and send in session setup request */
struct session_key session_key;
unsigned long lstrp; /* when we got last response from this server */
+ unsigned long neg_start; /* when negotiate started (jiffies) */
struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */
#define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */
#define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */
@@ -751,6 +790,7 @@ struct TCP_Server_Info {
bool sec_kerberosu2u; /* supports U2U Kerberos */
bool sec_kerberos; /* supports plain Kerberos */
bool sec_mskerberos; /* supports legacy MS Kerberos */
+ bool sec_iakerb; /* supports pass-through auth for Kerberos (krb5 proxy) */
bool large_buf; /* is current buffer large? */
/* use SMBD connection instead of socket */
bool rdma;
@@ -1050,6 +1090,7 @@ struct cifs_chan {
};
#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+#define CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES (0x2)
/*
* Session structure. One of these for each uid session with a particular host
@@ -1087,6 +1128,7 @@ struct cifs_ses {
bool sign; /* is signing required? */
bool domainAuto:1;
bool expired_pwd; /* track if access denied or expired pwd so can know if need to update */
+ int unicode;
unsigned int flags;
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
@@ -1265,6 +1307,7 @@ struct cifs_tcon {
bool use_persistent:1; /* use persistent instead of durable handles */
bool no_lease:1; /* Do not request leases on files or directories */
bool use_witness:1; /* use witness protocol */
+ bool dummy:1; /* dummy tcon used for reconnecting channels */
__le32 capabilities;
__u32 share_flags;
__u32 maximal_access;
@@ -1286,7 +1329,8 @@ struct cifs_tcon {
#endif
struct list_head pending_opens; /* list of incomplete opens */
struct cached_fids *cfids;
- /* BB add field for back pointer to sb struct(s)? */
+ struct list_head cifs_sb_list;
+ spinlock_t sb_list_lock;
#ifdef CONFIG_CIFS_DFS_UPCALL
struct delayed_work dfs_cache_work;
struct list_head dfs_ses_list;
@@ -1405,6 +1449,7 @@ struct cifs_open_parms {
bool reconnect:1;
bool replay:1; /* indicates that this open is for a replay */
struct kvec *ea_cctx;
+ __le32 lease_flags;
};
struct cifs_fid {
@@ -1412,10 +1457,11 @@ struct cifs_fid {
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+ __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE];
__u8 create_guid[16];
__u32 access;
struct cifs_pending_open *pending_open;
- unsigned int epoch;
+ __u16 epoch;
#ifdef CONFIG_CIFS_DEBUG2
__u64 mid;
#endif /* CIFS_DEBUG2 */
@@ -1448,7 +1494,7 @@ struct cifsFileInfo {
bool oplock_break_cancelled:1;
bool status_file_deleted:1; /* file has been deleted */
bool offload:1; /* offload final part of _put to a wq */
- unsigned int oplock_epoch; /* epoch from the lease break */
+ __u16 oplock_epoch; /* epoch from the lease break */
__u32 oplock_level; /* oplock/lease level from the lease break */
int count;
spinlock_t file_info_lock; /* protects four flag/count fields above */
@@ -1476,7 +1522,6 @@ struct cifs_io_parms {
struct cifs_io_request {
struct netfs_io_request rreq;
struct cifsFileInfo *cfile;
- struct TCP_Server_Info *server;
pid_t pid;
};
@@ -1545,7 +1590,7 @@ struct cifsInodeInfo {
spinlock_t open_file_lock; /* protects openFileList */
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
unsigned int oplock; /* oplock/lease level we have */
- unsigned int epoch; /* used to track lease state changes */
+ __u16 epoch; /* used to track lease state changes */
#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
#define CIFS_INODE_FLAG_UNUSED (2) /* Unused flag */
@@ -1684,9 +1729,12 @@ struct mid_q_entry {
void *resp_buf; /* pointer to received SMB header */
unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
- unsigned int mid_flags;
+ int mid_rc; /* rc for MID_RC */
__le16 command; /* smb command code */
unsigned int optype; /* operation type */
+ spinlock_t mid_lock;
+ bool wait_cancelled:1; /* Cancelled while waiting for response */
+ bool deleted_from_q:1; /* Whether Mid has been dequeued frem pending_mid_q */
bool large_buf:1; /* if valid response, is pointer to large buf */
bool multiRsp:1; /* multiple trans2 responses for one request */
bool multiEnd:1; /* both received */
@@ -1846,10 +1894,7 @@ static inline bool is_replayable_error(int error)
#define MID_RESPONSE_MALFORMED 0x10
#define MID_SHUTDOWN 0x20
#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */
-
-/* Flags */
-#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
-#define MID_DELETED 2 /* Mid has been dequeued/deleted */
+#define MID_RC 0x80 /* mid_rc contains custom rc */
/* Types of response buffer returned from SendReceive2 */
#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
@@ -1951,8 +1996,7 @@ require use of the stronger protocol */
* TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session
* reconnect_mutex
* TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session
- * cifs_ses->session_mutex cifs_ses sesInfoAlloc
- * cifs_tcon
+ * cifs_ses->session_mutex cifs_ses sesInfoAlloc
* cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc
* cifs_tcon->pending_opens
* cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc
@@ -1962,33 +2006,40 @@ require use of the stronger protocol */
* GlobalCurrentXid
* GlobalTotalActiveXid
* TCP_Server_Info->srv_lock (anything in struct not protected by another lock and can change)
- * TCP_Server_Info->mid_lock TCP_Server_Info->pending_mid_q cifs_get_tcp_session
- * ->CurrentMid
- * (any changes in mid_q_entry fields)
+ * TCP_Server_Info->mid_queue_lock TCP_Server_Info->pending_mid_q cifs_get_tcp_session
+ * mid_q_entry->deleted_from_q
+ * TCP_Server_Info->mid_counter_lock TCP_Server_Info->current_mid cifs_get_tcp_session
* TCP_Server_Info->req_lock TCP_Server_Info->in_flight cifs_get_tcp_session
* ->credits
* ->echo_credits
* ->oplock_credits
* ->reconnect_instance
* cifs_ses->ses_lock (anything that is not protected by another lock and can change)
+ * sesInfoAlloc
* cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc
* ->iface_count
* ->iface_last_update
- * cifs_ses->chan_lock cifs_ses->chans
+ * cifs_ses->chan_lock cifs_ses->chans sesInfoAlloc
* ->chans_need_reconnect
* ->chans_in_reconnect
* cifs_tcon->tc_lock (anything that is not protected by another lock and can change)
+ * tcon_info_alloc
* inode->i_rwsem, taken by fs/netfs/locking.c e.g. should be taken before cifsInodeInfo locks
* cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode
* cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc
* cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once
* ->can_cache_brlcks
* cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc
- * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs
- * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
+ * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs
+ * cached_fid->fid_lock (anything that is not protected by another lock and can change)
+ * init_cached_dir
+ * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo
* cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo
* ->invalidHandle initiate_cifs_search
* ->oplock_break_cancelled
+ * mid_q_entry->mid_lock mid_q_entry->callback alloc_mid
+ * smb2_mid_entry_alloc
+ * (Any fields of mid_q_entry that will need protection)
****************************************************************************/
#ifdef DECLARE_GLOBALS_HERE
@@ -2118,6 +2169,8 @@ static inline char *get_security_type_str(enum securityEnum sectype)
return "Kerberos";
case NTLMv2:
return "NTLMv2";
+ case IAKerb:
+ return "IAKerb";
default:
return "Unknown";
}
@@ -2173,11 +2226,13 @@ static inline size_t ntlmssp_workstation_name_size(const struct cifs_ses *ses)
static inline void move_cifs_info_to_smb2(struct smb2_file_all_info *dst, const FILE_ALL_INFO *src)
{
- memcpy(dst, src, (size_t)((u8 *)&src->AccessFlags - (u8 *)src));
- dst->AccessFlags = src->AccessFlags;
- dst->CurrentByteOffset = src->CurrentByteOffset;
- dst->Mode = src->Mode;
- dst->AlignmentRequirement = src->AlignmentRequirement;
+ memcpy(dst, src, (size_t)((u8 *)&src->EASize - (u8 *)src));
+ dst->IndexNumber = 0;
+ dst->EASize = src->EASize;
+ dst->AccessFlags = 0;
+ dst->CurrentByteOffset = 0;
+ dst->Mode = 0;
+ dst->AlignmentRequirement = 0;
dst->FileNameLength = src->FileNameLength;
}
@@ -2289,8 +2344,8 @@ struct smb2_compound_vars {
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
struct kvec si_iov[SMB2_SET_INFO_IOV_SIZE];
struct kvec close_iov;
- struct smb2_file_rename_info rename_info;
- struct smb2_file_link_info link_info;
+ struct smb2_file_rename_info_hdr rename_info;
+ struct smb2_file_link_info_hdr link_info;
struct kvec ea_iov;
};
@@ -2324,4 +2379,26 @@ static inline bool cifs_netbios_name(const char *name, size_t namelen)
return ret;
}
+/*
+ * Execute mid callback atomically - ensures callback runs exactly once
+ * and prevents sleeping in atomic context.
+ */
+static inline void mid_execute_callback(struct mid_q_entry *mid)
+{
+ void (*callback)(struct mid_q_entry *mid);
+
+ spin_lock(&mid->mid_lock);
+ callback = mid->callback;
+ mid->callback = NULL; /* Mark as executed, */
+ spin_unlock(&mid->mid_lock);
+
+ if (callback)
+ callback(mid);
+}
+
+#define CIFS_REPARSE_SUPPORT(tcon) \
+ ((tcon)->posix_extensions || \
+ (le32_to_cpu((tcon)->fsAttrInfo.Attributes) & \
+ FILE_SUPPORTS_REPARSE_POINTS))
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h
index 5c047b00516f..d9cf7db0ac35 100644
--- a/fs/smb/client/cifspdu.h
+++ b/fs/smb/client/cifspdu.h
@@ -190,42 +190,82 @@
*/
#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */
+ /* or directory child entries can */
+ /* be listed together with the */
+ /* associated child attributes */
+ /* (so the FILE_READ_ATTRIBUTES on */
+ /* the child entry is not needed) */
#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */
+ /* or new file can be created in */
+ /* the directory */
#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */
+ /* (for non-local files over SMB it */
+ /* is same as FILE_WRITE_DATA) */
+ /* or new subdirectory can be */
+ /* created in the directory */
#define FILE_READ_EA 0x00000008 /* Extended attributes associated */
/* with the file can be read */
#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */
/* with the file can be written */
#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */
/* the file using system paging I/O */
-#define FILE_DELETE_CHILD 0x00000040
+ /* for executing the file / script */
+ /* or right to traverse directory */
+ /* (but by default all users have */
+ /* directory bypass traverse */
+ /* privilege and do not need this */
+ /* permission on directories at all)*/
+#define FILE_DELETE_CHILD 0x00000040 /* Child entry can be deleted from */
+ /* the directory (so the DELETE on */
+ /* the child entry is not needed) */
#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */
- /* file can be read */
+ /* file or directory can be read */
#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */
- /* file can be written */
-#define DELETE 0x00010000 /* The file can be deleted */
-#define READ_CONTROL 0x00020000 /* The access control list and */
- /* ownership associated with the */
- /* file can be read */
-#define WRITE_DAC 0x00040000 /* The access control list and */
- /* ownership associated with the */
- /* file can be written. */
+ /* file or directory can be written */
+#define DELETE 0x00010000 /* The file or dir can be deleted */
+#define READ_CONTROL 0x00020000 /* The discretionary access control */
+ /* list and ownership associated */
+ /* with the file or dir can be read */
+#define WRITE_DAC 0x00040000 /* The discretionary access control */
+ /* list associated with the file or */
+ /* directory can be written */
#define WRITE_OWNER 0x00080000 /* Ownership information associated */
- /* with the file can be written */
+ /* with the file/dir can be written */
#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */
/* synchronize with the completion */
/* of an input/output request */
#define SYSTEM_SECURITY 0x01000000 /* The system access control list */
- /* can be read and changed */
-#define GENERIC_ALL 0x10000000
-#define GENERIC_EXECUTE 0x20000000
-#define GENERIC_WRITE 0x40000000
-#define GENERIC_READ 0x80000000
- /* In summary - Relevant file */
- /* access flags from CIFS are */
- /* file_read_data, file_write_data */
- /* file_execute, file_read_attributes*/
- /* write_dac, and delete. */
+ /* associated with the file or */
+ /* directory can be read or written */
+ /* (cannot be in DACL, can in SACL) */
+#define MAXIMUM_ALLOWED 0x02000000 /* Maximal subset of GENERIC_ALL */
+ /* permissions which can be granted */
+ /* (cannot be in DACL nor SACL) */
+#define GENERIC_ALL 0x10000000 /* Same as: GENERIC_EXECUTE | */
+ /* GENERIC_WRITE | */
+ /* GENERIC_READ | */
+ /* FILE_DELETE_CHILD | */
+ /* DELETE | */
+ /* WRITE_DAC | */
+ /* WRITE_OWNER */
+ /* So GENERIC_ALL contains all bits */
+ /* mentioned above except these two */
+ /* SYSTEM_SECURITY MAXIMUM_ALLOWED */
+#define GENERIC_EXECUTE 0x20000000 /* Same as: FILE_EXECUTE | */
+ /* FILE_READ_ATTRIBUTES | */
+ /* READ_CONTROL | */
+ /* SYNCHRONIZE */
+#define GENERIC_WRITE 0x40000000 /* Same as: FILE_WRITE_DATA | */
+ /* FILE_APPEND_DATA | */
+ /* FILE_WRITE_EA | */
+ /* FILE_WRITE_ATTRIBUTES | */
+ /* READ_CONTROL | */
+ /* SYNCHRONIZE */
+#define GENERIC_READ 0x80000000 /* Same as: FILE_READ_DATA | */
+ /* FILE_READ_EA | */
+ /* FILE_READ_ATTRIBUTES | */
+ /* READ_CONTROL | */
+ /* SYNCHRONIZE */
#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES)
#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \
@@ -557,7 +597,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 SecurityBlobLength;
__u32 Reserved;
__le32 Capabilities; /* see below */
@@ -576,7 +616,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 CaseInsensitivePasswordLength; /* ASCII password len */
__le16 CaseSensitivePasswordLength; /* Unicode password length*/
__u32 Reserved; /* see below */
@@ -614,7 +654,7 @@ typedef union smb_com_session_setup_andx {
__le16 MaxBufferSize;
__le16 MaxMpxCount;
__le16 VcNumber;
- __u32 SessionKey;
+ __le32 SessionKey;
__le16 PasswordLength;
__u32 Reserved; /* encrypt key len and offset */
__le16 ByteCount;
@@ -1226,10 +1266,9 @@ typedef struct smb_com_query_information_rsp {
typedef struct smb_com_setattr_req {
struct smb_hdr hdr; /* wct = 8 */
__le16 attr;
- __le16 time_low;
- __le16 time_high;
+ __le32 last_write_time;
__le16 reserved[5]; /* must be zero */
- __u16 ByteCount;
+ __le16 ByteCount;
__u8 BufferFormat; /* 4 = ASCII */
unsigned char fileName[];
} __attribute__((packed)) SETATTR_REQ;
@@ -1484,20 +1523,6 @@ struct file_notify_information {
__u8 FileName[];
} __attribute__((packed));
-/* For IO_REPARSE_TAG_NFS */
-#define NFS_SPECFILE_LNK 0x00000000014B4E4C
-#define NFS_SPECFILE_CHR 0x0000000000524843
-#define NFS_SPECFILE_BLK 0x00000000004B4C42
-#define NFS_SPECFILE_FIFO 0x000000004F464946
-#define NFS_SPECFILE_SOCK 0x000000004B434F53
-struct reparse_posix_data {
- __le32 ReparseTag;
- __le16 ReparseDataLength;
- __u16 Reserved;
- __le64 InodeType; /* LNK, FIFO, CHR etc. */
- __u8 DataBuffer[];
-} __attribute__((packed));
-
struct cifs_quota_data {
__u32 rsrvd1; /* 0 */
__u32 sid_size;
@@ -2230,6 +2255,8 @@ typedef struct {
#define FILE_SUPPORTS_ENCRYPTION 0x00020000
#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400
+#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200
#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
@@ -2264,13 +2291,7 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */
__u8 DeletePending;
__u8 Directory;
__u16 Pad2;
- __le64 IndexNumber;
__le32 EASize;
- __le32 AccessFlags;
- __u64 IndexNumber1;
- __le64 CurrentByteOffset;
- __le32 Mode;
- __le32 AlignmentRequirement;
__le32 FileNameLength;
union {
char __pad;
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 223e5e231f42..c34c533b2efa 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -31,6 +31,9 @@ extern void cifs_small_buf_release(void *);
extern void free_rsp_buf(int, void *);
extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
unsigned int /* length */);
+extern int smb_send_kvec(struct TCP_Server_Info *server,
+ struct msghdr *msg,
+ size_t *sent);
extern unsigned int _get_xid(void);
extern void _free_xid(unsigned int);
#define get_xid() \
@@ -113,16 +116,31 @@ extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
int * /* bytes returned */ , const int);
extern int SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
char *in_buf, int flags);
+int cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server);
extern struct mid_q_entry *cifs_setup_request(struct cifs_ses *,
struct TCP_Server_Info *,
struct smb_rqst *);
extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
struct smb_rqst *);
+int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
+ struct smb_rqst *rqst);
extern int cifs_check_receive(struct mid_q_entry *mid,
struct TCP_Server_Info *server, bool log_error);
+int wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+ unsigned int *instance);
extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
size_t size, size_t *num,
struct cifs_credits *credits);
+
+static inline int
+send_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
+ struct mid_q_entry *mid)
+{
+ return server->ops->send_cancel ?
+ server->ops->send_cancel(server, rqst, mid) : 0;
+}
+
+int wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ);
extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
struct kvec *, int /* nvec to send */,
int * /* type of buf returned */, const int flags,
@@ -133,6 +151,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
struct smb_hdr *out_buf,
int *bytes_returned);
+void smb2_query_server_interfaces(struct work_struct *work);
void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels);
@@ -148,8 +167,7 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof,
bool from_readdir);
extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
unsigned int bytes_written);
-void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
- bool was_async);
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result);
extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode,
int flags,
@@ -160,6 +178,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name,
extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool);
extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name,
struct cifsFileInfo **ret_file);
+extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode,
+ struct file *file);
extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
@@ -390,6 +410,10 @@ extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon);
extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon,
struct kstatfs *FSData);
+extern int SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *fileName, __le32 attributes, __le64 write_time,
+ const struct nls_table *nls_codepage,
+ struct cifs_sb_info *cifs_sb);
extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const FILE_BASIC_INFO *data,
const struct nls_table *nls_codepage,
@@ -474,6 +498,14 @@ extern int cifs_query_reparse_point(const unsigned int xid,
const char *full_path,
u32 *tag, struct kvec *rsp,
int *rsp_buftype);
+extern struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ bool directory,
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov);
extern int CIFSSMB_set_compression(const unsigned int xid,
struct cifs_tcon *tcon, __u16 fid);
extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
@@ -557,7 +589,7 @@ extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon,
const struct nls_table *nls_codepage,
struct cifs_sb_info *cifs_sb);
extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon,
- __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen);
+ __u16 fid, struct smb_ntsd **acl_inf, __u32 *buflen, __u32 info);
extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16,
struct smb_ntsd *pntsd, __u32 len, int aclflag);
extern int cifs_do_get_acl(const unsigned int xid, struct cifs_tcon *tcon,
@@ -592,7 +624,6 @@ int cifs_async_readv(struct cifs_io_subrequest *rdata);
int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
void cifs_async_writev(struct cifs_io_subrequest *wdata);
-void cifs_writev_complete(struct work_struct *work);
int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
@@ -656,7 +687,7 @@ char *extract_sharename(const char *unc);
int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
const char *full_path,
- bool unicode, struct cifs_open_info_data *data);
+ struct cifs_open_info_data *data);
int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev,
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 7f1cacc89dbb..d20766f664c4 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -114,19 +114,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
- * Recheck after acquire mutex. If another thread is negotiating
- * and the server never sends an answer the socket will be closed
- * and tcpStatus set to reconnect.
+ * Handle the case where a concurrent thread failed to negotiate or
+ * killed a channel.
*/
spin_lock(&server->srv_lock);
- if (server->tcpStatus == CifsNeedReconnect) {
+ switch (server->tcpStatus) {
+ case CifsExiting:
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
-
- if (tcon->retry)
- goto again;
- rc = -EHOSTDOWN;
- goto out;
+ return -EHOSTDOWN;
+ case CifsNeedReconnect:
+ spin_unlock(&server->srv_lock);
+ mutex_unlock(&ses->session_mutex);
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ default:
+ break;
}
spin_unlock(&server->srv_lock);
@@ -152,16 +156,20 @@ again:
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc) {
- rc = cifs_setup_session(0, ses, server, ses->local_nls);
- if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
- /*
- * Try alternate password for next reconnect if an alternate
- * password is available.
- */
- if (ses->password2)
- swap(ses->password2, ses->password);
- }
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ }
+ rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect if an alternate
+ * password is available.
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
}
/* do we need to reconnect tcon? */
@@ -429,7 +437,10 @@ CIFSSMBNegotiate(const unsigned int xid,
return rc;
pSMB->hdr.Mid = get_next_mid(server);
- pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS);
+ pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS;
+
+ if (ses->unicode != 0)
+ pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
if (should_set_ext_sec_flag(ses->sectype)) {
cifs_dbg(FYI, "Requesting extended security\n");
@@ -487,6 +498,7 @@ CIFSSMBNegotiate(const unsigned int xid,
server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf);
server->capabilities = le32_to_cpu(pSMBr->Capabilities);
+ server->session_key_id = pSMBr->SessionKey;
server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone);
server->timeAdj *= 60;
@@ -1030,15 +1042,31 @@ static __u16 convert_disposition(int disposition)
static int
access_flags_to_smbopen_mode(const int access_flags)
{
- int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE);
-
- if (masked_flags == GENERIC_READ)
- return SMBOPEN_READ;
- else if (masked_flags == GENERIC_WRITE)
+ /*
+ * SYSTEM_SECURITY grants both read and write access to SACL, treat is as read/write.
+ * MAXIMUM_ALLOWED grants as many access as possible, so treat it as read/write too.
+ * SYNCHRONIZE as is does not grant any specific access, so do not check its mask.
+ * If only SYNCHRONIZE bit is specified then fallback to read access.
+ */
+ bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA |
+ FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE |
+ WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY |
+ MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL);
+ bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE |
+ FILE_READ_ATTRIBUTES | READ_CONTROL |
+ SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE | GENERIC_READ);
+ bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL |
+ GENERIC_EXECUTE);
+
+ if (with_write_flags && with_read_flags)
+ return SMBOPEN_READWRITE;
+ else if (with_write_flags)
return SMBOPEN_WRITE;
-
- /* just go for read/write */
- return SMBOPEN_READWRITE;
+ else if (with_execute_flags)
+ return SMBOPEN_EXECUTE;
+ else
+ return SMBOPEN_READ;
}
int
@@ -1306,7 +1334,12 @@ cifs_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted);
+ goto do_retry;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed);
+do_retry:
+ __set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
rdata->result = -EAGAIN;
if (server->sign && rdata->got_bytes)
/* reset bytes number since we can not check a sign */
@@ -1315,8 +1348,14 @@ cifs_readv_callback(struct mid_q_entry *mid)
task_io_account_read(rdata->got_bytes);
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
+ case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed);
+ rdata->result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown);
rdata->result = -EIO;
+ break;
}
if (rdata->result == -ENODATA) {
@@ -1338,7 +1377,8 @@ cifs_readv_callback(struct mid_q_entry *mid)
rdata->credits.value = 0;
rdata->subreq.error = rdata->result;
rdata->subreq.transferred += rdata->got_bytes;
- queue_work(cifsiod_wq, &rdata->subreq.work);
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress);
+ netfs_read_subreq_terminated(&rdata->subreq);
release_mid(mid);
add_credits(server, &credits, 0);
}
@@ -1684,10 +1724,21 @@ cifs_writev_callback(struct mid_q_entry *mid)
}
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
+ result = -EAGAIN;
+ break;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
result = -EAGAIN;
break;
+ case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed);
+ result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown);
result = -EIO;
break;
}
@@ -1697,7 +1748,7 @@ cifs_writev_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
- cifs_write_subrequest_terminated(wdata, result, true);
+ cifs_write_subrequest_terminated(wdata, result);
release_mid(mid);
trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0,
server->credits, server->in_flight,
@@ -1785,7 +1836,7 @@ async_writev_out:
out:
if (rc) {
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
- cifs_write_subrequest_terminated(wdata, rc, false);
+ cifs_write_subrequest_terminated(wdata, rc);
}
}
@@ -2700,6 +2751,9 @@ int cifs_query_reparse_point(const unsigned int xid,
if (cap_unix(tcon->ses))
return -EOPNOTSUPP;
+ if (!CIFS_REPARSE_SUPPORT(tcon))
+ return -EOPNOTSUPP;
+
oparms = (struct cifs_open_parms) {
.tcon = tcon,
.cifs_sb = cifs_sb,
@@ -2722,10 +2776,10 @@ int cifs_query_reparse_point(const unsigned int xid,
io_req->TotalParameterCount = 0;
io_req->TotalDataCount = 0;
- io_req->MaxParameterCount = cpu_to_le32(2);
+ io_req->MaxParameterCount = cpu_to_le32(0);
/* BB find exact data count max from sess structure BB */
io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00);
- io_req->MaxSetupCount = 4;
+ io_req->MaxSetupCount = 1;
io_req->Reserved = 0;
io_req->ParameterOffset = 0;
io_req->DataCount = 0;
@@ -2752,6 +2806,22 @@ int cifs_query_reparse_point(const unsigned int xid,
goto error;
}
+ /* SetupCount must be 1, otherwise offset to ByteCount is incorrect. */
+ if (io_rsp->SetupCount != 1) {
+ rc = -EIO;
+ goto error;
+ }
+
+ /*
+ * ReturnedDataLen is output length of executed IOCTL.
+ * DataCount is output length transferred over network.
+ * Check that we have full FSCTL_GET_REPARSE_POINT buffer.
+ */
+ if (data_count != le16_to_cpu(io_rsp->ReturnedDataLen)) {
+ rc = -EIO;
+ goto error;
+ }
+
end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount;
start = (__u8 *)&io_rsp->hdr.Protocol + data_offset;
if (start >= end) {
@@ -2781,6 +2851,134 @@ error:
return rc;
}
+struct inode *cifs_create_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ bool directory,
+ struct kvec *reparse_iov,
+ struct kvec *xattr_iov)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifs_open_parms oparms;
+ TRANSACT_IOCTL_REQ *io_req;
+ struct inode *new = NULL;
+ struct kvec in_iov[2];
+ struct kvec out_iov;
+ struct cifs_fid fid;
+ int io_req_len;
+ int oplock = 0;
+ int buf_type = 0;
+ int rc;
+
+ cifs_tcon_dbg(FYI, "%s: path=%s\n", __func__, full_path);
+
+ /*
+ * If server filesystem does not support reparse points then do not
+ * attempt to create reparse point. This will prevent creating unusable
+ * empty object on the server.
+ */
+ if (!CIFS_REPARSE_SUPPORT(tcon))
+ return ERR_PTR(-EOPNOTSUPP);
+
+#ifndef CONFIG_CIFS_XATTR
+ if (xattr_iov)
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+ FILE_READ_ATTRIBUTES | FILE_WRITE_DATA | FILE_WRITE_EA,
+ FILE_CREATE,
+ (directory ? CREATE_NOT_FILE : CREATE_NOT_DIR) | OPEN_REPARSE_POINT,
+ ACL_NO_MODE);
+ oparms.fid = &fid;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc)
+ return ERR_PTR(rc);
+
+#ifdef CONFIG_CIFS_XATTR
+ if (xattr_iov) {
+ struct smb2_file_full_ea_info *ea;
+
+ ea = &((struct smb2_create_ea_ctx *)xattr_iov->iov_base)->ea;
+ while (1) {
+ rc = CIFSSMBSetEA(xid,
+ tcon,
+ full_path,
+ &ea->ea_data[0],
+ &ea->ea_data[ea->ea_name_length+1],
+ le16_to_cpu(ea->ea_value_length),
+ cifs_sb->local_nls,
+ cifs_sb);
+ if (rc)
+ goto out_close;
+ if (le32_to_cpu(ea->next_entry_offset) == 0)
+ break;
+ ea = (struct smb2_file_full_ea_info *)((u8 *)ea +
+ le32_to_cpu(ea->next_entry_offset));
+ }
+ }
+#endif
+
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **)&io_req, NULL);
+ if (rc)
+ goto out_close;
+
+ inc_rfc1001_len(io_req, sizeof(io_req->Pad));
+
+ io_req_len = be32_to_cpu(io_req->hdr.smb_buf_length) + sizeof(io_req->hdr.smb_buf_length);
+
+ /* NT IOCTL response contains one-word long output setup buffer with size of output data. */
+ io_req->MaxSetupCount = 1;
+ /* NT IOCTL response does not contain output parameters. */
+ io_req->MaxParameterCount = cpu_to_le32(0);
+ /* FSCTL_SET_REPARSE_POINT response contains empty output data. */
+ io_req->MaxDataCount = cpu_to_le32(0);
+
+ io_req->TotalParameterCount = cpu_to_le32(0);
+ io_req->TotalDataCount = cpu_to_le32(reparse_iov->iov_len);
+ io_req->ParameterCount = io_req->TotalParameterCount;
+ io_req->ParameterOffset = cpu_to_le32(0);
+ io_req->DataCount = io_req->TotalDataCount;
+ io_req->DataOffset = cpu_to_le32(offsetof(typeof(*io_req), Data) -
+ sizeof(io_req->hdr.smb_buf_length));
+ io_req->SetupCount = 4;
+ io_req->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL);
+ io_req->FunctionCode = cpu_to_le32(FSCTL_SET_REPARSE_POINT);
+ io_req->Fid = fid.netfid;
+ io_req->IsFsctl = 1;
+ io_req->IsRootFlag = 0;
+ io_req->ByteCount = cpu_to_le16(le32_to_cpu(io_req->DataCount) + sizeof(io_req->Pad));
+
+ inc_rfc1001_len(io_req, reparse_iov->iov_len);
+
+ in_iov[0].iov_base = (char *)io_req;
+ in_iov[0].iov_len = io_req_len;
+ in_iov[1] = *reparse_iov;
+ rc = SendReceive2(xid, tcon->ses, in_iov, ARRAY_SIZE(in_iov), &buf_type,
+ CIFS_NO_RSP_BUF, &out_iov);
+
+ cifs_buf_release(io_req);
+
+ if (!rc)
+ rc = cifs_get_inode_info(&new, full_path, data, sb, xid, NULL);
+
+out_close:
+ CIFSSMBClose(xid, tcon, fid.netfid);
+
+ /*
+ * If CREATE was successful but FSCTL_SET_REPARSE_POINT failed then
+ * remove the intermediate object created by CREATE. Otherwise
+ * empty object stay on the server when reparse call failed.
+ */
+ if (rc)
+ CIFSSMBDelFile(xid, tcon, full_path, cifs_sb, NULL);
+
+ return rc ? ERR_PTR(rc) : new;
+}
+
int
CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid)
@@ -3369,7 +3567,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
/* Get Security Descriptor (by handle) from remote server for a file or dir */
int
CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
- struct smb_ntsd **acl_inf, __u32 *pbuflen)
+ struct smb_ntsd **acl_inf, __u32 *pbuflen, __u32 info)
{
int rc = 0;
int buf_type = 0;
@@ -3391,8 +3589,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
/* BB TEST with big acls that might need to be e.g. larger than 16K */
pSMB->MaxSetupCount = 0;
pSMB->Fid = fid; /* file handle always le */
- pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP |
- CIFS_ACL_DACL);
+ pSMB->AclFlags = cpu_to_le32(info);
pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */
inc_rfc1001_len(pSMB, 11);
iov[0].iov_base = (char *)pSMB;
@@ -3951,6 +4148,12 @@ findFirstRetry:
pSMB->FileName[name_len] = 0;
pSMB->FileName[name_len+1] = 0;
name_len += 2;
+ } else if (!searchName[0]) {
+ pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb);
+ pSMB->FileName[1] = 0;
+ pSMB->FileName[2] = 0;
+ pSMB->FileName[3] = 0;
+ name_len = 4;
}
} else {
name_len = copy_path_name(pSMB->FileName, searchName);
@@ -3962,6 +4165,10 @@ findFirstRetry:
pSMB->FileName[name_len] = '*';
pSMB->FileName[name_len+1] = 0;
name_len += 2;
+ } else if (!searchName[0]) {
+ pSMB->FileName[0] = CIFS_DIR_SEP(cifs_sb);
+ pSMB->FileName[1] = 0;
+ name_len = 2;
}
}
@@ -3988,7 +4195,7 @@ findFirstRetry:
pSMB->SearchAttributes =
cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM |
ATTR_DIRECTORY);
- pSMB->SearchCount = cpu_to_le16(CIFSMaxBufSize/sizeof(FILE_UNIX_INFO));
+ pSMB->SearchCount = cpu_to_le16(msearch ? CIFSMaxBufSize/sizeof(FILE_UNIX_INFO) : 1);
pSMB->SearchFlags = cpu_to_le16(search_flags);
pSMB->InformationLevel = cpu_to_le16(psrch_inf->info_level);
@@ -5141,6 +5348,63 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+int
+SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *fileName, __le32 attributes, __le64 write_time,
+ const struct nls_table *nls_codepage,
+ struct cifs_sb_info *cifs_sb)
+{
+ SETATTR_REQ *pSMB;
+ SETATTR_RSP *pSMBr;
+ struct timespec64 ts;
+ int bytes_returned;
+ int name_len;
+ int rc;
+
+ cifs_dbg(FYI, "In %s path %s\n", __func__, fileName);
+
+retry:
+ rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+
+ if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
+ name_len =
+ cifsConvertToUTF16((__le16 *) pSMB->fileName,
+ fileName, PATH_MAX, nls_codepage,
+ cifs_remap(cifs_sb));
+ name_len++; /* trailing null */
+ name_len *= 2;
+ } else {
+ name_len = copy_path_name(pSMB->fileName, fileName);
+ }
+ /* Only few attributes can be set by this command, others are not accepted by Win9x. */
+ pSMB->attr = cpu_to_le16(le32_to_cpu(attributes) &
+ (ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_ARCHIVE));
+ /* Zero write time value (in both NT and SETATTR formats) means to not change it. */
+ if (le64_to_cpu(write_time) != 0) {
+ ts = cifs_NTtimeToUnix(write_time);
+ pSMB->last_write_time = cpu_to_le32(ts.tv_sec);
+ }
+ pSMB->BufferFormat = 0x04;
+ name_len++; /* account for buffer type byte */
+ inc_rfc1001_len(pSMB, (__u16)name_len);
+ pSMB->ByteCount = cpu_to_le16(name_len);
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+ if (rc)
+ cifs_dbg(FYI, "Send error in %s = %d\n", __func__, rc);
+
+ cifs_buf_release(pSMB);
+
+ if (rc == -EAGAIN)
+ goto retry;
+
+ return rc;
+}
+
/* Some legacy servers such as NT4 require that the file times be set on
an open handle, rather than by pathname - this is awkward due to
potential access conflicts on the open, but it is unavoidable for these
diff --git a/fs/smb/client/cifstransport.c b/fs/smb/client/cifstransport.c
new file mode 100644
index 000000000000..e98b95eff8c9
--- /dev/null
+++ b/fs/smb/client/cifstransport.c
@@ -0,0 +1,565 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ *
+ * Copyright (C) International Business Machines Corp., 2002,2008
+ * Author(s): Steve French (sfrench@us.ibm.com)
+ * Jeremy Allison (jra@samba.org) 2006.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/wait.h>
+#include <linux/net.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/tcp.h>
+#include <linux/bvec.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include <linux/processor.h>
+#include <linux/mempool.h>
+#include <linux/sched/signal.h>
+#include <linux/task_io_accounting_ops.h>
+#include "cifspdu.h"
+#include "cifsglob.h"
+#include "cifsproto.h"
+#include "cifs_debug.h"
+#include "smb2proto.h"
+#include "smbdirect.h"
+#include "compress.h"
+
+/* Max number of iovectors we can use off the stack when sending requests. */
+#define CIFS_MAX_IOV_SIZE 8
+
+static struct mid_q_entry *
+alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
+{
+ struct mid_q_entry *temp;
+
+ if (server == NULL) {
+ cifs_dbg(VFS, "%s: null TCP session\n", __func__);
+ return NULL;
+ }
+
+ temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
+ memset(temp, 0, sizeof(struct mid_q_entry));
+ kref_init(&temp->refcount);
+ spin_lock_init(&temp->mid_lock);
+ temp->mid = get_mid(smb_buffer);
+ temp->pid = current->pid;
+ temp->command = cpu_to_le16(smb_buffer->Command);
+ cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
+ /* easier to use jiffies */
+ /* when mid allocated can be before when sent */
+ temp->when_alloc = jiffies;
+ temp->server = server;
+
+ /*
+ * The default is for the mid to be synchronous, so the
+ * default callback just wakes up the current task.
+ */
+ get_task_struct(current);
+ temp->creator = current;
+ temp->callback = cifs_wake_up_task;
+ temp->callback_data = current;
+
+ atomic_inc(&mid_count);
+ temp->mid_state = MID_REQUEST_ALLOCATED;
+ return temp;
+}
+
+int
+smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
+ unsigned int smb_buf_length)
+{
+ struct kvec iov[2];
+ struct smb_rqst rqst = { .rq_iov = iov,
+ .rq_nvec = 2 };
+
+ iov[0].iov_base = smb_buffer;
+ iov[0].iov_len = 4;
+ iov[1].iov_base = (char *)smb_buffer + 4;
+ iov[1].iov_len = smb_buf_length;
+
+ return __smb_send_rqst(server, 1, &rqst);
+}
+
+static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
+ struct mid_q_entry **ppmidQ)
+{
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_NEW) {
+ if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
+ (in_buf->Command != SMB_COM_NEGOTIATE)) {
+ spin_unlock(&ses->ses_lock);
+ return -EAGAIN;
+ }
+ /* else ok - we are setting up session */
+ }
+
+ if (ses->ses_status == SES_EXITING) {
+ /* check if SMB session is bad because we are setting it up */
+ if (in_buf->Command != SMB_COM_LOGOFF_ANDX) {
+ spin_unlock(&ses->ses_lock);
+ return -EAGAIN;
+ }
+ /* else ok - we are shutting down session */
+ }
+ spin_unlock(&ses->ses_lock);
+
+ *ppmidQ = alloc_mid(in_buf, ses->server);
+ if (*ppmidQ == NULL)
+ return -ENOMEM;
+ spin_lock(&ses->server->mid_queue_lock);
+ list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
+ spin_unlock(&ses->server->mid_queue_lock);
+ return 0;
+}
+
+struct mid_q_entry *
+cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ int rc;
+ struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ if (rqst->rq_iov[0].iov_len != 4 ||
+ rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
+ return ERR_PTR(-EIO);
+
+ /* enable signing if server requires it */
+ if (server->sign)
+ hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
+
+ mid = alloc_mid(hdr, server);
+ if (mid == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
+ if (rc) {
+ release_mid(mid);
+ return ERR_PTR(rc);
+ }
+
+ return mid;
+}
+
+/*
+ *
+ * Send an SMB Request. No response info (other than return code)
+ * needs to be parsed.
+ *
+ * flags indicate the type of request buffer and how long to wait
+ * and whether to log NT STATUS code (error) before mapping it to POSIX error
+ *
+ */
+int
+SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
+ char *in_buf, int flags)
+{
+ int rc;
+ struct kvec iov[1];
+ struct kvec rsp_iov;
+ int resp_buf_type;
+
+ iov[0].iov_base = in_buf;
+ iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
+ flags |= CIFS_NO_RSP_BUF;
+ rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
+ cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
+
+ return rc;
+}
+
+int
+cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
+ bool log_error)
+{
+ unsigned int len = get_rfc1002_length(mid->resp_buf) + 4;
+
+ dump_smb(mid->resp_buf, min_t(u32, 92, len));
+
+ /* convert the length into a more usable form */
+ if (server->sign) {
+ struct kvec iov[2];
+ int rc = 0;
+ struct smb_rqst rqst = { .rq_iov = iov,
+ .rq_nvec = 2 };
+
+ iov[0].iov_base = mid->resp_buf;
+ iov[0].iov_len = 4;
+ iov[1].iov_base = (char *)mid->resp_buf + 4;
+ iov[1].iov_len = len - 4;
+ /* FIXME: add code to kill session */
+ rc = cifs_verify_signature(&rqst, server,
+ mid->sequence_number);
+ if (rc)
+ cifs_server_dbg(VFS, "SMB signature verification returned error = %d\n",
+ rc);
+ }
+
+ /* BB special case reconnect tid and uid here? */
+ return map_and_check_smb_error(mid, log_error);
+}
+
+struct mid_q_entry *
+cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored,
+ struct smb_rqst *rqst)
+{
+ int rc;
+ struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
+ struct mid_q_entry *mid;
+
+ if (rqst->rq_iov[0].iov_len != 4 ||
+ rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
+ return ERR_PTR(-EIO);
+
+ rc = allocate_mid(ses, hdr, &mid);
+ if (rc)
+ return ERR_PTR(rc);
+ rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
+ if (rc) {
+ delete_mid(mid);
+ return ERR_PTR(rc);
+ }
+ return mid;
+}
+
+int
+SendReceive2(const unsigned int xid, struct cifs_ses *ses,
+ struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
+ const int flags, struct kvec *resp_iov)
+{
+ struct smb_rqst rqst;
+ struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov;
+ int rc;
+
+ if (n_vec + 1 > CIFS_MAX_IOV_SIZE) {
+ new_iov = kmalloc_array(n_vec + 1, sizeof(struct kvec),
+ GFP_KERNEL);
+ if (!new_iov) {
+ /* otherwise cifs_send_recv below sets resp_buf_type */
+ *resp_buf_type = CIFS_NO_BUFFER;
+ return -ENOMEM;
+ }
+ } else
+ new_iov = s_iov;
+
+ /* 1st iov is a RFC1001 length followed by the rest of the packet */
+ memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec));
+
+ new_iov[0].iov_base = new_iov[1].iov_base;
+ new_iov[0].iov_len = 4;
+ new_iov[1].iov_base += 4;
+ new_iov[1].iov_len -= 4;
+
+ memset(&rqst, 0, sizeof(struct smb_rqst));
+ rqst.rq_iov = new_iov;
+ rqst.rq_nvec = n_vec + 1;
+
+ rc = cifs_send_recv(xid, ses, ses->server,
+ &rqst, resp_buf_type, flags, resp_iov);
+ if (n_vec + 1 > CIFS_MAX_IOV_SIZE)
+ kfree(new_iov);
+ return rc;
+}
+
+int
+SendReceive(const unsigned int xid, struct cifs_ses *ses,
+ struct smb_hdr *in_buf, struct smb_hdr *out_buf,
+ int *pbytes_returned, const int flags)
+{
+ int rc = 0;
+ struct mid_q_entry *midQ;
+ unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
+ struct kvec iov = { .iov_base = in_buf, .iov_len = len };
+ struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ struct cifs_credits credits = { .value = 1, .instance = 0 };
+ struct TCP_Server_Info *server;
+
+ if (ses == NULL) {
+ cifs_dbg(VFS, "Null smb session\n");
+ return -EIO;
+ }
+ server = ses->server;
+ if (server == NULL) {
+ cifs_dbg(VFS, "Null tcp session\n");
+ return -EIO;
+ }
+
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->srv_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&server->srv_lock);
+
+ /* Ensure that we do not send more than 50 overlapping requests
+ to the same server. We may make this configurable later or
+ use ses->maxReq */
+
+ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cifs_server_dbg(VFS, "Invalid length, greater than maximum frame, %d\n",
+ len);
+ return -EIO;
+ }
+
+ rc = wait_for_free_request(server, flags, &credits.instance);
+ if (rc)
+ return rc;
+
+ /* make sure that we sign in the same order that we send on this socket
+ and avoid races inside tcp sendmsg code that could cause corruption
+ of smb data */
+
+ cifs_server_lock(server);
+
+ rc = allocate_mid(ses, in_buf, &midQ);
+ if (rc) {
+ cifs_server_unlock(server);
+ /* Update # of requests on wire to server */
+ add_credits(server, &credits, 0);
+ return rc;
+ }
+
+ rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
+ if (rc) {
+ cifs_server_unlock(server);
+ goto out;
+ }
+
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
+
+ rc = smb_send(server, in_buf, len);
+ cifs_save_when_sent(midQ);
+
+ if (rc < 0)
+ server->sequence_number -= 2;
+
+ cifs_server_unlock(server);
+
+ if (rc < 0)
+ goto out;
+
+ rc = wait_for_response(server, midQ);
+ if (rc != 0) {
+ send_cancel(server, &rqst, midQ);
+ spin_lock(&midQ->mid_lock);
+ if (midQ->callback) {
+ /* no longer considered to be "in-flight" */
+ midQ->callback = release_mid;
+ spin_unlock(&midQ->mid_lock);
+ add_credits(server, &credits, 0);
+ return rc;
+ }
+ spin_unlock(&midQ->mid_lock);
+ }
+
+ rc = cifs_sync_mid_result(midQ, server);
+ if (rc != 0) {
+ add_credits(server, &credits, 0);
+ return rc;
+ }
+
+ if (!midQ->resp_buf || !out_buf ||
+ midQ->mid_state != MID_RESPONSE_READY) {
+ rc = -EIO;
+ cifs_server_dbg(VFS, "Bad MID state?\n");
+ goto out;
+ }
+
+ *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
+ memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+ rc = cifs_check_receive(midQ, server, 0);
+out:
+ delete_mid(midQ);
+ add_credits(server, &credits, 0);
+
+ return rc;
+}
+
+/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
+ blocking lock to return. */
+
+static int
+send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
+ struct smb_hdr *in_buf,
+ struct smb_hdr *out_buf)
+{
+ int bytes_returned;
+ struct cifs_ses *ses = tcon->ses;
+ LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
+
+ /* We just modify the current in_buf to change
+ the type of lock from LOCKING_ANDX_SHARED_LOCK
+ or LOCKING_ANDX_EXCLUSIVE_LOCK to
+ LOCKING_ANDX_CANCEL_LOCK. */
+
+ pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
+ pSMB->Timeout = 0;
+ pSMB->hdr.Mid = get_next_mid(ses->server);
+
+ return SendReceive(xid, ses, in_buf, out_buf,
+ &bytes_returned, 0);
+}
+
+int
+SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
+ struct smb_hdr *in_buf, struct smb_hdr *out_buf,
+ int *pbytes_returned)
+{
+ int rc = 0;
+ int rstart = 0;
+ struct mid_q_entry *midQ;
+ struct cifs_ses *ses;
+ unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
+ struct kvec iov = { .iov_base = in_buf, .iov_len = len };
+ struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
+ unsigned int instance;
+ struct TCP_Server_Info *server;
+
+ if (tcon == NULL || tcon->ses == NULL) {
+ cifs_dbg(VFS, "Null smb session\n");
+ return -EIO;
+ }
+ ses = tcon->ses;
+ server = ses->server;
+
+ if (server == NULL) {
+ cifs_dbg(VFS, "Null tcp session\n");
+ return -EIO;
+ }
+
+ spin_lock(&server->srv_lock);
+ if (server->tcpStatus == CifsExiting) {
+ spin_unlock(&server->srv_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&server->srv_lock);
+
+ /* Ensure that we do not send more than 50 overlapping requests
+ to the same server. We may make this configurable later or
+ use ses->maxReq */
+
+ if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+ cifs_tcon_dbg(VFS, "Invalid length, greater than maximum frame, %d\n",
+ len);
+ return -EIO;
+ }
+
+ rc = wait_for_free_request(server, CIFS_BLOCKING_OP, &instance);
+ if (rc)
+ return rc;
+
+ /* make sure that we sign in the same order that we send on this socket
+ and avoid races inside tcp sendmsg code that could cause corruption
+ of smb data */
+
+ cifs_server_lock(server);
+
+ rc = allocate_mid(ses, in_buf, &midQ);
+ if (rc) {
+ cifs_server_unlock(server);
+ return rc;
+ }
+
+ rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
+ if (rc) {
+ delete_mid(midQ);
+ cifs_server_unlock(server);
+ return rc;
+ }
+
+ midQ->mid_state = MID_REQUEST_SUBMITTED;
+ rc = smb_send(server, in_buf, len);
+ cifs_save_when_sent(midQ);
+
+ if (rc < 0)
+ server->sequence_number -= 2;
+
+ cifs_server_unlock(server);
+
+ if (rc < 0) {
+ delete_mid(midQ);
+ return rc;
+ }
+
+ /* Wait for a reply - allow signals to interrupt. */
+ rc = wait_event_interruptible(server->response_q,
+ (!(midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED)) ||
+ ((server->tcpStatus != CifsGood) &&
+ (server->tcpStatus != CifsNew)));
+
+ /* Were we interrupted by a signal ? */
+ spin_lock(&server->srv_lock);
+ if ((rc == -ERESTARTSYS) &&
+ (midQ->mid_state == MID_REQUEST_SUBMITTED ||
+ midQ->mid_state == MID_RESPONSE_RECEIVED) &&
+ ((server->tcpStatus == CifsGood) ||
+ (server->tcpStatus == CifsNew))) {
+ spin_unlock(&server->srv_lock);
+
+ if (in_buf->Command == SMB_COM_TRANSACTION2) {
+ /* POSIX lock. We send a NT_CANCEL SMB to cause the
+ blocking lock to return. */
+ rc = send_cancel(server, &rqst, midQ);
+ if (rc) {
+ delete_mid(midQ);
+ return rc;
+ }
+ } else {
+ /* Windows lock. We send a LOCKINGX_CANCEL_LOCK
+ to cause the blocking lock to return. */
+
+ rc = send_lock_cancel(xid, tcon, in_buf, out_buf);
+
+ /* If we get -ENOLCK back the lock may have
+ already been removed. Don't exit in this case. */
+ if (rc && rc != -ENOLCK) {
+ delete_mid(midQ);
+ return rc;
+ }
+ }
+
+ rc = wait_for_response(server, midQ);
+ if (rc) {
+ send_cancel(server, &rqst, midQ);
+ spin_lock(&midQ->mid_lock);
+ if (midQ->callback) {
+ /* no longer considered to be "in-flight" */
+ midQ->callback = release_mid;
+ spin_unlock(&midQ->mid_lock);
+ return rc;
+ }
+ spin_unlock(&midQ->mid_lock);
+ }
+
+ /* We got the response - restart system call. */
+ rstart = 1;
+ spin_lock(&server->srv_lock);
+ }
+ spin_unlock(&server->srv_lock);
+
+ rc = cifs_sync_mid_result(midQ, server);
+ if (rc != 0)
+ return rc;
+
+ /* rcvd frame is ok */
+ if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) {
+ rc = -EIO;
+ cifs_tcon_dbg(VFS, "Bad MID state?\n");
+ goto out;
+ }
+
+ *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
+ memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
+ rc = cifs_check_receive(midQ, server, 0);
+out:
+ delete_mid(midQ);
+ if (rstart && rc == -EACCES)
+ return -ERESTARTSYS;
+ return rc;
+}
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index 766b4de13da7..db709f5cd2e1 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -155,58 +155,29 @@ static int cmp_bkt(const void *_a, const void *_b)
}
/*
- * TODO:
- * Support other iter types, if required.
- * Only ITER_XARRAY is supported for now.
+ * Collect some 2K samples with 2K gaps between.
*/
-static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
+static int collect_sample(const struct iov_iter *source, ssize_t max, u8 *sample)
{
- struct folio *folios[16], *folio;
- unsigned int nr, i, j, npages;
- loff_t start = iter->xarray_start + iter->iov_offset;
- pgoff_t last, index = start / PAGE_SIZE;
- size_t len, off, foff;
- void *p;
- int s = 0;
-
- last = (start + max - 1) / PAGE_SIZE;
- do {
- nr = xa_extract(iter->xarray, (void **)folios, index, last, ARRAY_SIZE(folios),
- XA_PRESENT);
- if (nr == 0)
- return -EIO;
-
- for (i = 0; i < nr; i++) {
- folio = folios[i];
- npages = folio_nr_pages(folio);
- foff = start - folio_pos(folio);
- off = foff % PAGE_SIZE;
-
- for (j = foff / PAGE_SIZE; j < npages; j++) {
- size_t len2;
-
- len = min_t(size_t, max, PAGE_SIZE - off);
- len2 = min_t(size_t, len, SZ_2K);
-
- p = kmap_local_page(folio_page(folio, j));
- memcpy(&sample[s], p, len2);
- kunmap_local(p);
-
- s += len2;
-
- if (len2 < SZ_2K || s >= max - SZ_2K)
- return s;
-
- max -= len;
- if (max <= 0)
- return s;
-
- start += len;
- off = 0;
- index++;
- }
- }
- } while (nr == ARRAY_SIZE(folios));
+ struct iov_iter iter = *source;
+ size_t s = 0;
+
+ while (iov_iter_count(&iter) >= SZ_2K) {
+ size_t part = umin(umin(iov_iter_count(&iter), SZ_2K), max);
+ size_t n;
+
+ n = copy_from_iter(sample + s, part, &iter);
+ if (n != part)
+ return -EFAULT;
+
+ s += n;
+ max -= n;
+
+ if (iov_iter_count(&iter) < PAGE_SIZE - SZ_2K)
+ break;
+
+ iov_iter_advance(&iter, SZ_2K);
+ }
return s;
}
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index 880d7cf8b730..dd12f3eb61dc 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -97,7 +97,7 @@ static int reconn_set_ipaddr_from_hostname(struct TCP_Server_Info *server)
return rc;
}
-static void smb2_query_server_interfaces(struct work_struct *work)
+void smb2_query_server_interfaces(struct work_struct *work)
{
int rc;
int xid;
@@ -116,18 +116,22 @@ static void smb2_query_server_interfaces(struct work_struct *work)
rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc) {
- if (rc == -EOPNOTSUPP)
- return;
-
+ if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
- }
queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
(SMB_INTERFACE_POLL_INTERVAL * HZ));
}
+#define set_need_reco(server) \
+do { \
+ spin_lock(&server->srv_lock); \
+ if (server->tcpStatus != CifsExiting) \
+ server->tcpStatus = CifsNeedReconnect; \
+ spin_unlock(&server->srv_lock); \
+} while (0)
+
/*
* Update the tcpStatus for the server.
* This is used to signal the cifsd thread to call cifs_reconnect
@@ -141,39 +145,45 @@ void
cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
bool all_channels)
{
- struct TCP_Server_Info *pserver;
+ struct TCP_Server_Info *nserver;
struct cifs_ses *ses;
+ LIST_HEAD(reco);
int i;
- /* If server is a channel, select the primary channel */
- pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
-
/* if we need to signal just this channel */
if (!all_channels) {
- spin_lock(&server->srv_lock);
- if (server->tcpStatus != CifsExiting)
- server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&server->srv_lock);
+ set_need_reco(server);
return;
}
- spin_lock(&cifs_tcp_ses_lock);
- list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
- if (cifs_ses_exiting(ses))
- continue;
- spin_lock(&ses->chan_lock);
- for (i = 0; i < ses->chan_count; i++) {
- if (!ses->chans[i].server)
+ if (SERVER_IS_CHAN(server))
+ server = server->primary_server;
+ scoped_guard(spinlock, &cifs_tcp_ses_lock) {
+ set_need_reco(server);
+ list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ spin_lock(&ses->ses_lock);
+ if (ses->ses_status == SES_EXITING) {
+ spin_unlock(&ses->ses_lock);
continue;
-
- spin_lock(&ses->chans[i].server->srv_lock);
- if (ses->chans[i].server->tcpStatus != CifsExiting)
- ses->chans[i].server->tcpStatus = CifsNeedReconnect;
- spin_unlock(&ses->chans[i].server->srv_lock);
+ }
+ spin_lock(&ses->chan_lock);
+ for (i = 1; i < ses->chan_count; i++) {
+ nserver = ses->chans[i].server;
+ if (!nserver)
+ continue;
+ nserver->srv_count++;
+ list_add(&nserver->rlist, &reco);
+ }
+ spin_unlock(&ses->chan_lock);
+ spin_unlock(&ses->ses_lock);
}
- spin_unlock(&ses->chan_lock);
}
- spin_unlock(&cifs_tcp_ses_lock);
+
+ list_for_each_entry_safe(server, nserver, &reco, rlist) {
+ list_del_init(&server->rlist);
+ set_need_reco(server);
+ cifs_put_tcp_session(server, 0);
+ }
}
/*
@@ -311,21 +321,21 @@ cifs_abort_connection(struct TCP_Server_Info *server)
/* mark submitted MIDs for retry and issue callback */
INIT_LIST_HEAD(&retry_list);
cifs_dbg(FYI, "%s: moving mids to private list\n", __func__);
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
kref_get(&mid->refcount);
if (mid->mid_state == MID_REQUEST_SUBMITTED)
mid->mid_state = MID_RETRY_NEEDED;
list_move(&mid->qhead, &retry_list);
- mid->mid_flags |= MID_DELETED;
+ mid->deleted_from_q = true;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
cifs_server_unlock(server);
cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__);
list_for_each_entry_safe(mid, nmid, &retry_list, qhead) {
list_del_init(&mid->qhead);
- mid->callback(mid);
+ mid_execute_callback(mid);
release_mid(mid);
}
@@ -348,7 +358,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
}
cifs_dbg(FYI, "Mark tcp session as need reconnect\n");
- trace_smb3_reconnect(server->CurrentMid, server->conn_id,
+ trace_smb3_reconnect(server->current_mid, server->conn_id,
server->hostname);
server->tcpStatus = CifsNeedReconnect;
@@ -370,13 +380,20 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num
*
*/
static int __cifs_reconnect(struct TCP_Server_Info *server,
- bool mark_smb_session)
+ bool mark_smb_session, bool once)
{
int rc = 0;
if (!cifs_tcp_ses_needs_reconnect(server, 1))
return 0;
+ /*
+ * if smb session has been marked for reconnect, also reconnect all
+ * connections. This way, the other connections do not end up bad.
+ */
+ if (mark_smb_session)
+ cifs_signal_cifsd_for_reconnect(server, mark_smb_session);
+
cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session);
cifs_abort_connection(server);
@@ -385,7 +402,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
try_to_freeze();
cifs_server_lock(server);
- if (!cifs_swn_set_server_dstaddr(server)) {
+ if (!cifs_swn_set_server_dstaddr(server) &&
+ !SERVER_IS_CHAN(server)) {
/* resolve the hostname again to make sure that IP address is up-to-date */
rc = reconn_set_ipaddr_from_hostname(server);
cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc);
@@ -398,6 +416,9 @@ static int __cifs_reconnect(struct TCP_Server_Info *server,
if (rc) {
cifs_server_unlock(server);
cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc);
+ /* If was asked to reconnect only once, do not try it more times */
+ if (once)
+ break;
msleep(3000);
} else {
atomic_inc(&tcpSesReconnectCount);
@@ -563,19 +584,33 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server)
return rc;
}
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
if (!server->leaf_fullpath)
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
return reconnect_dfs_server(server);
}
#else
-int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+static int
+_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once)
{
- return __cifs_reconnect(server, mark_smb_session);
+ return __cifs_reconnect(server, mark_smb_session, once);
}
#endif
+int
+cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session)
+{
+ return _cifs_reconnect(server, mark_smb_session, false);
+}
+
+static int
+cifs_reconnect_once(struct TCP_Server_Info *server)
+{
+ return _cifs_reconnect(server, true, true);
+}
+
static void
cifs_echo_request(struct work_struct *work)
{
@@ -644,12 +679,12 @@ server_unresponsive(struct TCP_Server_Info *server)
/*
* If we're in the process of mounting a share or reconnecting a session
* and the server abruptly shut down (e.g. socket wasn't closed, packet
- * had been ACK'ed but no SMB response), don't wait longer than 20s to
- * negotiate protocol.
+ * had been ACK'ed but no SMB response), don't wait longer than 20s from
+ * when negotiate actually started.
*/
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate &&
- time_after(jiffies, server->lstrp + 20 * HZ)) {
+ time_after(jiffies, server->neg_start + 20 * HZ)) {
spin_unlock(&server->srv_lock);
cifs_reconnect(server, false);
return true;
@@ -802,26 +837,110 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type)
/* Regular SMB response */
return true;
case RFC1002_SESSION_KEEP_ALIVE:
+ /*
+ * RFC 1002 session keep alive can sent by the server only when
+ * we established a RFC 1002 session. But Samba servers send
+ * RFC 1002 session keep alive also over port 445 on which
+ * RFC 1002 session is not established.
+ */
cifs_dbg(FYI, "RFC 1002 session keep alive\n");
break;
case RFC1002_POSITIVE_SESSION_RESPONSE:
- cifs_dbg(FYI, "RFC 1002 positive session response\n");
+ /*
+ * RFC 1002 positive session response cannot be returned
+ * for SMB request. RFC 1002 session response is handled
+ * exclusively in ip_rfc1001_connect() function.
+ */
+ cifs_server_dbg(VFS, "RFC 1002 positive session response (unexpected)\n");
+ cifs_reconnect(server, true);
break;
case RFC1002_NEGATIVE_SESSION_RESPONSE:
/*
* We get this from Windows 98 instead of an error on
- * SMB negprot response.
+ * SMB negprot response, when we have not established
+ * RFC 1002 session (which means ip_rfc1001_connect()
+ * was skipped). Note that same still happens with
+ * Windows Server 2022 when connecting via port 139.
+ * So for this case when mount option -o nonbsessinit
+ * was not specified, try to reconnect with establishing
+ * RFC 1002 session. If new socket establishment with
+ * RFC 1002 session was successful then return to the
+ * mid's caller -EAGAIN, so it can retry the request.
*/
- cifs_dbg(FYI, "RFC 1002 negative session response\n");
- /* give server a second to clean up */
- msleep(1000);
- /*
- * Always try 445 first on reconnect since we get NACK
- * on some if we ever connected to port 139 (the NACK
- * is since we do not begin with RFC1001 session
- * initialize frame).
- */
- cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT);
+ if (!cifs_rdma_enabled(server) &&
+ server->tcpStatus == CifsInNegotiate &&
+ !server->with_rfc1001 &&
+ server->rfc1001_sessinit != 0) {
+ int rc, mid_rc;
+ struct mid_q_entry *mid, *nmid;
+ LIST_HEAD(dispose_list);
+
+ cifs_dbg(FYI, "RFC 1002 negative session response during SMB Negotiate, retrying with NetBIOS session\n");
+
+ /*
+ * Before reconnect, delete all pending mids for this
+ * server, so reconnect would not signal connection
+ * aborted error to mid's callbacks. Note that for this
+ * server there should be exactly one pending mid
+ * corresponding to SMB1/SMB2 Negotiate packet.
+ */
+ spin_lock(&server->mid_queue_lock);
+ list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) {
+ kref_get(&mid->refcount);
+ list_move(&mid->qhead, &dispose_list);
+ mid->deleted_from_q = true;
+ }
+ spin_unlock(&server->mid_queue_lock);
+
+ /* Now try to reconnect once with NetBIOS session. */
+ server->with_rfc1001 = true;
+ rc = cifs_reconnect_once(server);
+
+ /*
+ * If reconnect was successful then indicate -EAGAIN
+ * to mid's caller. If reconnect failed with -EAGAIN
+ * then mask it as -EHOSTDOWN, so mid's caller would
+ * know that it failed.
+ */
+ if (rc == 0)
+ mid_rc = -EAGAIN;
+ else if (rc == -EAGAIN)
+ mid_rc = -EHOSTDOWN;
+ else
+ mid_rc = rc;
+
+ /*
+ * After reconnect (either successful or unsuccessful)
+ * deliver reconnect status to mid's caller via mid's
+ * callback. Use MID_RC state which indicates that the
+ * return code should be read from mid_rc member.
+ */
+ list_for_each_entry_safe(mid, nmid, &dispose_list, qhead) {
+ list_del_init(&mid->qhead);
+ mid->mid_rc = mid_rc;
+ mid->mid_state = MID_RC;
+ mid_execute_callback(mid);
+ release_mid(mid);
+ }
+
+ /*
+ * If reconnect failed then wait two seconds. In most
+ * cases we were been called from the mount context and
+ * delivered failure to mid's callback will stop this
+ * receiver task thread and fails the mount process.
+ * So wait two seconds to prevent another reconnect
+ * in this task thread, which would be useless as the
+ * mount context will fail at all.
+ */
+ if (rc != 0)
+ msleep(2000);
+ } else {
+ cifs_server_dbg(VFS, "RFC 1002 negative session response (unexpected)\n");
+ cifs_reconnect(server, true);
+ }
+ break;
+ case RFC1002_RETARGET_SESSION_RESPONSE:
+ cifs_server_dbg(VFS, "RFC 1002 retarget session response (unexpected)\n");
cifs_reconnect(server, true);
break;
default:
@@ -838,7 +957,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
#ifdef CONFIG_CIFS_STATS2
mid->when_received = jiffies;
#endif
- spin_lock(&mid->server->mid_lock);
+ spin_lock(&mid->server->mid_queue_lock);
if (!malformed)
mid->mid_state = MID_RESPONSE_RECEIVED;
else
@@ -847,13 +966,13 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
* Trying to handle/dequeue a mid after the send_recv()
* function has finished processing it is a bug.
*/
- if (mid->mid_flags & MID_DELETED) {
- spin_unlock(&mid->server->mid_lock);
+ if (mid->deleted_from_q == true) {
+ spin_unlock(&mid->server->mid_queue_lock);
pr_warn_once("trying to dequeue a deleted mid\n");
} else {
list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
- spin_unlock(&mid->server->mid_lock);
+ mid->deleted_from_q = true;
+ spin_unlock(&mid->server->mid_queue_lock);
}
}
@@ -972,13 +1091,9 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
msleep(125);
if (cifs_rdma_enabled(server))
smbd_destroy(server);
-
if (server->ssocket) {
sock_release(server->ssocket);
server->ssocket = NULL;
-
- /* Release netns reference for the socket. */
- put_net(cifs_net_ns(server));
}
if (!list_empty(&server->pending_mid_q)) {
@@ -986,23 +1101,23 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
struct list_head *tmp, *tmp2;
LIST_HEAD(dispose_list);
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Clearing mid %llu\n", mid_entry->mid);
kref_get(&mid_entry->refcount);
mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
- mid_entry->mid_flags |= MID_DELETED;
+ mid_entry->deleted_from_q = true;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
/* now walk dispose list and issue callbacks */
list_for_each_safe(tmp, tmp2, &dispose_list) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Callback mid %llu\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
- mid_entry->callback(mid_entry);
+ mid_execute_callback(mid_entry);
release_mid(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
@@ -1026,7 +1141,6 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
*/
}
- /* Release netns reference for this server. */
put_net(cifs_net_ns(server));
kfree(server->leaf_fullpath);
kfree(server->hostname);
@@ -1128,7 +1242,7 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_hdr_credits(server->CurrentMid,
+ trace_smb3_hdr_credits(server->current_mid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_server_dbg(FYI, "%s: added %u credits total=%d\n",
@@ -1280,7 +1394,7 @@ next_pdu:
}
if (!mids[i]->multiRsp || mids[i]->multiEnd)
- mids[i]->callback(mids[i]);
+ mid_execute_callback(mids[i]);
release_mid(mids[i]);
} else if (server->ops->is_oplock_break &&
@@ -1672,10 +1786,9 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->ops = ctx->ops;
tcp_ses->vals = ctx->vals;
-
- /* Grab netns reference for this server. */
cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
+ tcp_ses->sign = ctx->sign;
tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
tcp_ses->noblockcnt = ctx->rootfs;
tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs;
@@ -1699,6 +1812,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
memcpy(tcp_ses->server_RFC1001_name,
ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL);
+ tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit;
+ tcp_ses->with_rfc1001 = false;
tcp_ses->session_estab = false;
tcp_ses->sequence_number = 0;
tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
@@ -1707,7 +1822,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
tcp_ses->compression.requested = ctx->compress;
spin_lock_init(&tcp_ses->req_lock);
spin_lock_init(&tcp_ses->srv_lock);
- spin_lock_init(&tcp_ses->mid_lock);
+ spin_lock_init(&tcp_ses->mid_queue_lock);
+ spin_lock_init(&tcp_ses->mid_counter_lock);
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
@@ -1729,12 +1845,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
*/
tcp_ses->tcpStatus = CifsNew;
++tcp_ses->srv_count;
+ tcp_ses->echo_interval = ctx->echo_interval * HZ;
- if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
- ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX)
- tcp_ses->echo_interval = ctx->echo_interval * HZ;
- else
- tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
if (tcp_ses->rdma) {
#ifndef CONFIG_CIFS_SMB_DIRECT
cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n");
@@ -1802,7 +1914,6 @@ smbd_connected:
out_err_crypto_release:
cifs_crypto_secmech_release(tcp_ses);
- /* Release netns reference for this server. */
put_net(cifs_net_ns(tcp_ses));
out_err:
@@ -1811,10 +1922,8 @@ out_err:
cifs_put_tcp_session(tcp_ses->primary_server, false);
kfree(tcp_ses->hostname);
kfree(tcp_ses->leaf_fullpath);
- if (tcp_ses->ssocket) {
+ if (tcp_ses->ssocket)
sock_release(tcp_ses->ssocket);
- put_net(cifs_net_ns(tcp_ses));
- }
kfree(tcp_ses);
}
return ERR_PTR(rc);
@@ -1825,9 +1934,8 @@ static int match_session(struct cifs_ses *ses,
struct smb3_fs_context *ctx,
bool match_super)
{
- if (ctx->sectype != Unspecified &&
- ctx->sectype != ses->sectype)
- return 0;
+ struct TCP_Server_Info *server = ses->server;
+ enum securityEnum ctx_sec, ses_sec;
if (!match_super && ctx->dfs_root_ses != ses->dfs_root_ses)
return 0;
@@ -1839,11 +1947,20 @@ static int match_session(struct cifs_ses *ses,
if (ses->chan_max < ctx->max_channels)
return 0;
- switch (ses->sectype) {
+ ctx_sec = server->ops->select_sectype(server, ctx->sectype);
+ ses_sec = server->ops->select_sectype(server, ses->sectype);
+
+ if (ctx_sec != ses_sec)
+ return 0;
+
+ switch (ctx_sec) {
+ case IAKerb:
case Kerberos:
if (!uid_eq(ctx->cred_uid, ses->cred_uid))
return 0;
break;
+ case NTLMv2:
+ case RawNTLMSSP:
default:
/* NULL username means anonymous session */
if (ses->user_name == NULL) {
@@ -2341,6 +2458,7 @@ retry_old_session:
ses->cred_uid = ctx->cred_uid;
ses->linux_uid = ctx->linux_uid;
+ ses->unicode = ctx->unicode;
ses->sectype = ctx->sectype;
ses->sign = ctx->sign;
@@ -2446,6 +2564,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
return 0;
if (tcon->nodelete != ctx->nodelete)
return 0;
+ if (tcon->posix_extensions != ctx->linux_ext)
+ return 0;
return 1;
}
@@ -2761,20 +2881,14 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
tcon->max_cached_dirs = ctx->max_cached_dirs;
tcon->nodelete = ctx->nodelete;
tcon->local_lease = ctx->local_lease;
- INIT_LIST_HEAD(&tcon->pending_opens);
tcon->status = TID_GOOD;
- INIT_DELAYED_WORK(&tcon->query_interfaces,
- smb2_query_server_interfaces);
if (ses->server->dialect >= SMB30_PROT_ID &&
(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
/* schedule query interfaces poll */
queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
(SMB_INTERFACE_POLL_INTERVAL * HZ));
}
-#ifdef CONFIG_CIFS_DFS_UPCALL
- INIT_DELAYED_WORK(&tcon->dfs_cache_work, dfs_cache_refresh);
-#endif
spin_lock(&cifs_tcp_ses_lock);
list_add(&tcon->tcon_list, &ses->tcon_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -2849,6 +2963,10 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
return 0;
if (old->ctx->reparse_type != new->ctx->reparse_type)
return 0;
+ if (old->ctx->nonativesocket != new->ctx->nonativesocket)
+ return 0;
+ if (old->ctx->symlink_type != new->ctx->symlink_type)
+ return 0;
return 1;
}
@@ -3014,6 +3132,44 @@ bind_socket(struct TCP_Server_Info *server)
}
static int
+smb_recv_kvec(struct TCP_Server_Info *server, struct msghdr *msg, size_t *recv)
+{
+ int rc = 0;
+ int retries = 0;
+ int msg_flags = server->noblocksnd ? MSG_DONTWAIT : 0;
+
+ *recv = 0;
+
+ while (msg_data_left(msg)) {
+ rc = sock_recvmsg(server->ssocket, msg, msg_flags);
+ if (rc == -EAGAIN) {
+ retries++;
+ if (retries >= 14 ||
+ (!server->noblocksnd && (retries > 2))) {
+ cifs_server_dbg(VFS, "sends on sock %p stuck for 15 seconds\n",
+ server->ssocket);
+ return -EAGAIN;
+ }
+ msleep(1 << retries);
+ continue;
+ }
+
+ if (rc < 0)
+ return rc;
+
+ if (rc == 0) {
+ cifs_dbg(FYI, "Received no data (TCP RST)\n");
+ return -ECONNABORTED;
+ }
+
+ /* recv was at least partially successful */
+ *recv += rc;
+ retries = 0; /* in case we get ENOSPC on the next send */
+ }
+ return 0;
+}
+
+static int
ip_rfc1001_connect(struct TCP_Server_Info *server)
{
int rc = 0;
@@ -3023,8 +3179,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* sessinit is sent but no second negprot
*/
struct rfc1002_session_packet req = {};
- struct smb_hdr *smb_buf = (struct smb_hdr *)&req;
+ struct rfc1002_session_packet resp = {};
+ struct msghdr msg = {};
+ struct kvec iov = {};
unsigned int len;
+ size_t sent;
+ size_t recv;
req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name);
@@ -3053,19 +3213,119 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* As per rfc1002, @len must be the number of bytes that follows the
* length field of a rfc1002 session request payload.
*/
- len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req);
+ len = sizeof(req.trailer.session_req);
+ req.type = RFC1002_SESSION_REQUEST;
+ req.flags = 0;
+ req.length = cpu_to_be16(len);
+ len += offsetof(typeof(req), trailer.session_req);
+ iov.iov_base = &req;
+ iov.iov_len = len;
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len);
+ rc = smb_send_kvec(server, &msg, &sent);
+ if (rc < 0 || len != sent)
+ return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED;
- smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len);
- rc = smb_send(server, smb_buf, len);
/*
* RFC1001 layer in at least one server requires very short break before
* negprot presumably because not expecting negprot to follow so fast.
- * This is a simple solution that works without complicating the code
- * and causes no significant slowing down on mount for everyone else
+ * For example DOS SMB servers cannot process negprot if it was received
+ * before the server sent response for SESSION_REQUEST packet. So, wait
+ * for the response, read it and parse it as it can contain useful error
+ * information (e.g. specified server name was incorrect). For example
+ * even the latest Windows Server 2022 SMB1 server over port 139 send
+ * error if its server name was in SESSION_REQUEST packet incorrect.
+ * Nowadays usage of port 139 is not common, so waiting for reply here
+ * does not slowing down mounting of common case (over port 445).
*/
- usleep_range(1000, 2000);
+ len = offsetof(typeof(resp), trailer);
+ iov.iov_base = &resp;
+ iov.iov_len = len;
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
+ rc = smb_recv_kvec(server, &msg, &recv);
+ if (rc < 0 || recv != len)
+ return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED;
+
+ switch (resp.type) {
+ case RFC1002_POSITIVE_SESSION_RESPONSE:
+ if (be16_to_cpu(resp.length) != 0) {
+ cifs_dbg(VFS, "RFC 1002 positive session response but with invalid non-zero length %u\n",
+ be16_to_cpu(resp.length));
+ return -EIO;
+ }
+ cifs_dbg(FYI, "RFC 1002 positive session response");
+ break;
+ case RFC1002_NEGATIVE_SESSION_RESPONSE:
+ /* Read RFC1002 response error code and convert it to errno in rc */
+ len = sizeof(resp.trailer.neg_ses_resp_error_code);
+ iov.iov_base = &resp.trailer.neg_ses_resp_error_code;
+ iov.iov_len = len;
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
+ if (be16_to_cpu(resp.length) == len &&
+ smb_recv_kvec(server, &msg, &recv) == 0 &&
+ recv == len) {
+ cifs_dbg(VFS, "RFC 1002 negative session response with error 0x%x\n",
+ resp.trailer.neg_ses_resp_error_code);
+ switch (resp.trailer.neg_ses_resp_error_code) {
+ case RFC1002_NOT_LISTENING_CALLED:
+ /* server does not listen for specified server name */
+ fallthrough;
+ case RFC1002_NOT_PRESENT:
+ /* server name is incorrect */
+ rc = -ENOENT;
+ cifs_dbg(VFS, "Server rejected NetBIOS servername %.15s\n",
+ server->server_RFC1001_name[0] ?
+ server->server_RFC1001_name :
+ DEFAULT_CIFS_CALLED_NAME);
+ cifs_dbg(VFS, "Specify correct NetBIOS servername in source path or with -o servern= option\n");
+ break;
+ case RFC1002_NOT_LISTENING_CALLING:
+ /* client name was not accepted by server */
+ rc = -EACCES;
+ cifs_dbg(VFS, "Server rejected NetBIOS clientname %.15s\n",
+ server->workstation_RFC1001_name[0] ?
+ server->workstation_RFC1001_name :
+ "LINUX_CIFS_CLNT");
+ cifs_dbg(VFS, "Specify correct NetBIOS clientname with -o netbiosname= option\n");
+ break;
+ case RFC1002_INSUFFICIENT_RESOURCE:
+ /* remote server resource error */
+ rc = -EREMOTEIO;
+ break;
+ case RFC1002_UNSPECIFIED_ERROR:
+ default:
+ /* other/unknown error */
+ rc = -EIO;
+ break;
+ }
+ } else {
+ cifs_dbg(VFS, "RFC 1002 negative session response\n");
+ rc = -EIO;
+ }
+ return rc;
+ case RFC1002_RETARGET_SESSION_RESPONSE:
+ cifs_dbg(VFS, "RFC 1002 retarget session response\n");
+ if (be16_to_cpu(resp.length) == sizeof(resp.trailer.retarget_resp)) {
+ len = sizeof(resp.trailer.retarget_resp);
+ iov.iov_base = &resp.trailer.retarget_resp;
+ iov.iov_len = len;
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
+ if (smb_recv_kvec(server, &msg, &recv) == 0 && recv == len) {
+ cifs_dbg(VFS, "Server wants to redirect connection\n");
+ cifs_dbg(VFS, "Remount with options -o ip=%pI4,port=%u\n",
+ &resp.trailer.retarget_resp.retarget_ip_addr,
+ be16_to_cpu(resp.trailer.retarget_resp.port));
+ }
+ }
+ cifs_dbg(VFS, "Closing connection\n");
+ /* FIXME: Should we automatically redirect to new retarget_resp server? */
+ return -EMULTIHOP;
+ default:
+ cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", resp.type);
+ return -EIO;
+ }
- return rc;
+ server->with_rfc1001 = true;
+ return 0;
}
static int
@@ -3101,20 +3361,17 @@ generic_ip_connect(struct TCP_Server_Info *server)
socket = server->ssocket;
} else {
struct net *net = cifs_net_ns(server);
+ struct sock *sk;
- rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket);
+ rc = sock_create_kern(net, sfamily, SOCK_STREAM,
+ IPPROTO_TCP, &server->ssocket);
if (rc < 0) {
cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
return rc;
}
- /*
- * Grab netns reference for the socket.
- *
- * It'll be released here, on error, or in clean_demultiplex_info() upon server
- * teardown.
- */
- get_net(net);
+ sk = server->ssocket->sk;
+ sk_net_refcnt_upgrade(sk);
/* BB other socket options to set KEEPALIVE, NODELAY? */
cifs_dbg(FYI, "Socket created\n");
@@ -3128,10 +3385,8 @@ generic_ip_connect(struct TCP_Server_Info *server)
}
rc = bind_socket(server);
- if (rc < 0) {
- put_net(cifs_net_ns(server));
+ if (rc < 0)
return rc;
- }
/*
* Eventually check for other socket options to change from
@@ -3168,17 +3423,22 @@ generic_ip_connect(struct TCP_Server_Info *server)
if (rc < 0) {
cifs_dbg(FYI, "Error %d connecting to server\n", rc);
trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
- put_net(cifs_net_ns(server));
sock_release(socket);
server->ssocket = NULL;
return rc;
}
trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr);
- if (sport == htons(RFC1001_PORT))
- rc = ip_rfc1001_connect(server);
- if (rc < 0)
- put_net(cifs_net_ns(server));
+ /*
+ * Establish RFC1001 NetBIOS session when it was explicitly requested
+ * by mount option -o nbsessinit, or when connecting to default RFC1001
+ * server port (139) and it was not explicitly disabled by mount option
+ * -o nonbsessinit.
+ */
+ if (server->with_rfc1001 ||
+ server->rfc1001_sessinit == 1 ||
+ (server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT)))
+ rc = ip_rfc1001_connect(server);
return rc;
}
@@ -3326,6 +3586,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb)
struct smb3_fs_context *ctx = cifs_sb->ctx;
INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
+ INIT_LIST_HEAD(&cifs_sb->tcon_sb_link);
spin_lock_init(&cifs_sb->tlink_tree_lock);
cifs_sb->tlink_tree = RB_ROOT;
@@ -3463,9 +3724,15 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
goto out;
}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
+ /*
+ * if new SMB3.11 POSIX extensions are supported, do not change anything in the
+ * path (i.e., do not remap / and \ and do not map any special characters)
+ */
+ if (tcon->posix_extensions) {
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
+ cifs_sb->mnt_cifs_flags &= ~(CIFS_MOUNT_MAP_SFM_CHR |
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
+ }
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
/* tell server which Unix caps we support */
@@ -3502,28 +3769,7 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
}
}
- /*
- * Clamp the rsize/wsize mount arguments if they are too big for the server
- * and set the rsize/wsize to the negotiated values if not passed in by
- * the user on mount
- */
- if ((cifs_sb->ctx->wsize == 0) ||
- (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) {
- cifs_sb->ctx->wsize =
- round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE);
- /*
- * in the very unlikely event that the server sent a max write size under PAGE_SIZE,
- * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096
- */
- if (cifs_sb->ctx->wsize == 0) {
- cifs_sb->ctx->wsize = PAGE_SIZE;
- cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n");
- }
- }
- if ((cifs_sb->ctx->rsize == 0) ||
- (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
- cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
-
+ cifs_negotiate_iosize(server, cifs_sb->ctx, tcon);
/*
* The cookie is initialized from volume info returned above.
* Inside cifs_fscache_get_super_cookie it checks
@@ -3558,6 +3804,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses,
tlink_rb_insert(&cifs_sb->tlink_tree, tlink);
spin_unlock(&cifs_sb->tlink_tree_lock);
+ spin_lock(&tcon->sb_list_lock);
+ list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list);
+ spin_unlock(&tcon->sb_list_lock);
+
queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks,
TLINK_IDLE_EXPIRE);
return 0;
@@ -3899,9 +4149,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
struct rb_root *root = &cifs_sb->tlink_tree;
struct rb_node *node;
struct tcon_link *tlink;
+ struct cifs_tcon *tcon = NULL;
cancel_delayed_work_sync(&cifs_sb->prune_tlinks);
+ if (cifs_sb->master_tlink) {
+ tcon = cifs_sb->master_tlink->tl_tcon;
+ if (tcon) {
+ spin_lock(&tcon->sb_list_lock);
+ list_del_init(&cifs_sb->tcon_sb_link);
+ spin_unlock(&tcon->sb_list_lock);
+ }
+ }
+
spin_lock(&cifs_sb->tlink_tree_lock);
while ((node = rb_first(root))) {
tlink = rb_entry(node, struct tcon_link, tl_rbnode);
@@ -3923,11 +4183,13 @@ int
cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
struct TCP_Server_Info *server)
{
+ bool in_retry = false;
int rc = 0;
if (!server->ops->need_neg || !server->ops->negotiate)
return -ENOSYS;
+retry:
/* only send once per connect */
spin_lock(&server->srv_lock);
if (server->tcpStatus != CifsGood &&
@@ -3944,9 +4206,18 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses,
}
server->tcpStatus = CifsInNegotiate;
+ server->neg_start = jiffies;
spin_unlock(&server->srv_lock);
rc = server->ops->negotiate(xid, ses, server);
+ if (rc == -EAGAIN) {
+ /* Allow one retry attempt */
+ if (!in_retry) {
+ in_retry = true;
+ goto retry;
+ }
+ rc = -EHOSTDOWN;
+ }
if (rc == 0) {
spin_lock(&server->srv_lock);
if (server->tcpStatus == CifsInNegotiate)
@@ -3969,7 +4240,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
struct TCP_Server_Info *server,
struct nls_table *nls_info)
{
- int rc = -ENOSYS;
+ int rc = 0;
struct TCP_Server_Info *pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr;
struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr;
@@ -4021,6 +4292,26 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
if (!linuxExtEnabled)
ses->capabilities &= (~server->vals->cap_unix);
+ /*
+ * Check if the server supports specified encoding mode.
+ * Zero value in vals->cap_unicode indidcates that chosen
+ * protocol dialect does not support non-UNICODE mode.
+ */
+ if (ses->unicode == 1 && server->vals->cap_unicode != 0 &&
+ !(server->capabilities & server->vals->cap_unicode)) {
+ cifs_dbg(VFS, "Server does not support mounting in UNICODE mode\n");
+ rc = -EOPNOTSUPP;
+ } else if (ses->unicode == 0 && server->vals->cap_unicode == 0) {
+ cifs_dbg(VFS, "Server does not support mounting in non-UNICODE mode\n");
+ rc = -EOPNOTSUPP;
+ } else if (ses->unicode == 0) {
+ /*
+ * When UNICODE mode was explicitly disabled then
+ * do not announce client UNICODE capability.
+ */
+ ses->capabilities &= (~server->vals->cap_unicode);
+ }
+
if (ses->auth_key.response) {
cifs_dbg(FYI, "Free previous auth_key.response = %p\n",
ses->auth_key.response);
@@ -4033,8 +4324,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n",
server->sec_mode, server->capabilities, server->timeAdj);
- if (server->ops->sess_setup)
- rc = server->ops->sess_setup(xid, ses, server, nls_info);
+ if (!rc) {
+ if (server->ops->sess_setup)
+ rc = server->ops->sess_setup(xid, ses, server, nls_info);
+ else
+ rc = -ENOSYS;
+ }
if (rc) {
cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc);
@@ -4104,6 +4399,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
ctx->seal = master_tcon->seal;
ctx->witness = master_tcon->use_witness;
ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses;
+ ctx->unicode = master_tcon->ses->unicode;
rc = cifs_set_vol_auth(ctx, master_tcon->ses);
if (rc) {
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index dad521336b5e..f65a8a90ba27 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -150,25 +150,27 @@ again:
if (rc)
continue;
- if (tgt.flags & DFSREF_STORAGE_SERVER) {
- rc = cifs_mount_get_tcon(mnt_ctx);
- if (!rc)
- rc = cifs_is_path_remote(mnt_ctx);
+ rc = cifs_mount_get_tcon(mnt_ctx);
+ if (rc) {
+ if (tgt.server_type == DFS_TYPE_LINK &&
+ DFS_INTERLINK(tgt.flags))
+ rc = -EREMOTE;
+ } else {
+ rc = cifs_is_path_remote(mnt_ctx);
if (!rc) {
ref_walk_set_tgt_hint(rw);
break;
}
- if (rc != -EREMOTE)
- continue;
}
-
- rc = ref_walk_advance(rw);
- if (!rc) {
- rc = setup_dfs_ref(&tgt, rw);
- if (rc)
- break;
- ref_walk_mark_end(rw);
- goto again;
+ if (rc == -EREMOTE) {
+ rc = ref_walk_advance(rw);
+ if (!rc) {
+ rc = setup_dfs_ref(&tgt, rw);
+ if (rc)
+ break;
+ ref_walk_mark_end(rw);
+ goto again;
+ }
}
}
} while (rc && ref_walk_descend(rw));
diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h
index ed4cd7cf1ec6..e60f0a24a8a1 100644
--- a/fs/smb/client/dfs.h
+++ b/fs/smb/client/dfs.h
@@ -188,4 +188,11 @@ static inline void dfs_put_root_smb_sessions(struct list_head *head)
}
}
+static inline const char *dfs_ses_refpath(struct cifs_ses *ses)
+{
+ const char *path = ses->server->leaf_fullpath;
+
+ return path ? path + 1 : ERR_PTR(-ENOENT);
+}
+
#endif /* _CIFS_DFS_H */
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 5022bb1f122a..4dada26d56b5 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -1136,33 +1136,19 @@ static bool is_ses_good(struct cifs_ses *ses)
return ret;
}
-static char *get_ses_refpath(struct cifs_ses *ses)
-{
- struct TCP_Server_Info *server = ses->server;
- char *path = ERR_PTR(-ENOENT);
-
- if (server->leaf_fullpath) {
- path = kstrdup(server->leaf_fullpath + 1, GFP_KERNEL);
- if (!path)
- path = ERR_PTR(-ENOMEM);
- }
- return path;
-}
-
/* Refresh dfs referral of @ses */
static void refresh_ses_referral(struct cifs_ses *ses)
{
struct cache_entry *ce;
unsigned int xid;
- char *path;
+ const char *path;
int rc = 0;
xid = get_xid();
- path = get_ses_refpath(ses);
+ path = dfs_ses_refpath(ses);
if (IS_ERR(path)) {
rc = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -1181,7 +1167,6 @@ static void refresh_ses_referral(struct cifs_ses *ses)
out:
free_xid(xid);
- kfree(path);
}
static int __refresh_tcon_referral(struct cifs_tcon *tcon,
@@ -1231,19 +1216,18 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
struct dfs_info3_param *refs = NULL;
struct cache_entry *ce;
struct cifs_ses *ses;
- unsigned int xid;
bool needs_refresh;
- char *path;
+ const char *path;
+ unsigned int xid;
int numrefs = 0;
int rc = 0;
xid = get_xid();
ses = tcon->ses;
- path = get_ses_refpath(ses);
+ path = dfs_ses_refpath(ses);
if (IS_ERR(path)) {
rc = PTR_ERR(path);
- path = NULL;
goto out;
}
@@ -1271,7 +1255,6 @@ static void refresh_tcon_referral(struct cifs_tcon *tcon, bool force_refresh)
out:
free_xid(xid);
- kfree(path);
free_dfs_info_array(refs, numrefs);
}
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index d1e95632ac54..5223edf6d11a 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -23,6 +23,7 @@
#include "fs_context.h"
#include "cifs_ioctl.h"
#include "fscache.h"
+#include "cached_dir.h"
static void
renew_parental_timestamps(struct dentry *direntry)
@@ -189,7 +190,9 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
int disposition;
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ struct cached_fid *parent_cfid = NULL;
int rdwr_for_fscache = 0;
+ __le32 lease_flags = 0;
*oplock = 0;
if (tcon->ses->server->oplocks)
@@ -311,7 +314,28 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
+
retry_open:
+ if (tcon->cfids && direntry->d_parent && server->dialect >= SMB30_PROT_ID) {
+ parent_cfid = NULL;
+ spin_lock(&tcon->cfids->cfid_list_lock);
+ list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) {
+ if (parent_cfid->dentry == direntry->d_parent) {
+ cifs_dbg(FYI, "found a parent cached file handle\n");
+ if (parent_cfid->has_lease && parent_cfid->time) {
+ lease_flags
+ |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE;
+ memcpy(fid->parent_lease_key,
+ parent_cfid->fid.lease_key,
+ SMB2_LEASE_KEY_SIZE);
+ parent_cfid->dirents.is_valid = false;
+ }
+ break;
+ }
+ }
+ spin_unlock(&tcon->cfids->cfid_list_lock);
+ }
+
oparms = (struct cifs_open_parms) {
.tcon = tcon,
.cifs_sb = cifs_sb,
@@ -320,6 +344,7 @@ retry_open:
.disposition = disposition,
.path = full_path,
.fid = fid,
+ .lease_flags = lease_flags,
.mode = mode,
};
rc = server->ops->open(xid, &oparms, oplock, buf);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 79de2f2f9c41..186e061068be 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -52,6 +52,7 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
struct netfs_io_stream *stream = &req->rreq.io_streams[subreq->stream_nr];
struct TCP_Server_Info *server;
struct cifsFileInfo *open_file = req->cfile;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(wdata->rreq->inode->i_sb);
size_t wsize = req->rreq.wsize;
int rc;
@@ -63,6 +64,10 @@ static void cifs_prepare_write(struct netfs_io_subrequest *subreq)
server = cifs_pick_channel(tlink_tcon(open_file->tlink)->ses);
wdata->server = server;
+ if (cifs_sb->ctx->wsize == 0)
+ cifs_negotiate_wsize(server, cifs_sb->ctx,
+ tlink_tcon(req->cfile->tlink));
+
retry:
if (open_file->invalidHandle) {
rc = cifs_reopen_file(open_file, false);
@@ -130,7 +135,7 @@ fail:
else
trace_netfs_sreq(subreq, netfs_sreq_trace_fail);
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
- cifs_write_subrequest_terminated(wdata, rc, false);
+ cifs_write_subrequest_terminated(wdata, rc);
goto out;
}
@@ -147,7 +152,7 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
- struct TCP_Server_Info *server = req->server;
+ struct TCP_Server_Info *server;
struct cifs_sb_info *cifs_sb = CIFS_SB(rreq->inode->i_sb);
size_t size;
int rc = 0;
@@ -156,12 +161,13 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq)
rdata->xid = get_xid();
rdata->have_xid = true;
}
+
+ server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
rdata->server = server;
if (cifs_sb->ctx->rsize == 0)
- cifs_sb->ctx->rsize =
- server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink),
- cifs_sb->ctx);
+ cifs_negotiate_rsize(server, cifs_sb->ctx,
+ tlink_tcon(req->cfile->tlink));
rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize,
&size, &rdata->credits);
@@ -198,7 +204,7 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq)
struct netfs_io_request *rreq = subreq->rreq;
struct cifs_io_subrequest *rdata = container_of(subreq, struct cifs_io_subrequest, subreq);
struct cifs_io_request *req = container_of(subreq->rreq, struct cifs_io_request, rreq);
- struct TCP_Server_Info *server = req->server;
+ struct TCP_Server_Info *server = rdata->server;
int rc = 0;
cifs_dbg(FYI, "%s: op=%08x[%x] mapping=%p len=%zu/%zu\n",
@@ -217,7 +223,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq)
goto failed;
}
- if (subreq->rreq->origin != NETFS_DIO_READ)
+ if (subreq->rreq->origin != NETFS_UNBUFFERED_READ &&
+ subreq->rreq->origin != NETFS_DIO_READ)
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
@@ -266,7 +273,6 @@ static int cifs_init_request(struct netfs_io_request *rreq, struct file *file)
open_file = file->private_data;
rreq->netfs_priv = file->private_data;
req->cfile = cifsFileInfo_get(open_file);
- req->server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
req->pid = req->cfile->pid;
} else if (rreq->origin != NETFS_WRITEBACK) {
@@ -387,7 +393,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
spin_unlock(&tcon->tc_lock);
/*
- * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * BB Add call to evict_inodes(sb) for all superblocks mounted
* to this tcon.
*/
}
@@ -997,15 +1003,23 @@ int cifs_open(struct inode *inode, struct file *file)
rc = cifs_get_readable_path(tcon, full_path, &cfile);
}
if (rc == 0) {
- if (file->f_flags == cfile->f_flags) {
+ unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
+ unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
+
+ if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) &&
+ (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) {
file->private_data = cfile;
spin_lock(&CIFS_I(inode)->deferred_lock);
cifs_del_deferred_close(cfile);
spin_unlock(&CIFS_I(inode)->deferred_lock);
goto use_cache;
- } else {
- _cifsFileInfo_put(cfile, true, false);
}
+ _cifsFileInfo_put(cfile, true, false);
+ } else {
+ /* hard link on the defeered close file */
+ rc = cifs_get_hardlink_path(tcon, inode, file);
+ if (rc)
+ cifs_close_deferred_file(CIFS_I(inode));
}
if (server->oplocks)
@@ -2070,6 +2084,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest)
list_move(li, dest);
}
+int
+cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode,
+ struct file *file)
+{
+ struct cifsFileInfo *open_file = NULL;
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
+ int rc = 0;
+
+ spin_lock(&tcon->open_file_lock);
+ spin_lock(&cinode->open_file_lock);
+
+ list_for_each_entry(open_file, &cinode->openFileList, flist) {
+ if (file->f_flags == open_file->f_flags) {
+ rc = -EINVAL;
+ break;
+ }
+ }
+
+ spin_unlock(&cinode->open_file_lock);
+ spin_unlock(&tcon->open_file_lock);
+ return rc;
+}
+
void
cifs_free_llist(struct list_head *llist)
{
@@ -2394,8 +2431,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
return rc;
}
-void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result,
- bool was_async)
+void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result)
{
struct netfs_io_request *wreq = wdata->rreq;
struct netfs_inode *ictx = netfs_inode(wreq->inode);
@@ -2412,7 +2448,7 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t
netfs_resize_file(ictx, wrend, true);
}
- netfs_write_subrequest_terminated(&wdata->subreq, result, was_async);
+ netfs_write_subrequest_terminated(&wdata->subreq, result);
}
struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode,
@@ -2963,38 +2999,38 @@ static const struct vm_operations_struct cifs_file_vm_ops = {
.page_mkwrite = cifs_page_mkwrite,
};
-int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
+int cifs_file_strict_mmap_prepare(struct vm_area_desc *desc)
{
int xid, rc = 0;
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(desc->file);
xid = get_xid();
if (!CIFS_CACHE_READ(CIFS_I(inode)))
rc = cifs_zap_mapping(inode);
if (!rc)
- rc = generic_file_mmap(file, vma);
+ rc = generic_file_mmap_prepare(desc);
if (!rc)
- vma->vm_ops = &cifs_file_vm_ops;
+ desc->vm_ops = &cifs_file_vm_ops;
free_xid(xid);
return rc;
}
-int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+int cifs_file_mmap_prepare(struct vm_area_desc *desc)
{
int rc, xid;
xid = get_xid();
- rc = cifs_revalidate_file(file);
+ rc = cifs_revalidate_file(desc->file);
if (rc)
cifs_dbg(FYI, "Validation prior to mmap failed, error=%d\n",
rc);
if (!rc)
- rc = generic_file_mmap(file, vma);
+ rc = generic_file_mmap_prepare(desc);
if (!rc)
- vma->vm_ops = &cifs_file_vm_ops;
+ desc->vm_ops = &cifs_file_vm_ops;
free_xid(xid);
return rc;
@@ -3052,7 +3088,8 @@ void cifs_oplock_break(struct work_struct *work)
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
oplock_break);
struct inode *inode = d_inode(cfile->dentry);
- struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
@@ -3062,6 +3099,12 @@ void cifs_oplock_break(struct work_struct *work)
__u64 persistent_fid, volatile_fid;
__u16 net_fid;
+ /*
+ * Hold a reference to the superblock to prevent it and its inodes from
+ * being freed while we are accessing cinode. Otherwise, _cifsFileInfo_put()
+ * may release the last reference to the sb and trigger inode eviction.
+ */
+ cifs_sb_active(sb);
wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
TASK_UNINTERRUPTIBLE);
@@ -3081,7 +3124,7 @@ void cifs_oplock_break(struct work_struct *work)
cinode->oplock = 0;
}
- if (inode && S_ISREG(inode->i_mode)) {
+ if (S_ISREG(inode->i_mode)) {
if (CIFS_CACHE_READ(cinode))
break_lease(inode, O_RDONLY);
else
@@ -3134,6 +3177,7 @@ oplock_break_ack:
cifs_put_tlink(tlink);
out:
cifs_done_oplock_break(cinode);
+ cifs_sb_deactive(sb);
}
static int cifs_swap_activate(struct swap_info_struct *sis,
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 5381f05420bc..072383899e81 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -133,6 +133,9 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_flag("rootfs", Opt_rootfs),
fsparam_flag("compress", Opt_compress),
fsparam_flag("witness", Opt_witness),
+ fsparam_flag_no("nativesocket", Opt_nativesocket),
+ fsparam_flag_no("unicode", Opt_unicode),
+ fsparam_flag_no("nbsessinit", Opt_nbsessinit),
/* Mount options which take uid or gid */
fsparam_uid("backupuid", Opt_backupuid),
@@ -170,6 +173,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_string("username", Opt_user),
fsparam_string("pass", Opt_pass),
fsparam_string("password", Opt_pass),
+ fsparam_string("pass2", Opt_pass2),
fsparam_string("password2", Opt_pass2),
fsparam_string("ip", Opt_ip),
fsparam_string("addr", Opt_ip),
@@ -185,6 +189,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_string("cache", Opt_cache),
fsparam_string("reparse", Opt_reparse),
fsparam_string("upcall_target", Opt_upcalltarget),
+ fsparam_string("symlink", Opt_symlink),
+ fsparam_string("symlinkroot", Opt_symlinkroot),
/* Arguments that should be ignored */
fsparam_flag("guest", Opt_ignore),
@@ -332,6 +338,7 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte
static const match_table_t reparse_flavor_tokens = {
{ Opt_reparse_default, "default" },
+ { Opt_reparse_none, "none" },
{ Opt_reparse_nfs, "nfs" },
{ Opt_reparse_wsl, "wsl" },
{ Opt_reparse_err, NULL },
@@ -346,6 +353,9 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value,
case Opt_reparse_default:
ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
break;
+ case Opt_reparse_none:
+ ctx->reparse_type = CIFS_REPARSE_TYPE_NONE;
+ break;
case Opt_reparse_nfs:
ctx->reparse_type = CIFS_REPARSE_TYPE_NFS;
break;
@@ -359,6 +369,55 @@ static int parse_reparse_flavor(struct fs_context *fc, char *value,
return 0;
}
+static const match_table_t symlink_flavor_tokens = {
+ { Opt_symlink_default, "default" },
+ { Opt_symlink_none, "none" },
+ { Opt_symlink_native, "native" },
+ { Opt_symlink_unix, "unix" },
+ { Opt_symlink_mfsymlinks, "mfsymlinks" },
+ { Opt_symlink_sfu, "sfu" },
+ { Opt_symlink_nfs, "nfs" },
+ { Opt_symlink_wsl, "wsl" },
+ { Opt_symlink_err, NULL },
+};
+
+static int parse_symlink_flavor(struct fs_context *fc, char *value,
+ struct smb3_fs_context *ctx)
+{
+ substring_t args[MAX_OPT_ARGS];
+
+ switch (match_token(value, symlink_flavor_tokens, args)) {
+ case Opt_symlink_default:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT;
+ break;
+ case Opt_symlink_none:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_NONE;
+ break;
+ case Opt_symlink_native:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_NATIVE;
+ break;
+ case Opt_symlink_unix:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_UNIX;
+ break;
+ case Opt_symlink_mfsymlinks:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_MFSYMLINKS;
+ break;
+ case Opt_symlink_sfu:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_SFU;
+ break;
+ case Opt_symlink_nfs:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_NFS;
+ break;
+ case Opt_symlink_wsl:
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_WSL;
+ break;
+ default:
+ cifs_errorf(fc, "bad symlink= option: %s\n", value);
+ return 1;
+ }
+ return 0;
+}
+
#define DUP_CTX_STR(field) \
do { \
if (ctx->field) { \
@@ -386,6 +445,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
new_ctx->iocharset = NULL;
new_ctx->leaf_fullpath = NULL;
new_ctx->dns_dom = NULL;
+ new_ctx->symlinkroot = NULL;
/*
* Make sure to stay in sync with smb3_cleanup_fs_context_contents()
*/
@@ -401,6 +461,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
DUP_CTX_STR(iocharset);
DUP_CTX_STR(leaf_fullpath);
DUP_CTX_STR(dns_dom);
+ DUP_CTX_STR(symlinkroot);
return 0;
}
@@ -904,6 +965,14 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
cifs_errorf(fc, "can not change iocharset during remount\n");
return -EINVAL;
}
+ if (new_ctx->unicode != old_ctx->unicode) {
+ cifs_errorf(fc, "can not change unicode during remount\n");
+ return -EINVAL;
+ }
+ if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) {
+ cifs_errorf(fc, "can not change nbsessinit during remount\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -952,6 +1021,7 @@ static int smb3_reconfigure(struct fs_context *fc)
struct dentry *root = fc->root;
struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses;
+ unsigned int rsize = ctx->rsize, wsize = ctx->wsize;
char *new_password = NULL, *new_password2 = NULL;
bool need_recon = false;
int rc;
@@ -1034,11 +1104,8 @@ static int smb3_reconfigure(struct fs_context *fc)
STEAL_STRING(cifs_sb, ctx, iocharset);
/* if rsize or wsize not passed in on remount, use previous values */
- if (ctx->rsize == 0)
- ctx->rsize = cifs_sb->ctx->rsize;
- if (ctx->wsize == 0)
- ctx->wsize = cifs_sb->ctx->wsize;
-
+ ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize;
+ ctx->wsize = wsize ? CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize;
smb3_cleanup_fs_context_contents(cifs_sb->ctx);
rc = smb3_fs_context_dup(cifs_sb->ctx, ctx);
@@ -1059,6 +1126,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
int i, opt;
bool is_smb3 = !strcmp(fc->fs_type->name, "smb3");
bool skip_parsing = false;
+ char *hostname;
cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key);
@@ -1073,6 +1141,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
} else if (!strcmp("user", param->key) || !strcmp("username", param->key)) {
skip_parsing = true;
opt = Opt_user;
+ } else if (!strcmp("pass2", param->key) || !strcmp("password2", param->key)) {
+ skip_parsing = true;
+ opt = Opt_pass2;
}
}
@@ -1239,7 +1310,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
__func__);
goto cifs_parse_mount_err;
}
- ctx->bsize = result.uint_32;
+ ctx->bsize = CIFS_ALIGN_BSIZE(fc, result.uint_32);
ctx->got_bsize = true;
break;
case Opt_rasize:
@@ -1263,40 +1334,31 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->rasize = result.uint_32;
break;
case Opt_rsize:
- ctx->rsize = result.uint_32;
+ ctx->rsize = CIFS_ALIGN_RSIZE(fc, result.uint_32);
ctx->got_rsize = true;
+ ctx->vol_rsize = ctx->rsize;
break;
case Opt_wsize:
- ctx->wsize = result.uint_32;
+ ctx->wsize = CIFS_ALIGN_WSIZE(fc, result.uint_32);
ctx->got_wsize = true;
- if (ctx->wsize % PAGE_SIZE != 0) {
- ctx->wsize = round_down(ctx->wsize, PAGE_SIZE);
- if (ctx->wsize == 0) {
- ctx->wsize = PAGE_SIZE;
- cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE);
- } else {
- cifs_dbg(VFS,
- "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n",
- ctx->wsize, PAGE_SIZE);
- }
- }
+ ctx->vol_wsize = ctx->wsize;
break;
case Opt_acregmax:
- ctx->acregmax = HZ * result.uint_32;
- if (ctx->acregmax > CIFS_MAX_ACTIMEO) {
+ if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
cifs_errorf(fc, "acregmax too large\n");
goto cifs_parse_mount_err;
}
+ ctx->acregmax = HZ * result.uint_32;
break;
case Opt_acdirmax:
- ctx->acdirmax = HZ * result.uint_32;
- if (ctx->acdirmax > CIFS_MAX_ACTIMEO) {
+ if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
cifs_errorf(fc, "acdirmax too large\n");
goto cifs_parse_mount_err;
}
+ ctx->acdirmax = HZ * result.uint_32;
break;
case Opt_actimeo:
- if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) {
+ if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) {
cifs_errorf(fc, "timeout too large\n");
goto cifs_parse_mount_err;
}
@@ -1308,13 +1370,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->acdirmax = ctx->acregmax = HZ * result.uint_32;
break;
case Opt_closetimeo:
- ctx->closetimeo = HZ * result.uint_32;
- if (ctx->closetimeo > SMB3_MAX_DCLOSETIMEO) {
+ if (result.uint_32 > SMB3_MAX_DCLOSETIMEO / HZ) {
cifs_errorf(fc, "closetimeo too large\n");
goto cifs_parse_mount_err;
}
+ ctx->closetimeo = HZ * result.uint_32;
break;
case Opt_echo_interval:
+ if (result.uint_32 < SMB_ECHO_INTERVAL_MIN ||
+ result.uint_32 > SMB_ECHO_INTERVAL_MAX) {
+ cifs_errorf(fc, "echo interval is out of bounds\n");
+ goto cifs_parse_mount_err;
+ }
ctx->echo_interval = result.uint_32;
break;
case Opt_snapshot:
@@ -1381,6 +1448,16 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
cifs_errorf(fc, "OOM when copying UNC string\n");
goto cifs_parse_mount_err;
}
+ hostname = extract_hostname(ctx->UNC);
+ if (IS_ERR(hostname)) {
+ cifs_errorf(fc, "Cannot extract hostname from UNC string\n");
+ goto cifs_parse_mount_err;
+ }
+ /* last byte, type, is 0x20 for servr type */
+ memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL);
+ for (i = 0; i < RFC1001_NAME_LEN && hostname[i] != 0; i++)
+ ctx->target_rfc1001_name[i] = toupper(hostname[i]);
+ kfree(hostname);
break;
case Opt_user:
kfree(ctx->username);
@@ -1398,35 +1475,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
pr_warn("username too long\n");
goto cifs_parse_mount_err;
}
- ctx->username = kstrdup(param->string, GFP_KERNEL);
- if (ctx->username == NULL) {
- cifs_errorf(fc, "OOM when copying username string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->username = no_free_ptr(param->string);
break;
case Opt_pass:
kfree_sensitive(ctx->password);
ctx->password = NULL;
if (strlen(param->string) == 0)
break;
-
- ctx->password = kstrdup(param->string, GFP_KERNEL);
- if (ctx->password == NULL) {
- cifs_errorf(fc, "OOM when copying password string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->password = no_free_ptr(param->string);
break;
case Opt_pass2:
kfree_sensitive(ctx->password2);
ctx->password2 = NULL;
if (strlen(param->string) == 0)
break;
-
- ctx->password2 = kstrdup(param->string, GFP_KERNEL);
- if (ctx->password2 == NULL) {
- cifs_errorf(fc, "OOM when copying password2 string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->password2 = no_free_ptr(param->string);
break;
case Opt_ip:
if (strlen(param->string) == 0) {
@@ -1449,11 +1512,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
}
kfree(ctx->domainname);
- ctx->domainname = kstrdup(param->string, GFP_KERNEL);
- if (ctx->domainname == NULL) {
- cifs_errorf(fc, "OOM when copying domainname string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->domainname = no_free_ptr(param->string);
cifs_dbg(FYI, "Domain name set\n");
break;
case Opt_srcaddr:
@@ -1473,11 +1532,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (strncasecmp(param->string, "default", 7) != 0) {
kfree(ctx->iocharset);
- ctx->iocharset = kstrdup(param->string, GFP_KERNEL);
- if (ctx->iocharset == NULL) {
- cifs_errorf(fc, "OOM when copying iocharset string\n");
- goto cifs_parse_mount_err;
- }
+ ctx->iocharset = no_free_ptr(param->string);
}
/* if iocharset not set then load_nls_default
* is used by caller
@@ -1524,6 +1579,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (i == RFC1001_NAME_LEN && param->string[i] != 0)
pr_warn("server netbiosname longer than 15 truncated\n");
break;
+ case Opt_nbsessinit:
+ ctx->rfc1001_sessinit = !result.negated;
+ cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit);
+ break;
case Opt_ver:
/* version of mount userspace tools, not dialect */
/* If interface changes in mount.cifs bump to new ver */
@@ -1565,6 +1624,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
ctx->witness = true;
pr_warn_once("Witness protocol support is experimental\n");
break;
+ case Opt_unicode:
+ ctx->unicode = !result.negated;
+ cifs_dbg(FYI, "unicode set to %d\n", ctx->unicode);
+ break;
case Opt_rootfs:
#ifndef CONFIG_CIFS_ROOT
cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n");
@@ -1589,6 +1652,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
pr_warn_once("conflicting posix mount options specified\n");
ctx->linux_ext = 1;
ctx->no_linux_ext = 0;
+ ctx->nonativesocket = 1; /* POSIX mounts use NFS style reparse points */
}
break;
case Opt_nocase:
@@ -1727,6 +1791,27 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
if (parse_reparse_flavor(fc, param->string, ctx))
goto cifs_parse_mount_err;
break;
+ case Opt_nativesocket:
+ ctx->nonativesocket = result.negated;
+ break;
+ case Opt_symlink:
+ if (parse_symlink_flavor(fc, param->string, ctx))
+ goto cifs_parse_mount_err;
+ break;
+ case Opt_symlinkroot:
+ if (param->string[0] != '/') {
+ cifs_errorf(fc, "symlinkroot mount options must be absolute path\n");
+ goto cifs_parse_mount_err;
+ }
+ if (strnlen(param->string, PATH_MAX) == PATH_MAX) {
+ cifs_errorf(fc, "symlinkroot path too long (max path length: %u)\n",
+ PATH_MAX - 1);
+ goto cifs_parse_mount_err;
+ }
+ kfree(ctx->symlinkroot);
+ ctx->symlinkroot = param->string;
+ param->string = NULL;
+ break;
}
/* case Opt_ignore: - is ignored as expected ... */
@@ -1765,13 +1850,16 @@ int smb3_init_fs_context(struct fs_context *fc)
memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN);
for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++)
ctx->source_rfc1001_name[i] = toupper(nodename[i]);
-
ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0;
+
/*
* null target name indicates to use *SMBSERVR default called name
* if we end up sending RFC1001 session initialize
*/
ctx->target_rfc1001_name[0] = 0;
+
+ ctx->rfc1001_sessinit = -1; /* autodetect based on port number */
+
ctx->cred_uid = current_uid();
ctx->linux_uid = current_uid();
ctx->linux_gid = current_gid();
@@ -1821,6 +1909,10 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->retrans = 1;
ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
+ ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT;
+ ctx->nonativesocket = 0;
+
+ ctx->unicode = -1; /* autodetect, but prefer UNICODE mode */
/*
* short int override_uid = -1;
@@ -1867,6 +1959,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
ctx->leaf_fullpath = NULL;
kfree(ctx->dns_dom);
ctx->dns_dom = NULL;
+ kfree(ctx->symlinkroot);
+ ctx->symlinkroot = NULL;
}
void
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 8813533345ee..b0fec6b9a23b 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -20,6 +20,21 @@
cifs_dbg(VFS, fmt, ## __VA_ARGS__); \
} while (0)
+static inline size_t cifs_io_align(struct fs_context *fc,
+ const char *name, size_t size)
+{
+ if (!size || !IS_ALIGNED(size, PAGE_SIZE)) {
+ cifs_errorf(fc, "unaligned %s, making it a multiple of %lu bytes\n",
+ name, PAGE_SIZE);
+ size = umax(round_down(size, PAGE_SIZE), PAGE_SIZE);
+ }
+ return size;
+}
+
+#define CIFS_ALIGN_WSIZE(_fc, _size) cifs_io_align(_fc, "wsize", _size)
+#define CIFS_ALIGN_RSIZE(_fc, _size) cifs_io_align(_fc, "rsize", _size)
+#define CIFS_ALIGN_BSIZE(_fc, _size) cifs_io_align(_fc, "bsize", _size)
+
enum smb_version {
Smb_1 = 1,
Smb_20,
@@ -43,11 +58,24 @@ enum {
enum cifs_reparse_parm {
Opt_reparse_default,
+ Opt_reparse_none,
Opt_reparse_nfs,
Opt_reparse_wsl,
Opt_reparse_err
};
+enum cifs_symlink_parm {
+ Opt_symlink_default,
+ Opt_symlink_none,
+ Opt_symlink_native,
+ Opt_symlink_unix,
+ Opt_symlink_mfsymlinks,
+ Opt_symlink_sfu,
+ Opt_symlink_nfs,
+ Opt_symlink_wsl,
+ Opt_symlink_err
+};
+
enum cifs_sec_param {
Opt_sec_krb5,
Opt_sec_krb5i,
@@ -122,6 +150,7 @@ enum cifs_param {
Opt_witness,
Opt_is_upcall_target_mount,
Opt_is_upcall_target_application,
+ Opt_unicode,
/* Mount options which take numeric value */
Opt_backupuid,
@@ -160,12 +189,16 @@ enum cifs_param {
Opt_iocharset,
Opt_netbiosname,
Opt_servern,
+ Opt_nbsessinit,
Opt_ver,
Opt_vers,
Opt_sec,
Opt_cache,
Opt_reparse,
Opt_upcalltarget,
+ Opt_nativesocket,
+ Opt_symlink,
+ Opt_symlinkroot,
/* Mount options to be ignored */
Opt_ignore,
@@ -199,6 +232,7 @@ struct smb3_fs_context {
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
+ int rfc1001_sessinit;
kuid_t cred_uid;
kuid_t linux_uid;
kgid_t linux_gid;
@@ -263,6 +297,9 @@ struct smb3_fs_context {
bool use_client_guid:1;
/* reuse existing guid for multichannel */
u8 client_guid[SMB2_CLIENT_GUID_SIZE];
+ /* User-specified original r/wsize value */
+ unsigned int vol_rsize;
+ unsigned int vol_wsize;
unsigned int bsize;
unsigned int rasize;
unsigned int rsize;
@@ -290,16 +327,38 @@ struct smb3_fs_context {
bool compress; /* enable SMB2 messages (READ/WRITE) de/compression */
bool rootfs:1; /* if it's a SMB root file system */
bool witness:1; /* use witness protocol */
+ int unicode;
char *leaf_fullpath;
struct cifs_ses *dfs_root_ses;
bool dfs_automount:1; /* set for dfs automount only */
enum cifs_reparse_type reparse_type;
+ enum cifs_symlink_type symlink_type;
+ bool nonativesocket:1;
bool dfs_conn:1; /* set for dfs mounts */
char *dns_dom;
+ char *symlinkroot; /* top level directory for native SMB symlinks in absolute format */
};
extern const struct fs_parameter_spec smb3_fs_parameters[];
+static inline enum cifs_symlink_type cifs_symlink_type(struct cifs_sb_info *cifs_sb)
+{
+ bool posix = cifs_sb_master_tcon(cifs_sb)->posix_extensions;
+
+ if (cifs_sb->ctx->symlink_type != CIFS_SYMLINK_TYPE_DEFAULT)
+ return cifs_sb->ctx->symlink_type;
+
+ if (cifs_sb->ctx->mfsymlinks)
+ return CIFS_SYMLINK_TYPE_MFSYMLINKS;
+ else if (cifs_sb->ctx->sfu_emul)
+ return CIFS_SYMLINK_TYPE_SFU;
+ else if (cifs_sb->ctx->linux_ext && !cifs_sb->ctx->no_linux_ext)
+ return posix ? CIFS_SYMLINK_TYPE_NATIVE : CIFS_SYMLINK_TYPE_UNIX;
+ else if (cifs_sb->ctx->reparse_type != CIFS_REPARSE_TYPE_NONE)
+ return CIFS_SYMLINK_TYPE_NATIVE;
+ return CIFS_SYMLINK_TYPE_NONE;
+}
+
extern int smb3_init_fs_context(struct fs_context *fc);
extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx);
extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx);
@@ -333,4 +392,36 @@ static inline void cifs_mount_unlock(void)
mutex_unlock(&cifs_mount_mutex);
}
+static inline void cifs_negotiate_rsize(struct TCP_Server_Info *server,
+ struct smb3_fs_context *ctx,
+ struct cifs_tcon *tcon)
+{
+ unsigned int size;
+
+ size = umax(server->ops->negotiate_rsize(tcon, ctx), PAGE_SIZE);
+ if (ctx->rsize)
+ size = umax(umin(ctx->rsize, size), PAGE_SIZE);
+ ctx->rsize = round_down(size, PAGE_SIZE);
+}
+
+static inline void cifs_negotiate_wsize(struct TCP_Server_Info *server,
+ struct smb3_fs_context *ctx,
+ struct cifs_tcon *tcon)
+{
+ unsigned int size;
+
+ size = umax(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE);
+ if (ctx->wsize)
+ size = umax(umin(ctx->wsize, size), PAGE_SIZE);
+ ctx->wsize = round_down(size, PAGE_SIZE);
+}
+
+static inline void cifs_negotiate_iosize(struct TCP_Server_Info *server,
+ struct smb3_fs_context *ctx,
+ struct cifs_tcon *tcon)
+{
+ cifs_negotiate_rsize(server, ctx, tcon);
+ cifs_negotiate_wsize(server, ctx, tcon);
+}
+
#endif
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 93e9188b2632..fe453a4b3dc8 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -990,7 +990,7 @@ cifs_get_file_info(struct file *filp)
/* TODO: add support to query reparse tag */
data.adjust_tz = false;
if (data.symlink_target) {
- data.symlink = true;
+ data.reparse_point = true;
data.reparse.tag = IO_REPARSE_TAG_SYMLINK;
}
path = build_path_from_dentry(dentry, page);
@@ -1203,18 +1203,45 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
goto out;
}
break;
- case IO_REPARSE_TAG_MOUNT_POINT:
- cifs_create_junction_fattr(fattr, sb);
- rc = 0;
- goto out;
default:
/* Check for cached reparse point data */
if (data->symlink_target || data->reparse.buf) {
rc = 0;
- } else if (iov && server->ops->parse_reparse_point) {
- rc = server->ops->parse_reparse_point(cifs_sb,
- full_path,
- iov, data);
+ } else if (iov && server->ops->get_reparse_point_buffer) {
+ struct reparse_data_buffer *reparse_buf;
+ u32 reparse_len;
+
+ reparse_buf = server->ops->get_reparse_point_buffer(iov, &reparse_len);
+ rc = parse_reparse_point(reparse_buf, reparse_len,
+ cifs_sb, full_path, data);
+ /*
+ * If the reparse point was not handled but it is the
+ * name surrogate which points to directory, then treat
+ * is as a new mount point. Name surrogate reparse point
+ * represents another named entity in the system.
+ */
+ if (rc == -EOPNOTSUPP &&
+ IS_REPARSE_TAG_NAME_SURROGATE(data->reparse.tag) &&
+ (le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY)) {
+ rc = 0;
+ cifs_create_junction_fattr(fattr, sb);
+ goto out;
+ }
+ /*
+ * If the reparse point is unsupported by the Linux SMB
+ * client then let it process by the SMB server. So mask
+ * the -EOPNOTSUPP error code. This will allow Linux SMB
+ * client to send SMB OPEN request to server. If server
+ * does not support this reparse point too then server
+ * will return error during open the path.
+ */
+ if (rc == -EOPNOTSUPP)
+ rc = 0;
+ }
+
+ if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) {
+ bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY;
+ rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb);
}
break;
}
@@ -1403,7 +1430,7 @@ int cifs_get_inode_info(struct inode **inode,
struct cifs_fattr fattr = {};
int rc;
- if (is_inode_cache_good(*inode)) {
+ if (!data && is_inode_cache_good(*inode)) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
return 0;
}
@@ -1502,7 +1529,7 @@ int smb311_posix_get_inode_info(struct inode **inode,
struct cifs_fattr fattr = {};
int rc;
- if (is_inode_cache_good(*inode)) {
+ if (!data && is_inode_cache_good(*inode)) {
cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
return 0;
}
@@ -1916,15 +1943,24 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ __u32 dosattr = 0, origattr = 0;
struct TCP_Server_Info *server;
struct iattr *attrs = NULL;
- __u32 dosattr = 0, origattr = 0;
+ bool rehash = false;
cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry);
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
+ /* Unhash dentry in advance to prevent any concurrent opens */
+ spin_lock(&dentry->d_lock);
+ if (!d_unhashed(dentry)) {
+ __d_drop(dentry);
+ rehash = true;
+ }
+ spin_unlock(&dentry->d_lock);
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -1976,7 +2012,8 @@ psx_del_no_retry:
cifs_drop_nlink(inode);
}
} else if (rc == -ENOENT) {
- d_drop(dentry);
+ if (simple_positive(dentry))
+ d_delete(dentry);
} else if (rc == -EBUSY) {
if (server->ops->rename_pending_delete) {
rc = server->ops->rename_pending_delete(full_path,
@@ -2029,6 +2066,8 @@ unlink_out:
kfree(attrs);
free_xid(xid);
cifs_put_tlink(tlink);
+ if (rehash)
+ d_rehash(dentry);
return rc;
}
@@ -2189,8 +2228,8 @@ posix_mkdir_get_info:
}
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
-int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
- struct dentry *direntry, umode_t mode)
+struct dentry *cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
+ struct dentry *direntry, umode_t mode)
{
int rc = 0;
unsigned int xid;
@@ -2206,10 +2245,10 @@ int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode,
cifs_sb = CIFS_SB(inode->i_sb);
if (unlikely(cifs_forced_shutdown(cifs_sb)))
- return -EIO;
+ return ERR_PTR(-EIO);
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
- return PTR_ERR(tlink);
+ return ERR_CAST(tlink);
tcon = tlink_tcon(tlink);
xid = get_xid();
@@ -2265,7 +2304,7 @@ mkdir_out:
free_dentry_path(page);
free_xid(xid);
cifs_put_tlink(tlink);
- return rc;
+ return ERR_PTR(rc);
}
int cifs_rmdir(struct inode *inode, struct dentry *direntry)
@@ -2435,6 +2474,7 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ bool rehash = false;
unsigned int xid;
int rc, tmprc;
int retry_count = 0;
@@ -2450,6 +2490,17 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
if (unlikely(cifs_forced_shutdown(cifs_sb)))
return -EIO;
+ /*
+ * Prevent any concurrent opens on the target by unhashing the dentry.
+ * VFS already unhashes the target when renaming directories.
+ */
+ if (d_is_positive(target_dentry) && !d_is_dir(target_dentry)) {
+ if (!d_unhashed(target_dentry)) {
+ d_drop(target_dentry);
+ rehash = true;
+ }
+ }
+
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -2491,6 +2542,8 @@ cifs_rename2(struct mnt_idmap *idmap, struct inode *source_dir,
}
}
+ if (!rc)
+ rehash = false;
/*
* No-replace is the natural behavior for CIFS, so skip unlink hacks.
*/
@@ -2549,12 +2602,16 @@ unlink_target:
goto cifs_rename_exit;
rc = cifs_do_rename(xid, source_dentry, from_name,
target_dentry, to_name);
+ if (!rc)
+ rehash = false;
}
/* force revalidate to go get info when needed */
CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
cifs_rename_exit:
+ if (rehash)
+ d_rehash(target_dentry);
kfree(info_buf_source);
free_dentry_path(page2);
free_dentry_path(page1);
@@ -2883,23 +2940,6 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start,
return -EOPNOTSUPP;
}
-int cifs_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE - 1);
- struct page *page;
- int rc = 0;
-
- page = grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
-
- zero_user_segment(page, offset, PAGE_SIZE);
- unlock_page(page);
- put_page(page);
- return rc;
-}
-
void cifs_setsize(struct inode *inode, loff_t offset)
{
struct cifsInodeInfo *cifs_i = CIFS_I(inode);
@@ -2994,8 +3034,6 @@ set_size_out:
*/
attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
-
- cifs_truncate_page(inode->i_mapping, inode->i_size);
}
return rc;
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 56439da4f119..0a9935ce05a5 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -506,7 +506,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
le16_to_cpu(tcon->ses->server->cipher_type);
pkey_inf.Suid = tcon->ses->Suid;
memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response,
- 16 /* SMB2_NTLMV2_SESSKEY_SIZE */);
+ SMB2_NTLMV2_SESSKEY_SIZE);
memcpy(pkey_inf.smb3decryptionkey,
tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
memcpy(pkey_inf.smb3encryptionkey,
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index 47ddeb7fa111..fe80e711cd75 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -18,6 +18,8 @@
#include "cifs_unicode.h"
#include "smb2proto.h"
#include "cifs_ioctl.h"
+#include "fs_context.h"
+#include "reparse.h"
/*
* M-F Symlink Functions - Begin
@@ -257,7 +259,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms = {0};
int buf_type = CIFS_NO_BUFFER;
- FILE_ALL_INFO file_info;
+ struct cifs_open_info_data query_data;
oparms = (struct cifs_open_parms) {
.tcon = tcon,
@@ -269,11 +271,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data);
if (rc)
return rc;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+ if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
rc = -ENOENT;
/* it's not a symlink */
goto out;
@@ -312,7 +314,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
.fid = &fid,
};
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
if (rc)
return rc;
@@ -569,7 +571,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
int rc = -EOPNOTSUPP;
unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
const char *full_path;
@@ -592,7 +593,6 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
goto symlink_exit;
}
pTcon = tlink_tcon(tlink);
- server = cifs_pick_channel(pTcon->ses);
full_path = build_path_from_dentry(direntry, page);
if (IS_ERR(full_path)) {
@@ -604,22 +604,45 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
cifs_dbg(FYI, "symname is %s\n", symname);
/* BB what if DFS and this volume is on different share? BB */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
- } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
- rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon,
- full_path, S_IFLNK, 0, symname);
+ rc = -EOPNOTSUPP;
+ switch (cifs_symlink_type(cifs_sb)) {
+ case CIFS_SYMLINK_TYPE_UNIX:
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- } else if (pTcon->unix_ext) {
- rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb));
+ if (pTcon->unix_ext) {
+ rc = CIFSUnixCreateSymLink(xid, pTcon, full_path,
+ symname,
+ cifs_sb->local_nls,
+ cifs_remap(cifs_sb));
+ }
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- } else if (server->ops->create_reparse_symlink) {
- rc = server->ops->create_reparse_symlink(xid, inode, direntry,
- pTcon, full_path,
- symname);
- goto symlink_exit;
+ break;
+
+ case CIFS_SYMLINK_TYPE_MFSYMLINKS:
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
+ rc = create_mf_symlink(xid, pTcon, cifs_sb,
+ full_path, symname);
+ }
+ break;
+
+ case CIFS_SYMLINK_TYPE_SFU:
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ rc = __cifs_sfu_make_node(xid, inode, direntry, pTcon,
+ full_path, S_IFLNK,
+ 0, symname);
+ }
+ break;
+
+ case CIFS_SYMLINK_TYPE_NATIVE:
+ case CIFS_SYMLINK_TYPE_NFS:
+ case CIFS_SYMLINK_TYPE_WSL:
+ if (CIFS_REPARSE_SUPPORT(pTcon)) {
+ rc = create_reparse_symlink(xid, inode, direntry, pTcon,
+ full_path, symname);
+ goto symlink_exit;
+ }
+ break;
+ default:
+ break;
}
if (rc == 0) {
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index b328dc5c7988..da23cc12a52c 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
spin_lock_init(&ret_buf->tc_lock);
INIT_LIST_HEAD(&ret_buf->openFileList);
INIT_LIST_HEAD(&ret_buf->tcon_list);
+ INIT_LIST_HEAD(&ret_buf->cifs_sb_list);
spin_lock_init(&ret_buf->open_file_lock);
spin_lock_init(&ret_buf->stat_lock);
+ spin_lock_init(&ret_buf->sb_list_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
ret_buf->stats_from_time = ktime_get_real_seconds();
@@ -149,6 +151,12 @@ tcon_info_alloc(bool dir_leases_enabled, enum smb3_tcon_ref_trace trace)
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
+ INIT_LIST_HEAD(&ret_buf->pending_opens);
+ INIT_DELAYED_WORK(&ret_buf->query_interfaces,
+ smb2_query_server_interfaces);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ INIT_DELAYED_WORK(&ret_buf->dfs_cache_work, dfs_cache_refresh);
+#endif
return ret_buf;
}
@@ -324,6 +332,14 @@ check_smb_hdr(struct smb_hdr *smb)
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
+ /*
+ * Windows NT server returns error resposne (e.g. STATUS_DELETE_PENDING
+ * or STATUS_OBJECT_NAME_NOT_FOUND or ERRDOS/ERRbadfile or any other)
+ * for some TRANS2 requests without the RESPONSE flag set in header.
+ */
+ if (smb->Command == SMB_COM_TRANSACTION2 && smb->Status.CifsError != 0)
+ return 0;
+
cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
get_mid(smb));
return 1;
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index e3f9213131c4..52a520349cb7 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -146,6 +146,9 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
}
spin_unlock(&tcon->tc_lock);
+ if (unlikely(!page))
+ return ERR_PTR(-ENOMEM);
+
s = dentry_path_raw(dentry, page, PATH_MAX);
if (IS_ERR(s))
return s;
@@ -283,7 +286,6 @@ struct vfsmount *cifs_d_automount(struct path *path)
return newmnt;
}
- mntget(newmnt); /* prevent immediate expiration */
mnt_set_expiry(newmnt, &cifs_automount_list);
schedule_delayed_work(&cifs_automount_task,
cifs_mountpoint_expiry_timeout);
diff --git a/fs/smb/client/netmisc.c b/fs/smb/client/netmisc.c
index 17b3e21ea868..9ec20601cee2 100644
--- a/fs/smb/client/netmisc.c
+++ b/fs/smb/client/netmisc.c
@@ -313,7 +313,6 @@ static const struct {
ERRDOS, 2215, NT_STATUS_NO_LOGON_SERVERS}, {
ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_LOGON_SESSION}, {
ERRHRD, ERRgeneral, NT_STATUS_NO_SUCH_PRIVILEGE}, {
- ERRDOS, ERRnoaccess, NT_STATUS_PRIVILEGE_NOT_HELD}, {
ERRHRD, ERRgeneral, NT_STATUS_INVALID_ACCOUNT_NAME}, {
ERRHRD, ERRgeneral, NT_STATUS_USER_EXISTS},
/* { This NT error code was 'sqashed'
@@ -871,6 +870,15 @@ map_smb_to_linux_error(char *buf, bool logErr)
}
/* else ERRHRD class errors or junk - return EIO */
+ /* special cases for NT status codes which cannot be translated to DOS codes */
+ if (smb->Flags2 & SMBFLG2_ERR_STATUS) {
+ __u32 err = le32_to_cpu(smb->Status.CifsError);
+ if (err == (NT_STATUS_NOT_A_REPARSE_POINT))
+ rc = -ENODATA;
+ else if (err == (NT_STATUS_PRIVILEGE_NOT_HELD))
+ rc = -EPERM;
+ }
+
cifs_dbg(FYI, "Mapping smb error code 0x%x to POSIX err %d\n",
le32_to_cpu(smb->Status.CifsError), rc);
diff --git a/fs/smb/client/nterr.c b/fs/smb/client/nterr.c
index d396a8e98a81..8f0bc441295e 100644
--- a/fs/smb/client/nterr.c
+++ b/fs/smb/client/nterr.c
@@ -674,6 +674,7 @@ const struct nt_err_code_struct nt_errs[] = {
{"NT_STATUS_QUOTA_LIST_INCONSISTENT",
NT_STATUS_QUOTA_LIST_INCONSISTENT},
{"NT_STATUS_FILE_IS_OFFLINE", NT_STATUS_FILE_IS_OFFLINE},
+ {"NT_STATUS_NOT_A_REPARSE_POINT", NT_STATUS_NOT_A_REPARSE_POINT},
{"NT_STATUS_NO_MORE_ENTRIES", NT_STATUS_NO_MORE_ENTRIES},
{"NT_STATUS_MORE_ENTRIES", NT_STATUS_MORE_ENTRIES},
{"NT_STATUS_SOME_UNMAPPED", NT_STATUS_SOME_UNMAPPED},
diff --git a/fs/smb/client/nterr.h b/fs/smb/client/nterr.h
index edd4741cab0a..180602c22355 100644
--- a/fs/smb/client/nterr.h
+++ b/fs/smb/client/nterr.h
@@ -546,6 +546,7 @@ extern const struct nt_err_code_struct nt_errs[];
#define NT_STATUS_TOO_MANY_LINKS 0xC0000000 | 0x0265
#define NT_STATUS_QUOTA_LIST_INCONSISTENT 0xC0000000 | 0x0266
#define NT_STATUS_FILE_IS_OFFLINE 0xC0000000 | 0x0267
+#define NT_STATUS_NOT_A_REPARSE_POINT 0xC0000000 | 0x0275
#define NT_STATUS_NO_SUCH_JOB 0xC0000000 | 0xEDE /* scheduler */
#endif /* _NTERR_H */
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index 50f96259d9ad..4e5460206397 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -9,6 +9,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/stat.h>
@@ -78,7 +79,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
- dentry = d_hash_and_lookup(parent, name);
+ dentry = try_lookup_noperm(name, parent);
if (!dentry) {
/*
* If we know that the inode will need to be revalidated
@@ -263,7 +264,7 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
/* The Mode field in the response can now include the file type as well */
fattr->cf_mode = wire_mode_to_posix(le32_to_cpu(info->Mode),
fattr->cf_cifsattrs & ATTR_DIRECTORY);
- fattr->cf_dtype = S_DT(le32_to_cpu(info->Mode));
+ fattr->cf_dtype = S_DT(fattr->cf_mode);
switch (fattr->cf_mode & S_IFMT) {
case S_IFLNK:
@@ -733,7 +734,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
else
cifs_buf_release(cfile->srch_inf.
ntwrk_buf_start);
+ /* Reset all pointers to the network buffer to prevent stale references */
cfile->srch_inf.ntwrk_buf_start = NULL;
+ cfile->srch_inf.srch_entries_start = NULL;
+ cfile->srch_inf.last_entry = NULL;
}
rc = initiate_cifs_search(xid, file, full_path);
if (rc) {
@@ -756,11 +760,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
rc = server->ops->query_dir_next(xid, tcon, &cfile->fid,
search_flags,
&cfile->srch_inf);
+ if (rc)
+ return -ENOENT;
/* FindFirst/Next set last_entry to NULL on malformed reply */
if (cfile->srch_inf.last_entry)
cifs_save_resume_key(cfile->srch_inf.last_entry, cfile);
- if (rc)
- return -ENOENT;
}
if (index_to_find < cfile->srch_inf.index_of_last_entry) {
/* we found the buffer that contains the entry */
@@ -847,9 +851,9 @@ static bool emit_cached_dirents(struct cached_dirents *cde,
}
static void update_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -858,9 +862,9 @@ static void update_cached_dirents_count(struct cached_dirents *cde,
}
static void finished_cached_dirents_count(struct cached_dirents *cde,
- struct dir_context *ctx)
+ struct dir_context *ctx, struct file *file)
{
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -873,11 +877,12 @@ static void finished_cached_dirents_count(struct cached_dirents *cde,
static void add_cached_dirent(struct cached_dirents *cde,
struct dir_context *ctx,
const char *name, int namelen,
- struct cifs_fattr *fattr)
+ struct cifs_fattr *fattr,
+ struct file *file)
{
struct cached_dirent *de;
- if (cde->ctx != ctx)
+ if (cde->file != file)
return;
if (cde->is_valid || cde->is_failed)
return;
@@ -907,7 +912,8 @@ static void add_cached_dirent(struct cached_dirents *cde,
static bool cifs_dir_emit(struct dir_context *ctx,
const char *name, int namelen,
struct cifs_fattr *fattr,
- struct cached_fid *cfid)
+ struct cached_fid *cfid,
+ struct file *file)
{
bool rc;
ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
@@ -919,7 +925,7 @@ static bool cifs_dir_emit(struct dir_context *ctx,
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
add_cached_dirent(&cfid->dirents, ctx, name, namelen,
- fattr);
+ fattr, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
@@ -1019,7 +1025,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
cifs_prime_dcache(file_dentry(file), &name, &fattr);
return !cifs_dir_emit(ctx, name.name, name.len,
- &fattr, cfid);
+ &fattr, cfid, file);
}
@@ -1070,8 +1076,8 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
* we need to initialize scanning and storing the
* directory content.
*/
- if (ctx->pos == 0 && cfid->dirents.ctx == NULL) {
- cfid->dirents.ctx = ctx;
+ if (ctx->pos == 0 && cfid->dirents.file == NULL) {
+ cfid->dirents.file = file;
cfid->dirents.pos = 2;
}
/*
@@ -1139,7 +1145,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
} else {
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- finished_cached_dirents_count(&cfid->dirents, ctx);
+ finished_cached_dirents_count(&cfid->dirents, ctx, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
cifs_dbg(FYI, "Could not find entry\n");
@@ -1180,7 +1186,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx)
ctx->pos++;
if (cfid) {
mutex_lock(&cfid->dirents.de_mutex);
- update_cached_dirents_count(&cfid->dirents, ctx);
+ update_cached_dirents_count(&cfid->dirents, file);
mutex_unlock(&cfid->dirents.de_mutex);
}
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
index d88b41133e00..7869cec58f52 100644
--- a/fs/smb/client/reparse.c
+++ b/fs/smb/client/reparse.c
@@ -14,47 +14,174 @@
#include "fs_context.h"
#include "reparse.h"
+static int mknod_nfs(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname);
+
+static int mknod_wsl(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname);
+
+static int create_native_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname);
+
static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
const unsigned int xid,
const char *full_path,
const char *symname,
bool *directory);
-int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+int create_reparse_symlink(const unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, const char *symname)
{
+ switch (cifs_symlink_type(CIFS_SB(inode->i_sb))) {
+ case CIFS_SYMLINK_TYPE_NATIVE:
+ return create_native_symlink(xid, inode, dentry, tcon, full_path, symname);
+ case CIFS_SYMLINK_TYPE_NFS:
+ return mknod_nfs(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname);
+ case CIFS_SYMLINK_TYPE_WSL:
+ return mknod_wsl(xid, inode, dentry, tcon, full_path, S_IFLNK, 0, symname);
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static int create_native_symlink(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, const char *symname)
+{
struct reparse_symlink_data_buffer *buf = NULL;
- struct cifs_open_info_data data;
+ struct cifs_open_info_data data = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ const char *symroot = cifs_sb->ctx->symlinkroot;
struct inode *new;
struct kvec iov;
- __le16 *path;
+ __le16 *path = NULL;
bool directory;
- char *sym, sep = CIFS_DIR_SEP(cifs_sb);
- u16 len, plen;
+ char *symlink_target = NULL;
+ char *sym = NULL;
+ char sep = CIFS_DIR_SEP(cifs_sb);
+ u16 len, plen, poff, slen;
int rc = 0;
if (strlen(symname) > REPARSE_SYM_PATH_MAX)
return -ENAMETOOLONG;
- sym = kstrdup(symname, GFP_KERNEL);
- if (!sym)
- return -ENOMEM;
+ symlink_target = kstrdup(symname, GFP_KERNEL);
+ if (!symlink_target) {
+ rc = -ENOMEM;
+ goto out;
+ }
data = (struct cifs_open_info_data) {
.reparse_point = true,
.reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
- .symlink_target = sym,
+ .symlink_target = symlink_target,
};
- convert_delimiter(sym, sep);
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) &&
+ symroot && symname[0] == '/') {
+ /*
+ * This is a request to create an absolute symlink on the server
+ * which does not support POSIX paths, and expects symlink in
+ * NT-style path. So convert absolute Linux symlink target path
+ * to the absolute NT-style path. Root of the NT-style path for
+ * symlinks is specified in "symlinkroot" mount option. This will
+ * ensure compatibility of this symlink stored in absolute form
+ * on the SMB server.
+ */
+ if (!strstarts(symname, symroot)) {
+ /*
+ * If the absolute Linux symlink target path is not
+ * inside "symlinkroot" location then there is no way
+ * to convert such Linux symlink to NT-style path.
+ */
+ cifs_dbg(VFS,
+ "absolute symlink '%s' cannot be converted to NT format "
+ "because it is outside of symlinkroot='%s'\n",
+ symname, symroot);
+ rc = -EINVAL;
+ goto out;
+ }
+ len = strlen(symroot);
+ if (symroot[len - 1] != '/')
+ len++;
+ if (symname[len] >= 'a' && symname[len] <= 'z' &&
+ (symname[len+1] == '/' || symname[len+1] == '\0')) {
+ /*
+ * Symlink points to Linux target /symlinkroot/x/path/...
+ * where 'x' is the lowercase local Windows drive.
+ * NT-style path for 'x' has common form \??\X:\path\...
+ * with uppercase local Windows drive.
+ */
+ int common_path_len = strlen(symname+len+1)+1;
+ sym = kzalloc(6+common_path_len, GFP_KERNEL);
+ if (!sym) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(sym, "\\??\\", 4);
+ sym[4] = symname[len] - ('a'-'A');
+ sym[5] = ':';
+ memcpy(sym+6, symname+len+1, common_path_len);
+ } else {
+ /* Unhandled absolute symlink. Report an error. */
+ cifs_dbg(
+ VFS,
+ "absolute symlink '%s' cannot be converted to NT format "
+ "because it points to unknown target\n",
+ symname);
+ rc = -EINVAL;
+ goto out;
+ }
+ } else {
+ /*
+ * This is request to either create an absolute symlink on
+ * server which expects POSIX paths or it is an request to
+ * create a relative symlink from the current directory.
+ * These paths have same format as relative SMB symlinks,
+ * so no conversion is needed. So just take symname as-is.
+ */
+ sym = kstrdup(symname, GFP_KERNEL);
+ if (!sym) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ }
+
+ if (sep == '\\')
+ convert_delimiter(sym, sep);
+
+ /*
+ * For absolute NT symlinks it is required to pass also leading
+ * backslash and to not mangle NT object prefix "\\??\\" and not to
+ * mangle colon in drive letter. But cifs_convert_path_to_utf16()
+ * removes leading backslash and replaces '?' and ':'. So temporary
+ * mask these characters in NT object prefix by '_' and then change
+ * them back.
+ */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/')
+ sym[0] = sym[1] = sym[2] = sym[5] = '_';
+
path = cifs_convert_path_to_utf16(sym, cifs_sb);
if (!path) {
rc = -ENOMEM;
goto out;
}
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') {
+ sym[0] = '\\';
+ sym[1] = sym[2] = '?';
+ sym[5] = ':';
+ path[0] = cpu_to_le16('\\');
+ path[1] = path[2] = cpu_to_le16('?');
+ path[5] = cpu_to_le16(':');
+ }
+
/*
* SMB distinguish between symlink to directory and symlink to file.
* They cannot be exchanged (symlink of file type which points to
@@ -67,8 +194,18 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
if (rc < 0)
goto out;
- plen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX);
- len = sizeof(*buf) + plen * 2;
+ slen = 2 * UniStrnlen((wchar_t *)path, REPARSE_SYM_PATH_MAX);
+ poff = 0;
+ plen = slen;
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) && symname[0] == '/') {
+ /*
+ * For absolute NT symlinks skip leading "\\??\\" in PrintName as
+ * PrintName is user visible location in DOS/Win32 format (not in NT format).
+ */
+ poff = 4;
+ plen -= 2 * poff;
+ }
+ len = sizeof(*buf) + plen + slen;
buf = kzalloc(len, GFP_KERNEL);
if (!buf) {
rc = -ENOMEM;
@@ -77,20 +214,21 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+
buf->SubstituteNameOffset = cpu_to_le16(plen);
- buf->SubstituteNameLength = cpu_to_le16(plen);
- memcpy(&buf->PathBuffer[plen], path, plen);
+ buf->SubstituteNameLength = cpu_to_le16(slen);
+ memcpy(&buf->PathBuffer[plen], path, slen);
+
buf->PrintNameOffset = 0;
buf->PrintNameLength = cpu_to_le16(plen);
- memcpy(buf->PathBuffer, path, plen);
+ memcpy(buf->PathBuffer, path+poff, plen);
+
buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
- if (*sym != sep)
- buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
- convert_delimiter(sym, '/');
iov.iov_base = buf;
iov.iov_len = len;
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb, xid,
tcon, full_path, directory,
&iov, NULL);
if (!IS_ERR(new))
@@ -98,6 +236,7 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
else
rc = PTR_ERR(new);
out:
+ kfree(sym);
kfree(path);
cifs_free_open_info(&data);
kfree(buf);
@@ -242,8 +381,40 @@ static int detect_directory_symlink_target(struct cifs_sb_info *cifs_sb,
return 0;
}
-static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+static int create_native_socket(const unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path)
+{
+ struct reparse_data_buffer buf = {
+ .ReparseTag = cpu_to_le32(IO_REPARSE_TAG_AF_UNIX),
+ .ReparseDataLength = cpu_to_le16(0),
+ };
+ struct cifs_open_info_data data = {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_AF_UNIX, .buf = &buf, },
+ };
+ struct kvec iov = {
+ .iov_base = &buf,
+ .iov_len = sizeof(buf),
+ };
+ struct inode *new;
+ int rc = 0;
+
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb, xid,
+ tcon, full_path, false, &iov, NULL);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+ cifs_free_open_info(&data);
+ return rc;
+}
+
+static int nfs_set_reparse_buf(struct reparse_nfs_data_buffer *buf,
mode_t mode, dev_t dev,
+ __le16 *symname_utf16,
+ int symname_utf16_len,
struct kvec *iov)
{
u64 type;
@@ -254,7 +425,13 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
switch ((type = reparse_mode_nfs_type(mode))) {
case NFS_SPECFILE_BLK:
case NFS_SPECFILE_CHR:
- dlen = sizeof(__le64);
+ dlen = 2 * sizeof(__le32);
+ ((__le32 *)buf->DataBuffer)[0] = cpu_to_le32(MAJOR(dev));
+ ((__le32 *)buf->DataBuffer)[1] = cpu_to_le32(MINOR(dev));
+ break;
+ case NFS_SPECFILE_LNK:
+ dlen = symname_utf16_len;
+ memcpy(buf->DataBuffer, symname_utf16, symname_utf16_len);
break;
case NFS_SPECFILE_FIFO:
case NFS_SPECFILE_SOCK:
@@ -269,8 +446,6 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
buf->InodeType = cpu_to_le64(type);
buf->ReparseDataLength = cpu_to_le16(len + dlen -
sizeof(struct reparse_data_buffer));
- *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MINOR(dev) << 32) |
- MAJOR(dev));
iov->iov_base = buf;
iov->iov_len = len + dlen;
return 0;
@@ -278,38 +453,74 @@ static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
static int mknod_nfs(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_open_info_data data;
- struct reparse_posix_data *p;
+ struct reparse_nfs_data_buffer *p = NULL;
+ __le16 *symname_utf16 = NULL;
+ int symname_utf16_len = 0;
struct inode *new;
struct kvec iov;
__u8 buf[sizeof(*p) + sizeof(__le64)];
int rc;
- p = (struct reparse_posix_data *)buf;
- rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+ if (S_ISLNK(mode)) {
+ symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname),
+ &symname_utf16_len,
+ cifs_sb->local_nls,
+ NO_MAP_UNI_RSVD);
+ if (!symname_utf16) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ symname_utf16_len -= 2; /* symlink is without trailing wide-nul */
+ p = kzalloc(sizeof(*p) + symname_utf16_len, GFP_KERNEL);
+ if (!p) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ } else {
+ p = (struct reparse_nfs_data_buffer *)buf;
+ }
+ rc = nfs_set_reparse_buf(p, mode, dev, symname_utf16, symname_utf16_len, &iov);
if (rc)
- return rc;
+ goto out;
data = (struct cifs_open_info_data) {
.reparse_point = true,
- .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+ .reparse = { .tag = IO_REPARSE_TAG_NFS, .buf = (struct reparse_data_buffer *)p, },
+ .symlink_target = kstrdup(symname, GFP_KERNEL),
};
- new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb, xid,
tcon, full_path, false, &iov, NULL);
if (!IS_ERR(new))
d_instantiate(dentry, new);
else
rc = PTR_ERR(new);
cifs_free_open_info(&data);
+out:
+ if (S_ISLNK(mode)) {
+ kfree(symname_utf16);
+ kfree(p);
+ }
return rc;
}
-static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
- mode_t mode, struct kvec *iov)
+static int wsl_set_reparse_buf(struct reparse_data_buffer **buf,
+ mode_t mode, const char *symname,
+ struct cifs_sb_info *cifs_sb,
+ struct kvec *iov)
{
+ struct reparse_wsl_symlink_data_buffer *symlink_buf;
+ __le16 *symname_utf16;
+ int symname_utf16_len;
+ int symname_utf8_maxlen;
+ int symname_utf8_len;
+ size_t buf_len;
u32 tag;
switch ((tag = reparse_mode_wsl_tag(mode))) {
@@ -317,16 +528,45 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
case IO_REPARSE_TAG_LX_CHR:
case IO_REPARSE_TAG_LX_FIFO:
case IO_REPARSE_TAG_AF_UNIX:
+ buf_len = sizeof(struct reparse_data_buffer);
+ *buf = kzalloc(buf_len, GFP_KERNEL);
+ if (!*buf)
+ return -ENOMEM;
+ break;
+ case IO_REPARSE_TAG_LX_SYMLINK:
+ symname_utf16 = cifs_strndup_to_utf16(symname, strlen(symname),
+ &symname_utf16_len,
+ cifs_sb->local_nls,
+ NO_MAP_UNI_RSVD);
+ if (!symname_utf16)
+ return -ENOMEM;
+ symname_utf8_maxlen = symname_utf16_len/2*3;
+ symlink_buf = kzalloc(sizeof(struct reparse_wsl_symlink_data_buffer) +
+ symname_utf8_maxlen, GFP_KERNEL);
+ if (!symlink_buf) {
+ kfree(symname_utf16);
+ return -ENOMEM;
+ }
+ /* Version field must be set to 2 (MS-FSCC 2.1.2.7) */
+ symlink_buf->Version = cpu_to_le32(2);
+ /* Target for Version 2 is in UTF-8 but without trailing null-term byte */
+ symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2,
+ UTF16_LITTLE_ENDIAN,
+ symlink_buf->Target,
+ symname_utf8_maxlen);
+ *buf = (struct reparse_data_buffer *)symlink_buf;
+ buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len;
+ kfree(symname_utf16);
break;
default:
return -EOPNOTSUPP;
}
- buf->ReparseTag = cpu_to_le32(tag);
- buf->Reserved = 0;
- buf->ReparseDataLength = 0;
- iov->iov_base = buf;
- iov->iov_len = sizeof(*buf);
+ (*buf)->ReparseTag = cpu_to_le32(tag);
+ (*buf)->Reserved = 0;
+ (*buf)->ReparseDataLength = cpu_to_le16(buf_len - sizeof(struct reparse_data_buffer));
+ iov->iov_base = *buf;
+ iov->iov_len = buf_len;
return 0;
}
@@ -415,27 +655,32 @@ static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
static int mknod_wsl(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, umode_t mode, dev_t dev)
+ const char *full_path, umode_t mode, dev_t dev,
+ const char *symname)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_open_info_data data;
- struct reparse_data_buffer buf;
+ struct reparse_data_buffer *buf;
struct smb2_create_ea_ctx *cc;
struct inode *new;
unsigned int len;
struct kvec reparse_iov, xattr_iov;
int rc;
- rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov);
+ rc = wsl_set_reparse_buf(&buf, mode, symname, cifs_sb, &reparse_iov);
if (rc)
return rc;
rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov);
- if (rc)
+ if (rc) {
+ kfree(buf);
return rc;
+ }
data = (struct cifs_open_info_data) {
.reparse_point = true,
- .reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, },
+ .reparse = { .tag = le32_to_cpu(buf->ReparseTag), .buf = buf, },
+ .symlink_target = kstrdup(symname, GFP_KERNEL),
};
cc = xattr_iov.iov_base;
@@ -443,7 +688,8 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
memcpy(data.wsl.eas, &cc->ea, len);
data.wsl.eas_len = len;
- new = smb2_get_reparse_inode(&data, inode->i_sb,
+ new = tcon->ses->server->ops->create_reparse_inode(
+ &data, inode->i_sb,
xid, tcon, full_path, false,
&reparse_iov, &xattr_iov);
if (!IS_ERR(new))
@@ -452,29 +698,31 @@ static int mknod_wsl(unsigned int xid, struct inode *inode,
rc = PTR_ERR(new);
cifs_free_open_info(&data);
kfree(xattr_iov.iov_base);
+ kfree(buf);
return rc;
}
-int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+int mknod_reparse(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev)
{
struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
- int rc = -EOPNOTSUPP;
+
+ if (S_ISSOCK(mode) && !ctx->nonativesocket && ctx->reparse_type != CIFS_REPARSE_TYPE_NONE)
+ return create_native_socket(xid, inode, dentry, tcon, full_path);
switch (ctx->reparse_type) {
case CIFS_REPARSE_TYPE_NFS:
- rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev);
- break;
+ return mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev, NULL);
case CIFS_REPARSE_TYPE_WSL:
- rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev);
- break;
+ return mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev, NULL);
+ default:
+ return -EOPNOTSUPP;
}
- return rc;
}
/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
-static int parse_reparse_posix(struct reparse_posix_data *buf,
+static int parse_reparse_nfs(struct reparse_nfs_data_buffer *buf,
struct cifs_sb_info *cifs_sb,
struct cifs_open_info_data *data)
{
@@ -536,43 +784,149 @@ static int parse_reparse_posix(struct reparse_posix_data *buf,
}
int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
- bool unicode, bool relative,
+ bool relative,
const char *full_path,
struct cifs_sb_info *cifs_sb)
{
+ const char *symroot = cifs_sb->ctx->symlinkroot;
char sep = CIFS_DIR_SEP(cifs_sb);
char *linux_target = NULL;
char *smb_target = NULL;
+ int symlinkroot_len;
+ int abs_path_len;
+ char *abs_path;
int levels;
int rc;
int i;
- /* Check that length it valid for unicode/non-unicode mode */
- if (!len || (unicode && (len % 2))) {
+ /* Check that length it valid */
+ if (!len || (len % 2)) {
cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
rc = -EIO;
goto out;
}
/*
- * Check that buffer does not contain UTF-16 null codepoint in unicode
- * mode or null byte in non-unicode mode because Linux cannot process
- * symlink with null byte.
+ * Check that buffer does not contain UTF-16 null codepoint
+ * because Linux cannot process symlink with null byte.
*/
- if ((unicode && UniStrnlen((wchar_t *)buf, len/2) != len/2) ||
- (!unicode && strnlen(buf, len) != len)) {
+ if (UniStrnlen((wchar_t *)buf, len/2) != len/2) {
cifs_dbg(VFS, "srv returned null byte in native symlink target location\n");
rc = -EIO;
goto out;
}
- smb_target = cifs_strndup_from_utf16(buf, len, unicode, cifs_sb->local_nls);
+ smb_target = cifs_strndup_from_utf16(buf, len, true, cifs_sb->local_nls);
if (!smb_target) {
rc = -ENOMEM;
goto out;
}
- if (smb_target[0] == sep && relative) {
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) &&
+ symroot && !relative) {
+ /*
+ * This is an absolute symlink from the server which does not
+ * support POSIX paths, so the symlink is in NT-style path.
+ * So convert it to absolute Linux symlink target path. Root of
+ * the NT-style path for symlinks is specified in "symlinkroot"
+ * mount option.
+ *
+ * Root of the DOS and Win32 paths is at NT path \??\
+ * It means that DOS/Win32 path C:\folder\file.txt is
+ * NT path \??\C:\folder\file.txt
+ *
+ * NT systems have some well-known object symlinks in their NT
+ * hierarchy, which is needed to take into account when resolving
+ * other symlinks. Most commonly used symlink paths are:
+ * \?? -> \GLOBAL??
+ * \DosDevices -> \??
+ * \GLOBAL??\GLOBALROOT -> \
+ * \GLOBAL??\Global -> \GLOBAL??
+ * \GLOBAL??\NUL -> \Device\Null
+ * \GLOBAL??\UNC -> \Device\Mup
+ * \GLOBAL??\PhysicalDrive0 -> \Device\Harddisk0\DR0 (for each harddisk)
+ * \GLOBAL??\A: -> \Device\Floppy0 (if A: is the first floppy)
+ * \GLOBAL??\C: -> \Device\HarddiskVolume1 (if C: is the first harddisk)
+ * \GLOBAL??\D: -> \Device\CdRom0 (if D: is first cdrom)
+ * \SystemRoot -> \Device\Harddisk0\Partition1\WINDOWS (or where is NT system installed)
+ * \Volume{...} -> \Device\HarddiskVolume1 (where ... is system generated guid)
+ *
+ * In most common cases, absolute NT symlinks points to path on
+ * DOS/Win32 drive letter, system-specific Volume or on UNC share.
+ * Here are few examples of commonly used absolute NT symlinks
+ * created by mklink.exe tool:
+ * \??\C:\folder\file.txt
+ * \??\\C:\folder\file.txt
+ * \??\UNC\server\share\file.txt
+ * \??\\UNC\server\share\file.txt
+ * \??\Volume{b75e2c83-0000-0000-0000-602f00000000}\folder\file.txt
+ *
+ * It means that the most common path prefix \??\ is also NT path
+ * symlink (to \GLOBAL??). It is less common that second path
+ * separator is double backslash, but it is valid.
+ *
+ * Volume guid is randomly generated by the target system and so
+ * only the target system knows the mapping between guid and the
+ * hardisk number. Over SMB it is not possible to resolve this
+ * mapping, therefore symlinks pointing to target location of
+ * volume guids are totally unusable over SMB.
+ *
+ * For now parse only symlink paths available for DOS and Win32.
+ * Those are paths with \??\ prefix or paths which points to \??\
+ * via other NT symlink (\DosDevices\, \GLOBAL??\, ...).
+ */
+ abs_path = smb_target;
+globalroot:
+ if (strstarts(abs_path, "\\??\\"))
+ abs_path += sizeof("\\??\\")-1;
+ else if (strstarts(abs_path, "\\DosDevices\\"))
+ abs_path += sizeof("\\DosDevices\\")-1;
+ else if (strstarts(abs_path, "\\GLOBAL??\\"))
+ abs_path += sizeof("\\GLOBAL??\\")-1;
+ else
+ goto out_unhandled_target;
+
+ /* Sometimes path separator after \?? is double backslash */
+ if (abs_path[0] == '\\')
+ abs_path++;
+
+ while (strstarts(abs_path, "Global\\"))
+ abs_path += sizeof("Global\\")-1;
+
+ if (strstarts(abs_path, "GLOBALROOT\\")) {
+ /* Label globalroot requires path with leading '\\', so do not trim '\\' */
+ abs_path += sizeof("GLOBALROOT")-1;
+ goto globalroot;
+ }
+
+ /* For now parse only paths to drive letters */
+ if (((abs_path[0] >= 'A' && abs_path[0] <= 'Z') ||
+ (abs_path[0] >= 'a' && abs_path[0] <= 'z')) &&
+ abs_path[1] == ':' &&
+ (abs_path[2] == '\\' || abs_path[2] == '\0')) {
+ /* Convert drive letter to lowercase and drop colon */
+ char drive_letter = abs_path[0];
+ if (drive_letter >= 'A' && drive_letter <= 'Z')
+ drive_letter += 'a'-'A';
+ abs_path++;
+ abs_path[0] = drive_letter;
+ } else {
+ goto out_unhandled_target;
+ }
+
+ abs_path_len = strlen(abs_path)+1;
+ symlinkroot_len = strlen(symroot);
+ if (symroot[symlinkroot_len - 1] == '/')
+ symlinkroot_len--;
+ linux_target = kmalloc(symlinkroot_len + 1 + abs_path_len, GFP_KERNEL);
+ if (!linux_target) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ memcpy(linux_target, symroot, symlinkroot_len);
+ linux_target[symlinkroot_len] = '/';
+ memcpy(linux_target + symlinkroot_len + 1, abs_path, abs_path_len);
+ } else if (smb_target[0] == sep && relative) {
/*
* This is a relative SMB symlink from the top of the share,
* which is the top level directory of the Linux mount point.
@@ -601,6 +955,13 @@ int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
}
memcpy(linux_target + levels*3, smb_target+1, smb_target_len); /* +1 to skip leading sep */
} else {
+ /*
+ * This is either an absolute symlink in POSIX-style format
+ * or relative SMB symlink from the current directory.
+ * These paths have same format as Linux symlinks, so no
+ * conversion is needed.
+ */
+out_unhandled_target:
linux_target = smb_target;
smb_target = NULL;
}
@@ -620,8 +981,8 @@ out:
return rc;
}
-static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
- u32 plen, bool unicode,
+static int parse_reparse_native_symlink(struct reparse_symlink_data_buffer *sym,
+ u32 plen,
struct cifs_sb_info *cifs_sb,
const char *full_path,
struct cifs_open_info_data *data)
@@ -641,7 +1002,6 @@ static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
return smb2_parse_native_symlink(&data->symlink_target,
sym->PathBuffer + offs,
len,
- unicode,
le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE,
full_path,
cifs_sb);
@@ -652,29 +1012,36 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf
struct cifs_open_info_data *data)
{
int len = le16_to_cpu(buf->ReparseDataLength);
+ int data_offset = offsetof(typeof(*buf), Target) - offsetof(typeof(*buf), Version);
int symname_utf8_len;
__le16 *symname_utf16;
int symname_utf16_len;
- if (len <= sizeof(buf->Flags)) {
+ if (len <= data_offset) {
cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n");
return -EIO;
}
- /* PathBuffer is in UTF-8 but without trailing null-term byte */
- symname_utf8_len = len - sizeof(buf->Flags);
+ /* MS-FSCC 2.1.2.7 defines layout of the Target field only for Version 2. */
+ if (le32_to_cpu(buf->Version) != 2) {
+ cifs_dbg(VFS, "srv returned unsupported wsl symlink version %u\n", le32_to_cpu(buf->Version));
+ return -EIO;
+ }
+
+ /* Target for Version 2 is in UTF-8 but without trailing null-term byte */
+ symname_utf8_len = len - data_offset;
/*
* Check that buffer does not contain null byte
* because Linux cannot process symlink with null byte.
*/
- if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) {
+ if (strnlen(buf->Target, symname_utf8_len) != symname_utf8_len) {
cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n");
return -EIO;
}
symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL);
if (!symname_utf16)
return -ENOMEM;
- symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len,
+ symname_utf16_len = utf8s_to_utf16s(buf->Target, symname_utf8_len,
UTF16_LITTLE_ENDIAN,
(wchar_t *) symname_utf16, symname_utf8_len * 2);
if (symname_utf16_len < 0) {
@@ -696,21 +1063,19 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf
int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
const char *full_path,
- bool unicode, struct cifs_open_info_data *data)
+ struct cifs_open_info_data *data)
{
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
-
data->reparse.buf = buf;
/* See MS-FSCC 2.1.2 */
switch (le32_to_cpu(buf->ReparseTag)) {
case IO_REPARSE_TAG_NFS:
- return parse_reparse_posix((struct reparse_posix_data *)buf,
+ return parse_reparse_nfs((struct reparse_nfs_data_buffer *)buf,
cifs_sb, data);
case IO_REPARSE_TAG_SYMLINK:
- return parse_reparse_symlink(
+ return parse_reparse_native_symlink(
(struct reparse_symlink_data_buffer *)buf,
- plen, unicode, cifs_sb, full_path, data);
+ plen, cifs_sb, full_path, data);
case IO_REPARSE_TAG_LX_SYMLINK:
return parse_reparse_wsl_symlink(
(struct reparse_wsl_symlink_data_buffer *)buf,
@@ -724,34 +1089,27 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
le32_to_cpu(buf->ReparseTag));
return -EIO;
}
- break;
+ return 0;
default:
- cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n",
- le32_to_cpu(buf->ReparseTag));
- break;
+ return -EOPNOTSUPP;
}
- return 0;
}
-int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
+struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov,
+ u32 *plen)
{
- struct reparse_data_buffer *buf;
struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
- u32 plen = le32_to_cpu(io->OutputCount);
-
- buf = (struct reparse_data_buffer *)((u8 *)io +
- le32_to_cpu(io->OutputOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data);
+ *plen = le32_to_cpu(io->OutputCount);
+ return (struct reparse_data_buffer *)((u8 *)io +
+ le32_to_cpu(io->OutputOffset));
}
-static void wsl_to_fattr(struct cifs_open_info_data *data,
+static bool wsl_to_fattr(struct cifs_open_info_data *data,
struct cifs_sb_info *cifs_sb,
u32 tag, struct cifs_fattr *fattr)
{
struct smb2_file_full_ea_info *ea;
+ bool have_xattr_dev = false;
u32 next = 0;
switch (tag) {
@@ -794,21 +1152,30 @@ static void wsl_to_fattr(struct cifs_open_info_data *data,
fattr->cf_uid = wsl_make_kuid(cifs_sb, v);
else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen))
fattr->cf_gid = wsl_make_kgid(cifs_sb, v);
- else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen))
+ else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen)) {
+ /* File type in reparse point tag and in xattr mode must match. */
+ if (S_DT(fattr->cf_mode) != S_DT(le32_to_cpu(*(__le32 *)v)))
+ return false;
fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v);
- else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen))
+ } else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen)) {
fattr->cf_rdev = reparse_mkdev(v);
+ have_xattr_dev = true;
+ }
} while (next);
out:
- fattr->cf_dtype = S_DT(fattr->cf_mode);
+
+ /* Major and minor numbers for char and block devices are mandatory. */
+ if (!have_xattr_dev && (tag == IO_REPARSE_TAG_LX_CHR || tag == IO_REPARSE_TAG_LX_BLK))
+ return false;
+
+ return true;
}
static bool posix_reparse_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
struct cifs_open_info_data *data)
{
- struct reparse_posix_data *buf = data->reparse.posix;
-
+ struct reparse_nfs_data_buffer *buf = (struct reparse_nfs_data_buffer *)data->reparse.buf;
if (buf == NULL)
return true;
@@ -859,22 +1226,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
bool ok;
switch (tag) {
- case IO_REPARSE_TAG_INTERNAL:
- if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY))
- return false;
- fallthrough;
- case IO_REPARSE_TAG_DFS:
- case IO_REPARSE_TAG_DFSR:
- case IO_REPARSE_TAG_MOUNT_POINT:
- /* See cifs_create_junction_fattr() */
- fattr->cf_mode = S_IFDIR | 0711;
- break;
case IO_REPARSE_TAG_LX_SYMLINK:
case IO_REPARSE_TAG_LX_FIFO:
case IO_REPARSE_TAG_AF_UNIX:
case IO_REPARSE_TAG_LX_CHR:
case IO_REPARSE_TAG_LX_BLK:
- wsl_to_fattr(data, cifs_sb, tag, fattr);
+ ok = wsl_to_fattr(data, cifs_sb, tag, fattr);
+ if (!ok)
+ return false;
break;
case IO_REPARSE_TAG_NFS:
ok = posix_reparse_to_fattr(cifs_sb, fattr, data);
@@ -886,7 +1245,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
fattr->cf_mode |= S_IFLNK;
break;
default:
- return false;
+ if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY))
+ return false;
+ if (!IS_REPARSE_TAG_NAME_SURROGATE(tag) &&
+ tag != IO_REPARSE_TAG_INTERNAL)
+ return false;
+ /* See cifs_create_junction_fattr() */
+ fattr->cf_mode = S_IFDIR | 0711;
+ break;
}
fattr->cf_dtype = S_DT(fattr->cf_mode);
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
index ff05b0e75c92..66269c10beba 100644
--- a/fs/smb/client/reparse.h
+++ b/fs/smb/client/reparse.h
@@ -50,6 +50,7 @@ static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb,
static inline u64 reparse_mode_nfs_type(mode_t mode)
{
switch (mode & S_IFMT) {
+ case S_IFLNK: return NFS_SPECFILE_LNK;
case S_IFBLK: return NFS_SPECFILE_BLK;
case S_IFCHR: return NFS_SPECFILE_CHR;
case S_IFIFO: return NFS_SPECFILE_FIFO;
@@ -61,6 +62,7 @@ static inline u64 reparse_mode_nfs_type(mode_t mode)
static inline u32 reparse_mode_wsl_tag(mode_t mode)
{
switch (mode & S_IFMT) {
+ case S_IFLNK: return IO_REPARSE_TAG_LX_SYMLINK;
case S_IFBLK: return IO_REPARSE_TAG_LX_BLK;
case S_IFCHR: return IO_REPARSE_TAG_LX_CHR;
case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO;
@@ -97,29 +99,42 @@ static inline bool reparse_inode_match(struct inode *inode,
static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
{
- struct smb2_file_all_info *fi = &data->fi;
- u32 attrs = le32_to_cpu(fi->Attributes);
+ u32 attrs;
bool ret;
- ret = data->reparse_point || (attrs & ATTR_REPARSE);
- if (ret)
- attrs |= ATTR_REPARSE;
- fi->Attributes = cpu_to_le32(attrs);
+ if (data->contains_posix_file_info) {
+ struct smb311_posix_qinfo *fi = &data->posix_fi;
+
+ attrs = le32_to_cpu(fi->DosAttributes);
+ if (data->reparse_point) {
+ attrs |= ATTR_REPARSE;
+ fi->DosAttributes = cpu_to_le32(attrs);
+ }
+
+ } else {
+ struct smb2_file_all_info *fi = &data->fi;
+
+ attrs = le32_to_cpu(fi->Attributes);
+ if (data->reparse_point) {
+ attrs |= ATTR_REPARSE;
+ fi->Attributes = cpu_to_le32(attrs);
+ }
+ }
+
+ ret = attrs & ATTR_REPARSE;
+
return ret;
}
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
struct cifs_open_info_data *data);
-int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+int create_reparse_symlink(const unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, const char *symname);
-int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+int mknod_reparse(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
-int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data);
+struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len);
#endif /* _CIFS_REPARSE_H */
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 91d4d409cb1d..0a8c2fcc9ded 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -242,7 +242,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
iface->num_channels++;
iface->weight_fulfilled++;
- cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n",
+ cifs_info("successfully opened new channel on iface:%pIS\n",
&iface->sockaddr);
break;
}
@@ -332,6 +332,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
struct cifs_server_iface *old_iface = NULL;
struct cifs_server_iface *last_iface = NULL;
struct sockaddr_storage ss;
+ int retry = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
@@ -360,6 +361,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
return;
}
+try_again:
last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
iface_head);
iface_min_speed = last_iface->speed;
@@ -397,6 +399,13 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
+ list_for_each_entry(iface, &ses->iface_list, iface_head)
+ iface->weight_fulfilled = 0;
+
+ /* see if it can be satisfied in second attempt */
+ if (!retry++)
+ goto try_again;
+
iface = NULL;
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
@@ -445,6 +454,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
ses->chans[chan_index].iface = iface;
spin_unlock(&ses->chan_lock);
+
+ spin_lock(&server->srv_lock);
+ memcpy(&server->dstaddr, &iface->sockaddr, sizeof(server->dstaddr));
+ spin_unlock(&server->srv_lock);
}
static int
@@ -494,13 +507,13 @@ cifs_ses_add_channel(struct cifs_ses *ses,
ctx->domainauto = ses->domainAuto;
ctx->domainname = ses->domainName;
- /* no hostname for extra channels */
- ctx->server_hostname = "";
+ ctx->server_hostname = ses->server->hostname;
ctx->username = ses->user_name;
ctx->password = ses->password;
ctx->sectype = ses->sectype;
ctx->sign = ses->sign;
+ ctx->unicode = ses->unicode;
/* UNC and paths */
/* XXX: Use ses->server->hostname? */
@@ -522,6 +535,13 @@ cifs_ses_add_channel(struct cifs_ses *ses,
ctx->sockopt_tcp_nodelay = ses->server->tcp_nodelay;
ctx->echo_interval = ses->server->echo_interval / HZ;
ctx->max_credits = ses->server->max_credits;
+ ctx->min_offload = ses->server->min_offload;
+ ctx->compress = ses->server->compression.requested;
+ ctx->dfs_conn = ses->server->dfs_conn;
+ ctx->ignore_signature = ses->server->ignore_signature;
+ ctx->leaf_fullpath = ses->server->leaf_fullpath;
+ ctx->rootfs = ses->server->noblockcnt;
+ ctx->retrans = ses->server->retrans;
/*
* This will be used for encoding/decoding user/domain/pw
@@ -620,6 +640,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses,
USHRT_MAX));
pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq);
pSMB->req.VcNumber = cpu_to_le16(1);
+ pSMB->req.SessionKey = server->session_key_id;
/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
@@ -672,6 +693,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
*pbcc_area = bcc_ptr;
}
+static void
+ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
+{
+ char *bcc_ptr = *pbcc_area;
+
+ strcpy(bcc_ptr, "Linux version ");
+ bcc_ptr += strlen("Linux version ");
+ strcpy(bcc_ptr, init_utsname()->release);
+ bcc_ptr += strlen(init_utsname()->release) + 1;
+
+ strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
+ bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+
+ *pbcc_area = bcc_ptr;
+}
+
static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -696,6 +733,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
*pbcc_area = bcc_ptr;
}
+static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses,
+ const struct nls_table *nls_cp)
+{
+ char *bcc_ptr = *pbcc_area;
+ int len;
+
+ /* copy domain */
+ if (ses->domainName != NULL) {
+ len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
+ if (WARN_ON_ONCE(len < 0))
+ len = CIFS_MAX_DOMAINNAME_LEN - 1;
+ bcc_ptr += len;
+ } /* else we send a null domain name so server will default to its own domain */
+ *bcc_ptr = 0;
+ bcc_ptr++;
+
+ *pbcc_area = bcc_ptr;
+}
+
static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
const struct nls_table *nls_cp)
{
@@ -741,25 +797,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
*bcc_ptr = 0;
bcc_ptr++; /* account for null termination */
- /* copy domain */
- if (ses->domainName != NULL) {
- len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN);
- if (WARN_ON_ONCE(len < 0))
- len = CIFS_MAX_DOMAINNAME_LEN - 1;
- bcc_ptr += len;
- } /* else we send a null domain name so server will default to its own domain */
- *bcc_ptr = 0;
- bcc_ptr++;
-
/* BB check for overflow here */
- strcpy(bcc_ptr, "Linux version ");
- bcc_ptr += strlen("Linux version ");
- strcpy(bcc_ptr, init_utsname()->release);
- bcc_ptr += strlen(init_utsname()->release) + 1;
-
- strcpy(bcc_ptr, CIFS_NETWORK_OPSYS);
- bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1;
+ ascii_domain_string(&bcc_ptr, ses, nls_cp);
+ ascii_oslm_strings(&bcc_ptr, nls_cp);
*pbcc_area = bcc_ptr;
}
@@ -1235,12 +1276,13 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
switch (requested) {
case Kerberos:
case RawNTLMSSP:
+ case IAKerb:
return requested;
case Unspecified:
if (server->sec_ntlmssp &&
(global_secflags & CIFSSEC_MAY_NTLMSSP))
return RawNTLMSSP;
- if ((server->sec_kerberos || server->sec_mskerberos) &&
+ if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
fallthrough;
@@ -1561,7 +1603,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
sess_data->iov[1].iov_len = msg->secblob_len;
pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
- if (ses->capabilities & CAP_UNICODE) {
+ if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) {
/* unicode strings must be word aligned */
if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
*bcc_ptr = 0;
@@ -1570,8 +1612,8 @@ sess_auth_kerberos(struct sess_data *sess_data)
unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
} else {
- /* BB: is this right? */
- ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
+ ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
}
sess_data->iov[2].iov_len = (long) bcc_ptr -
@@ -1655,22 +1697,22 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
capabilities = cifs_ssetup_hdr(ses, server, pSMB);
- if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
- cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
- return -ENOSYS;
- }
-
pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
capabilities |= CAP_EXTENDED_SECURITY;
pSMB->req.Capabilities |= cpu_to_le32(capabilities);
bcc_ptr = sess_data->iov[2].iov_base;
- /* unicode strings must be word aligned */
- if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
- *bcc_ptr = 0;
- bcc_ptr++;
+
+ if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) {
+ /* unicode strings must be word aligned */
+ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) {
+ *bcc_ptr = 0;
+ bcc_ptr++;
+ }
+ unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
+ } else {
+ ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp);
}
- unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
sess_data->iov[2].iov_len = (long) bcc_ptr -
(long) sess_data->iov[2].iov_base;
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index 749a83cd0deb..893a1ea8c000 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -14,6 +14,9 @@
#include "cifspdu.h"
#include "cifs_unicode.h"
#include "fs_context.h"
+#include "nterr.h"
+#include "smberr.h"
+#include "reparse.h"
/*
* An NT cancel request header looks just like the original request except:
@@ -92,17 +95,17 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
struct smb_hdr *buf = (struct smb_hdr *)buffer;
struct mid_q_entry *mid;
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if (compare_mid(mid->mid, buf) &&
mid->mid_state == MID_REQUEST_SUBMITTED &&
le16_to_cpu(mid->command) == buf->Command) {
kref_get(&mid->refcount);
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
return mid;
}
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
return NULL;
}
@@ -166,10 +169,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
__u16 last_mid, cur_mid;
bool collision, reconnect = false;
- spin_lock(&server->mid_lock);
-
+ spin_lock(&server->mid_counter_lock);
/* mid is 16 bit only for CIFS/SMB */
- cur_mid = (__u16)((server->CurrentMid) & 0xffff);
+ cur_mid = (__u16)((server->current_mid) & 0xffff);
/* we do not want to loop forever */
last_mid = cur_mid;
cur_mid++;
@@ -195,6 +197,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
cur_mid++;
num_mids = 0;
+ spin_lock(&server->mid_queue_lock);
list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
++num_mids;
if (mid_entry->mid == cur_mid &&
@@ -204,6 +207,7 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
break;
}
}
+ spin_unlock(&server->mid_queue_lock);
/*
* if we have more than 32k mids in the list, then something
@@ -220,12 +224,12 @@ cifs_get_next_mid(struct TCP_Server_Info *server)
if (!collision) {
mid = (__u64)cur_mid;
- server->CurrentMid = mid;
+ server->current_mid = mid;
break;
}
cur_mid++;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_counter_lock);
if (reconnect) {
cifs_signal_cifsd_for_reconnect(server, false);
@@ -377,7 +381,7 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
static void
cifs_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
cifs_set_oplock_level(cinode, oplock);
}
@@ -426,26 +430,19 @@ cifs_negotiate(const unsigned int xid,
{
int rc;
rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN) {
- /* retry only once on 1st time connection */
- set_credits(server, 1);
- rc = CIFSSMBNegotiate(xid, ses, server);
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
- }
return rc;
}
static unsigned int
-cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
+smb1_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
unsigned int wsize;
/* start with specified wsize, or default */
- if (ctx->wsize)
- wsize = ctx->wsize;
+ if (ctx->got_wsize)
+ wsize = ctx->vol_wsize;
else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
wsize = CIFS_DEFAULT_IOSIZE;
else
@@ -472,7 +469,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
}
static unsigned int
-cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
+smb1_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
struct TCP_Server_Info *server = tcon->ses->server;
@@ -497,7 +494,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
else
defsize = server->maxBuf - sizeof(READ_RSP);
- rsize = ctx->rsize ? ctx->rsize : defsize;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : defsize;
/*
* no CAP_LARGE_READ_X? Then MS-CIFS states that we must limit this to
@@ -548,54 +545,146 @@ static int cifs_query_path_info(const unsigned int xid,
const char *full_path,
struct cifs_open_info_data *data)
{
- int rc;
+ int rc = -EOPNOTSUPP;
FILE_ALL_INFO fi = {};
+ struct cifs_search_info search_info = {};
+ bool non_unicode_wildcard = false;
- data->symlink = false;
+ data->reparse_point = false;
data->adjust_tz = false;
- /* could do find first instead but this returns more info */
- rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
/*
- * BB optimize code so we do not make the above call when server claims
- * no NT SMB support and the above call failed at least once - set flag
- * in tcon or mount.
+ * First try CIFSSMBQPathInfo() function which returns more info
+ * (NumberOfLinks) than CIFSFindFirst() fallback function.
+ * Some servers like Win9x do not support SMB_QUERY_FILE_ALL_INFO over
+ * TRANS2_QUERY_PATH_INFORMATION, but supports it with filehandle over
+ * TRANS2_QUERY_FILE_INFORMATION (function CIFSSMBQFileInfo(). But SMB
+ * Open command on non-NT servers works only for files, does not work
+ * for directories. And moreover Win9x SMB server returns bogus data in
+ * SMB_QUERY_FILE_ALL_INFO Attributes field. So for non-NT servers,
+ * do not even use CIFSSMBQPathInfo() or CIFSSMBQFileInfo() function.
+ */
+ if (tcon->ses->capabilities & CAP_NT_SMBS)
+ rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */,
+ cifs_sb->local_nls, cifs_remap(cifs_sb));
+
+ /*
+ * Non-UNICODE variant of fallback functions below expands wildcards,
+ * so they cannot be used for querying paths with wildcard characters.
+ */
+ if (rc && !(tcon->ses->capabilities & CAP_UNICODE) && strpbrk(full_path, "*?\"><"))
+ non_unicode_wildcard = true;
+
+ /*
+ * Then fallback to CIFSFindFirst() which works also with non-NT servers
+ * but does not does not provide NumberOfLinks.
*/
- if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
+ if ((rc == -EOPNOTSUPP || rc == -EINVAL) &&
+ !non_unicode_wildcard) {
+ if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find))
+ search_info.info_level = SMB_FIND_FILE_INFO_STANDARD;
+ else
+ search_info.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO;
+ rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb, NULL,
+ CIFS_SEARCH_CLOSE_ALWAYS | CIFS_SEARCH_CLOSE_AT_END,
+ &search_info, false);
+ if (rc == 0) {
+ if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) {
+ FIND_FILE_STANDARD_INFO *di;
+ int offset = tcon->ses->server->timeAdj;
+
+ di = (FIND_FILE_STANDARD_INFO *)search_info.srch_entries_start;
+ fi.CreationTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(
+ di->CreationDate, di->CreationTime, offset)));
+ fi.LastAccessTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(
+ di->LastAccessDate, di->LastAccessTime, offset)));
+ fi.LastWriteTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(
+ di->LastWriteDate, di->LastWriteTime, offset)));
+ fi.ChangeTime = fi.LastWriteTime;
+ fi.Attributes = cpu_to_le32(le16_to_cpu(di->Attributes));
+ fi.AllocationSize = cpu_to_le64(le32_to_cpu(di->AllocationSize));
+ fi.EndOfFile = cpu_to_le64(le32_to_cpu(di->DataSize));
+ } else {
+ FILE_FULL_DIRECTORY_INFO *di;
+
+ di = (FILE_FULL_DIRECTORY_INFO *)search_info.srch_entries_start;
+ fi.CreationTime = di->CreationTime;
+ fi.LastAccessTime = di->LastAccessTime;
+ fi.LastWriteTime = di->LastWriteTime;
+ fi.ChangeTime = di->ChangeTime;
+ fi.Attributes = di->ExtFileAttributes;
+ fi.AllocationSize = di->AllocationSize;
+ fi.EndOfFile = di->EndOfFile;
+ fi.EASize = di->EaSize;
+ }
+ fi.NumberOfLinks = cpu_to_le32(1);
+ fi.DeletePending = 0;
+ fi.Directory = !!(le32_to_cpu(fi.Attributes) & ATTR_DIRECTORY);
+ cifs_buf_release(search_info.ntwrk_buf_start);
+ } else if (!full_path[0]) {
+ /*
+ * CIFSFindFirst() does not work on root path if the
+ * root path was exported on the server from the top
+ * level path (drive letter).
+ */
+ rc = -EOPNOTSUPP;
+ }
+ }
+
+ /*
+ * If everything failed then fallback to the legacy SMB command
+ * SMB_COM_QUERY_INFORMATION which works with all servers, but
+ * provide just few information.
+ */
+ if ((rc == -EOPNOTSUPP || rc == -EINVAL) && !non_unicode_wildcard) {
rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls,
cifs_remap(cifs_sb));
data->adjust_tz = true;
+ } else if ((rc == -EOPNOTSUPP || rc == -EINVAL) && non_unicode_wildcard) {
+ /* Path with non-UNICODE wildcard character cannot exist. */
+ rc = -ENOENT;
}
if (!rc) {
- int tmprc;
- int oplock = 0;
- struct cifs_fid fid;
- struct cifs_open_parms oparms;
-
move_cifs_info_to_smb2(&data->fi, &fi);
+ data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE;
+ }
- if (!(le32_to_cpu(fi.Attributes) & ATTR_REPARSE))
- return 0;
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .cifs_sb = cifs_sb,
- .desired_access = FILE_READ_ATTRIBUTES,
- .create_options = cifs_create_options(cifs_sb, 0),
- .disposition = FILE_OPEN,
- .path = full_path,
- .fid = &fid,
- };
-
- /* Need to check if this is a symbolic link or not */
- tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
- if (tmprc == -EOPNOTSUPP)
- data->symlink = true;
- else if (tmprc == 0)
- CIFSSMBClose(xid, tcon, fid.netfid);
+#ifdef CONFIG_CIFS_XATTR
+ /*
+ * For WSL CHR and BLK reparse points it is required to fetch
+ * EA $LXDEV which contains major and minor device numbers.
+ */
+ if (!rc && data->reparse_point) {
+ struct smb2_file_full_ea_info *ea;
+
+ ea = (struct smb2_file_full_ea_info *)data->wsl.eas;
+ rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV,
+ &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1],
+ SMB2_WSL_XATTR_DEV_SIZE, cifs_sb);
+ if (rc == SMB2_WSL_XATTR_DEV_SIZE) {
+ ea->next_entry_offset = cpu_to_le32(0);
+ ea->flags = 0;
+ ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN;
+ ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE);
+ memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1);
+ data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 +
+ SMB2_WSL_XATTR_DEV_SIZE;
+ rc = 0;
+ } else if (rc >= 0) {
+ /* It is an error if EA $LXDEV has wrong size. */
+ rc = -EINVAL;
+ } else {
+ /*
+ * In all other cases ignore error if fetching
+ * of EA $LXDEV failed. It is needed only for
+ * WSL CHR and BLK reparse points and wsl_to_fattr()
+ * handle the case when EA is missing.
+ */
+ rc = 0;
+ }
}
+#endif
return rc;
}
@@ -632,6 +721,13 @@ static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
FILE_ALL_INFO fi = {};
+ /*
+ * CIFSSMBQFileInfo() for non-NT servers returns bogus data in
+ * Attributes fields. So do not use this command for non-NT servers.
+ */
+ if (!(tcon->ses->capabilities & CAP_NT_SMBS))
+ return -EOPNOTSUPP;
+
if (cfile->symlink_target) {
data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
if (!data->symlink_target)
@@ -802,6 +898,9 @@ smb_set_file_info(struct inode *inode, const char *full_path,
struct cifs_fid fid;
struct cifs_open_parms oparms;
struct cifsFileInfo *open_file;
+ FILE_BASIC_INFO new_buf;
+ struct cifs_open_info_data query_data;
+ __le64 write_time = buf->LastWriteTime;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = NULL;
@@ -809,20 +908,58 @@ smb_set_file_info(struct inode *inode, const char *full_path,
/* if the file is already open for write, just use that fileid */
open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY);
+
if (open_file) {
fid.netfid = open_file->fid.netfid;
netpid = open_file->pid;
tcon = tlink_tcon(open_file->tlink);
- goto set_via_filehandle;
+ } else {
+ tlink = cifs_sb_tlink(cifs_sb);
+ if (IS_ERR(tlink)) {
+ rc = PTR_ERR(tlink);
+ tlink = NULL;
+ goto out;
+ }
+ tcon = tlink_tcon(tlink);
}
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink)) {
- rc = PTR_ERR(tlink);
- tlink = NULL;
- goto out;
+ /*
+ * Non-NT servers interprets zero time value in SMB_SET_FILE_BASIC_INFO
+ * over TRANS2_SET_FILE_INFORMATION as a valid time value. NT servers
+ * interprets zero time value as do not change existing value on server.
+ * API of ->set_file_info() callback expects that zero time value has
+ * the NT meaning - do not change. Therefore if server is non-NT and
+ * some time values in "buf" are zero, then fetch missing time values.
+ */
+ if (!(tcon->ses->capabilities & CAP_NT_SMBS) &&
+ (!buf->CreationTime || !buf->LastAccessTime ||
+ !buf->LastWriteTime || !buf->ChangeTime)) {
+ rc = cifs_query_path_info(xid, tcon, cifs_sb, full_path, &query_data);
+ if (rc) {
+ if (open_file) {
+ cifsFileInfo_put(open_file);
+ open_file = NULL;
+ }
+ goto out;
+ }
+ /*
+ * Original write_time from buf->LastWriteTime is preserved
+ * as SMBSetInformation() interprets zero as do not change.
+ */
+ new_buf = *buf;
+ buf = &new_buf;
+ if (!buf->CreationTime)
+ buf->CreationTime = query_data.fi.CreationTime;
+ if (!buf->LastAccessTime)
+ buf->LastAccessTime = query_data.fi.LastAccessTime;
+ if (!buf->LastWriteTime)
+ buf->LastWriteTime = query_data.fi.LastWriteTime;
+ if (!buf->ChangeTime)
+ buf->ChangeTime = query_data.fi.ChangeTime;
}
- tcon = tlink_tcon(tlink);
+
+ if (open_file)
+ goto set_via_filehandle;
rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
cifs_sb);
@@ -843,8 +980,45 @@ smb_set_file_info(struct inode *inode, const char *full_path,
.fid = &fid,
};
- cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
- rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (S_ISDIR(inode->i_mode) && !(tcon->ses->capabilities & CAP_NT_SMBS)) {
+ /* Opening directory path is not possible on non-NT servers. */
+ rc = -EOPNOTSUPP;
+ } else {
+ /*
+ * Use cifs_open_file() instead of CIFS_open() as the
+ * cifs_open_file() selects the correct function which
+ * works also on non-NT servers.
+ */
+ rc = cifs_open_file(xid, &oparms, &oplock, NULL);
+ /*
+ * Opening path for writing on non-NT servers is not
+ * possible when the read-only attribute is already set.
+ * Non-NT server in this case returns -EACCES. For those
+ * servers the only possible way how to clear the read-only
+ * bit is via SMB_COM_SETATTR command.
+ */
+ if (rc == -EACCES &&
+ (cinode->cifsAttrs & ATTR_READONLY) &&
+ le32_to_cpu(buf->Attributes) != 0 && /* 0 = do not change attrs */
+ !(le32_to_cpu(buf->Attributes) & ATTR_READONLY) &&
+ !(tcon->ses->capabilities & CAP_NT_SMBS))
+ rc = -EOPNOTSUPP;
+ }
+
+ /* Fallback to SMB_COM_SETATTR command when absolutelty needed. */
+ if (rc == -EOPNOTSUPP) {
+ cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n");
+ rc = SMBSetInformation(xid, tcon, full_path,
+ buf->Attributes != 0 ? buf->Attributes : cpu_to_le32(cinode->cifsAttrs),
+ write_time,
+ cifs_sb->local_nls, cifs_sb);
+ if (rc == 0)
+ cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
+ else
+ rc = -EACCES;
+ goto out;
+ }
+
if (rc != 0) {
if (rc == -EIO)
rc = -EINVAL;
@@ -852,6 +1026,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
}
netpid = current->tgid;
+ cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for attrs/times not supported by this server\n");
set_via_filehandle:
rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid);
@@ -862,6 +1037,21 @@ set_via_filehandle:
CIFSSMBClose(xid, tcon, fid.netfid);
else
cifsFileInfo_put(open_file);
+
+ /*
+ * Setting the read-only bit is not honered on non-NT servers when done
+ * via open-semantics. So for setting it, use SMB_COM_SETATTR command.
+ * This command works only after the file is closed, so use it only when
+ * operation was called without the filehandle.
+ */
+ if (open_file == NULL &&
+ !(tcon->ses->capabilities & CAP_NT_SMBS) &&
+ le32_to_cpu(buf->Attributes) & ATTR_READONLY) {
+ SMBSetInformation(xid, tcon, full_path,
+ buf->Attributes,
+ 0 /* do not change write time */,
+ cifs_sb->local_nls, cifs_sb);
+ }
out:
if (tlink != NULL)
cifs_put_tlink(tlink);
@@ -999,18 +1189,13 @@ static int cifs_query_symlink(const unsigned int xid,
return rc;
}
-static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct kvec *rsp_iov,
- struct cifs_open_info_data *data)
+static struct reparse_data_buffer *cifs_get_reparse_point_buffer(const struct kvec *rsp_iov,
+ u32 *plen)
{
- struct reparse_data_buffer *buf;
TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base;
- u32 plen = le16_to_cpu(io->ByteCount);
-
- buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
- le32_to_cpu(io->DataOffset));
- return parse_reparse_point(buf, plen, cifs_sb, full_path, true, data);
+ *plen = le16_to_cpu(io->ByteCount);
+ return (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol +
+ le32_to_cpu(io->DataOffset));
}
static bool
@@ -1080,17 +1265,67 @@ cifs_make_node(unsigned int xid, struct inode *inode,
if (rc == 0)
d_instantiate(dentry, newinode);
return rc;
+ } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ /*
+ * Check if mounted with mount parm 'sfu' mount parm.
+ * SFU emulation should work with all servers
+ * and was used by default in earlier versions of Windows.
+ */
+ return cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ } else if (CIFS_REPARSE_SUPPORT(tcon)) {
+ /*
+ * mknod via reparse points requires server support for
+ * storing reparse points, which is available since
+ * Windows 2000, but was not widely used until release
+ * of Windows Server 2012 by the Windows NFS server.
+ */
+ return mknod_reparse(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ } else {
+ return -EOPNOTSUPP;
}
- /*
- * Check if mounted with mount parm 'sfu' mount parm.
- * SFU emulation should work with all servers, but only
- * supports block and char device, socket & fifo,
- * and was used by default in earlier versions of Windows
- */
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return -EPERM;
- return cifs_sfu_make_node(xid, inode, dentry, tcon,
- full_path, mode, dev);
+}
+
+static bool
+cifs_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
+{
+ struct smb_hdr *shdr = (struct smb_hdr *)buf;
+ struct TCP_Server_Info *pserver;
+ struct cifs_ses *ses;
+ struct cifs_tcon *tcon;
+
+ if (shdr->Flags2 & SMBFLG2_ERR_STATUS) {
+ if (shdr->Status.CifsError != cpu_to_le32(NT_STATUS_NETWORK_NAME_DELETED))
+ return false;
+ } else {
+ if (shdr->Status.DosError.ErrorClass != ERRSRV ||
+ shdr->Status.DosError.Error != cpu_to_le16(ERRinvtid))
+ return false;
+ }
+
+ /* If server is a channel, select the primary channel */
+ pserver = SERVER_IS_CHAN(server) ? server->primary_server : server;
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+ if (cifs_ses_exiting(ses))
+ continue;
+ list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+ if (tcon->tid == shdr->Tid) {
+ spin_lock(&tcon->tc_lock);
+ tcon->need_reconnect = true;
+ spin_unlock(&tcon->tc_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ pr_warn_once("Server share %s deleted.\n",
+ tcon->tree_name);
+ return true;
+ }
+ }
+ }
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ return false;
}
struct smb_version_operations smb1_operations = {
@@ -1118,8 +1353,8 @@ struct smb_version_operations smb1_operations = {
.check_trans2 = cifs_check_trans2,
.need_neg = cifs_need_neg,
.negotiate = cifs_negotiate,
- .negotiate_wsize = cifs_negotiate_wsize,
- .negotiate_rsize = cifs_negotiate_rsize,
+ .negotiate_wsize = smb1_negotiate_wsize,
+ .negotiate_rsize = smb1_negotiate_rsize,
.sess_setup = CIFS_SessSetup,
.logoff = CIFSSMBLogoff,
.tree_connect = CIFSTCon,
@@ -1145,7 +1380,8 @@ struct smb_version_operations smb1_operations = {
.rename = CIFSSMBRename,
.create_hardlink = CIFSCreateHardLink,
.query_symlink = cifs_query_symlink,
- .parse_reparse_point = cifs_parse_reparse_point,
+ .get_reparse_point_buffer = cifs_get_reparse_point_buffer,
+ .create_reparse_inode = cifs_create_reparse_inode,
.open = cifs_open_file,
.set_fid = cifs_set_fid,
.close = cifs_close_file,
@@ -1177,6 +1413,7 @@ struct smb_version_operations smb1_operations = {
.get_acl_by_fid = get_cifs_acl_by_fid,
.set_acl = set_cifs_acl,
.make_node = cifs_make_node,
+ .is_network_name_deleted = cifs_is_network_name_deleted,
};
struct smb_version_values smb1_values = {
@@ -1194,6 +1431,7 @@ struct smb_version_values smb1_values = {
.cap_unix = CAP_UNIX,
.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
.cap_large_files = CAP_LARGE_FILES,
+ .cap_unicode = CAP_UNICODE,
.signing_enabled = SECMODE_SIGN_ENABLED,
.signing_required = SECMODE_SIGN_REQUIRED,
};
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index 9ec44eab8dbc..a7f629238830 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -63,6 +63,52 @@ static struct smb2_symlink_err_rsp *symlink_data(const struct kvec *iov)
return sym;
}
+int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb)
+{
+ char *buf;
+ int len;
+
+ /*
+ * POSIX server does not distinguish between symlinks to file and
+ * symlink directory. So nothing is needed to fix on the client side.
+ */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
+ return 0;
+
+ if (!*target)
+ return -EIO;
+
+ len = strlen(*target);
+ if (!len)
+ return -EIO;
+
+ /*
+ * If this is directory symlink and it does not have trailing slash then
+ * append it. Trailing slash simulates Windows/SMB behavior which do not
+ * allow resolving directory symlink to file.
+ */
+ if (directory && (*target)[len-1] != '/') {
+ buf = krealloc(*target, len+2, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ buf[len] = '/';
+ buf[len+1] = '\0';
+ *target = buf;
+ len++;
+ }
+
+ /*
+ * If this is a file (non-directory) symlink and it points to path name
+ * with trailing slash then this is an invalid symlink because file name
+ * cannot contain slash character. File name with slash is invalid on
+ * both Windows and Linux systems. So return an error for such symlink.
+ */
+ if (!directory && (*target)[len-1] == '/')
+ return -EIO;
+
+ return 0;
+}
+
int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec *iov,
const char *full_path, char **path)
{
@@ -89,7 +135,6 @@ int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb, const struct kvec
return smb2_parse_native_symlink(path,
(char *)sym->PathBuffer + sub_offs,
sub_len,
- true,
le32_to_cpu(sym->Flags) & SYMLINK_FLAG_RELATIVE,
full_path,
cifs_sb);
@@ -107,16 +152,35 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
int err_buftype = CIFS_NO_BUFFER;
struct cifs_fid *fid = oparms->fid;
struct network_resiliency_req nr_ioctl_req;
+ bool retry_without_read_attributes = false;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL)
return -ENOMEM;
- oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ /*
+ * GENERIC_READ, GENERIC_EXECUTE, GENERIC_ALL and MAXIMUM_ALLOWED
+ * contains also FILE_READ_ATTRIBUTES access right. So do not append
+ * FILE_READ_ATTRIBUTES when not needed and prevent calling code path
+ * for retry_without_read_attributes.
+ */
+ if (!(oparms->desired_access & FILE_READ_ATTRIBUTES) &&
+ !(oparms->desired_access & GENERIC_READ) &&
+ !(oparms->desired_access & GENERIC_EXECUTE) &&
+ !(oparms->desired_access & GENERIC_ALL) &&
+ !(oparms->desired_access & MAXIMUM_ALLOWED)) {
+ oparms->desired_access |= FILE_READ_ATTRIBUTES;
+ retry_without_read_attributes = true;
+ }
smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
&err_buftype);
+ if (rc == -EACCES && retry_without_read_attributes) {
+ oparms->desired_access &= ~FILE_READ_ATTRIBUTES;
+ rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+ &err_buftype);
+ }
if (rc && data) {
struct smb2_hdr *hdr = err_iov.iov_base;
@@ -133,6 +197,11 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
NULL, NULL, NULL);
oparms->create_options &= ~OPEN_REPARSE_POINT;
}
+ if (!rc) {
+ bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY;
+ rc = smb2_fix_symlink_target_type(&data->symlink_target,
+ directory, oparms->cifs_sb);
+ }
}
}
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 2466e6155136..224495322a05 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -38,6 +38,7 @@ enum smb2_compound_ops {
SMB2_OP_SET_REPARSE,
SMB2_OP_GET_REPARSE,
SMB2_OP_QUERY_WSL_EA,
+ SMB2_OP_OPEN_QUERY,
};
/* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index c97f14757c27..2a0316c514e4 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -176,6 +176,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
{
+ struct smb2_create_rsp *create_rsp = NULL;
struct smb2_query_info_rsp *qi_rsp = NULL;
struct smb2_compound_vars *vars = NULL;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -265,7 +266,13 @@ replay_again:
num_rqst++;
rc = 0;
- for (i = 0; i < num_cmds; i++) {
+ i = 0;
+
+ /* Skip the leading explicit OPEN operation */
+ if (num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY)
+ i++;
+
+ for (; i < num_cmds; i++) {
/* Operation */
switch (cmds[i]) {
case SMB2_OP_QUERY_INFO:
@@ -640,6 +647,29 @@ finished:
}
tmp_rc = rc;
+
+ if (rc == 0 && num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) {
+ create_rsp = rsp_iov[0].iov_base;
+ idata = in_iov[0].iov_base;
+ idata->fi.CreationTime = create_rsp->CreationTime;
+ idata->fi.LastAccessTime = create_rsp->LastAccessTime;
+ idata->fi.LastWriteTime = create_rsp->LastWriteTime;
+ idata->fi.ChangeTime = create_rsp->ChangeTime;
+ idata->fi.Attributes = create_rsp->FileAttributes;
+ idata->fi.AllocationSize = create_rsp->AllocationSize;
+ idata->fi.EndOfFile = create_rsp->EndofFile;
+ if (le32_to_cpu(idata->fi.NumberOfLinks) == 0)
+ idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */
+ idata->fi.DeletePending = 0;
+ idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY);
+
+ /* smb2_parse_contexts() fills idata->fi.IndexNumber */
+ rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch,
+ oparms->fid->lease_key, &oplock, &idata->fi, NULL);
+ if (rc)
+ cifs_dbg(VFS, "rc: %d parsing context of compound op\n", rc);
+ }
+
for (i = 0; i < num_cmds; i++) {
char *buf = rsp_iov[i + i].iov_base;
@@ -650,6 +680,7 @@ finished:
switch (cmds[i]) {
case SMB2_OP_QUERY_INFO:
idata = in_iov[i].iov_base;
+ idata->contains_posix_file_info = false;
if (rc == 0 && cfile && cfile->symlink_target) {
idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
if (!idata->symlink_target)
@@ -673,6 +704,7 @@ finished:
break;
case SMB2_OP_POSIX_QUERY_INFO:
idata = in_iov[i].iov_base;
+ idata->contains_posix_file_info = true;
if (rc == 0 && cfile && cfile->symlink_target) {
idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
if (!idata->symlink_target)
@@ -770,6 +802,7 @@ finished:
idata = in_iov[i].iov_base;
idata->reparse.io.iov = *iov;
idata->reparse.io.buftype = resp_buftype[i + 1];
+ idata->contains_posix_file_info = false; /* BB VERIFY */
rbuf = reparse_buf_ptr(iov);
if (IS_ERR(rbuf)) {
rc = PTR_ERR(rbuf);
@@ -791,6 +824,7 @@ finished:
case SMB2_OP_QUERY_WSL_EA:
if (!rc) {
idata = in_iov[i].iov_base;
+ idata->contains_posix_file_info = false;
qi_rsp = rsp_iov[i + 1].iov_base;
data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset);
size[0] = le32_to_cpu(qi_rsp->OutputBufferLength);
@@ -974,6 +1008,43 @@ int smb2_query_path_info(const unsigned int xid,
case 0:
rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
break;
+ case -EACCES:
+ /*
+ * If SMB2_OP_QUERY_INFO (called when POSIX extensions are not used) failed with
+ * STATUS_ACCESS_DENIED then it means that caller does not have permission to
+ * open the path with FILE_READ_ATTRIBUTES access and therefore cannot issue
+ * SMB2_OP_QUERY_INFO command.
+ *
+ * There is an alternative way how to query limited information about path but still
+ * suitable for stat() syscall. SMB2 OPEN/CREATE operation returns in its successful
+ * response subset of query information.
+ *
+ * So try to open the path without FILE_READ_ATTRIBUTES but with MAXIMUM_ALLOWED
+ * access which will grant the maximum possible access to the file and the response
+ * will contain required query information for stat() syscall.
+ */
+
+ if (tcon->posix_extensions)
+ break;
+
+ num_cmds = 1;
+ cmds[0] = SMB2_OP_OPEN_QUERY;
+ in_iov[0].iov_base = data;
+ in_iov[0].iov_len = sizeof(*data);
+ oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, MAXIMUM_ALLOWED,
+ FILE_OPEN, create_options, ACL_NO_MODE);
+ free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov));
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ &oparms, in_iov, cmds, num_cmds,
+ cfile, out_iov, out_buftype, NULL);
+
+ hdr = out_iov[0].iov_base;
+ if (!hdr || out_buftype[0] == CIFS_NO_BUFFER)
+ goto out;
+
+ if (!rc)
+ rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
+ break;
case -EOPNOTSUPP:
/*
* BB TODO: When support for special files added to Samba
@@ -987,10 +1058,11 @@ int smb2_query_path_info(const unsigned int xid,
* Skip SMB2_OP_GET_REPARSE if symlink already parsed in create
* response.
*/
- if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK)
+ if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK) {
cmds[num_cmds++] = SMB2_OP_GET_REPARSE;
- if (!tcon->posix_extensions)
- cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+ if (!tcon->posix_extensions)
+ cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+ }
oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
FILE_READ_ATTRIBUTES |
@@ -1010,6 +1082,11 @@ int smb2_query_path_info(const unsigned int xid,
else
rc = -EOPNOTSUPP;
}
+
+ if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) {
+ bool directory = le32_to_cpu(data->fi.Attributes) & ATTR_DIRECTORY;
+ rc = smb2_fix_symlink_target_type(&data->symlink_target, directory, cifs_sb);
+ }
break;
case -EREMOTE:
break;
@@ -1244,7 +1321,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
return rc;
}
-struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data,
struct super_block *sb,
const unsigned int xid,
struct cifs_tcon *tcon,
@@ -1264,6 +1341,14 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
int rc;
int i;
+ /*
+ * If server filesystem does not support reparse points then do not
+ * attempt to create reparse point. This will prevent creating unusable
+ * empty object on the server.
+ */
+ if (!CIFS_REPARSE_SUPPORT(tcon))
+ return ERR_PTR(-EOPNOTSUPP);
+
oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
SYNCHRONIZE | DELETE |
FILE_READ_ATTRIBUTES |
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index b05313acf9b2..12c2b868789f 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -380,7 +380,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_NO_LOGON_SERVERS, -EIO, "STATUS_NO_LOGON_SERVERS"},
{STATUS_NO_SUCH_LOGON_SESSION, -EIO, "STATUS_NO_SUCH_LOGON_SESSION"},
{STATUS_NO_SUCH_PRIVILEGE, -EIO, "STATUS_NO_SUCH_PRIVILEGE"},
- {STATUS_PRIVILEGE_NOT_HELD, -EIO, "STATUS_PRIVILEGE_NOT_HELD"},
+ {STATUS_PRIVILEGE_NOT_HELD, -EPERM, "STATUS_PRIVILEGE_NOT_HELD"},
{STATUS_INVALID_ACCOUNT_NAME, -EIO, "STATUS_INVALID_ACCOUNT_NAME"},
{STATUS_USER_EXISTS, -EIO, "STATUS_USER_EXISTS"},
{STATUS_NO_SUCH_USER, -EIO, "STATUS_NO_SUCH_USER"},
@@ -871,7 +871,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_VALIDATE_CONTINUE, -EIO, "STATUS_VALIDATE_CONTINUE"},
{STATUS_NO_MATCH, -EIO, "STATUS_NO_MATCH"},
{STATUS_NO_MORE_MATCHES, -EIO, "STATUS_NO_MORE_MATCHES"},
- {STATUS_NOT_A_REPARSE_POINT, -EIO, "STATUS_NOT_A_REPARSE_POINT"},
+ {STATUS_NOT_A_REPARSE_POINT, -ENODATA, "STATUS_NOT_A_REPARSE_POINT"},
{STATUS_IO_REPARSE_TAG_INVALID, -EIO, "STATUS_IO_REPARSE_TAG_INVALID"},
{STATUS_IO_REPARSE_TAG_MISMATCH, -EIO,
"STATUS_IO_REPARSE_TAG_MISMATCH"},
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index f3c4b70b77b9..cddf273c14ae 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -816,11 +816,12 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
spin_unlock(&cifs_tcp_ses_lock);
- if (tcon->ses)
+ if (tcon->ses) {
server = tcon->ses->server;
-
- cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
- tcon->tid, persistent_fid, volatile_fid);
+ cifs_server_dbg(FYI,
+ "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
+ tcon->tid, persistent_fid, volatile_fid);
+ }
return 0;
}
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index d640dcabc305..94b1d7a395d5 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -91,7 +91,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
if (*val > 65000) {
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
pr_warn_once("server overflowed SMB3 credits\n");
- trace_smb3_overflow_credits(server->CurrentMid,
+ trace_smb3_overflow_credits(server->current_mid,
server->conn_id, server->hostname, *val,
add, server->in_flight);
}
@@ -136,7 +136,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
wake_up(&server->request_q);
if (reconnect_detected) {
- trace_smb3_reconnect_detected(server->CurrentMid,
+ trace_smb3_reconnect_detected(server->current_mid,
server->conn_id, server->hostname, scredits, add, in_flight);
cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n",
@@ -144,7 +144,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
}
if (reconnect_with_invalid_credits) {
- trace_smb3_reconnect_with_invalid_credits(server->CurrentMid,
+ trace_smb3_reconnect_with_invalid_credits(server->current_mid,
server->conn_id, server->hostname, scredits, add, in_flight);
cifs_dbg(FYI, "Negotiate operation when server credits is non-zero. Optype: %d, server credits: %d, credits added: %d\n",
optype, scredits, add);
@@ -176,7 +176,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
break;
}
- trace_smb3_add_credits(server->CurrentMid,
+ trace_smb3_add_credits(server->current_mid,
server->conn_id, server->hostname, scredits, add, in_flight);
cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, scredits);
}
@@ -203,7 +203,7 @@ smb2_set_credits(struct TCP_Server_Info *server, const int val)
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_set_credits(server->CurrentMid,
+ trace_smb3_set_credits(server->current_mid,
server->conn_id, server->hostname, scredits, val, in_flight);
cifs_dbg(FYI, "%s: set %u credits\n", __func__, val);
@@ -288,7 +288,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_wait_credits(server->CurrentMid,
+ trace_smb3_wait_credits(server->current_mid,
server->conn_id, server->hostname, scredits, -(credits->value), in_flight);
cifs_dbg(FYI, "%s: removed %u credits total=%d\n",
__func__, credits->value, scredits);
@@ -316,7 +316,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
server->credits, server->in_flight,
new_val - credits->value,
cifs_trace_rw_credits_no_adjust_up);
- trace_smb3_too_many_credits(server->CurrentMid,
+ trace_smb3_too_many_credits(server->current_mid,
server->conn_id, server->hostname, 0, credits->value - new_val, 0);
cifs_server_dbg(VFS, "R=%x[%x] request has less credits (%d) than required (%d)",
subreq->rreq->debug_id, subreq->subreq.debug_index,
@@ -338,7 +338,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
server->credits, server->in_flight,
new_val - credits->value,
cifs_trace_rw_credits_old_session);
- trace_smb3_reconnect_detected(server->CurrentMid,
+ trace_smb3_reconnect_detected(server->current_mid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
cifs_server_dbg(VFS, "R=%x[%x] trying to return %d credits to old session\n",
@@ -358,7 +358,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server,
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_adj_credits(server->CurrentMid,
+ trace_smb3_adj_credits(server->current_mid,
server->conn_id, server->hostname, scredits,
credits->value - new_val, in_flight);
cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n",
@@ -374,19 +374,19 @@ smb2_get_next_mid(struct TCP_Server_Info *server)
{
__u64 mid;
/* for SMB2 we need the current value */
- spin_lock(&server->mid_lock);
- mid = server->CurrentMid++;
- spin_unlock(&server->mid_lock);
+ spin_lock(&server->mid_counter_lock);
+ mid = server->current_mid++;
+ spin_unlock(&server->mid_counter_lock);
return mid;
}
static void
smb2_revert_current_mid(struct TCP_Server_Info *server, const unsigned int val)
{
- spin_lock(&server->mid_lock);
- if (server->CurrentMid >= val)
- server->CurrentMid -= val;
- spin_unlock(&server->mid_lock);
+ spin_lock(&server->mid_counter_lock);
+ if (server->current_mid >= val)
+ server->current_mid -= val;
+ spin_unlock(&server->mid_counter_lock);
}
static struct mid_q_entry *
@@ -401,7 +401,7 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
return NULL;
}
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
if ((mid->mid == wire_mid) &&
(mid->mid_state == MID_REQUEST_SUBMITTED) &&
@@ -409,13 +409,13 @@ __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue)
kref_get(&mid->refcount);
if (dequeue) {
list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
+ mid->deleted_from_q = true;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
return mid;
}
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
return NULL;
}
@@ -460,16 +460,24 @@ smb2_negotiate(const unsigned int xid,
{
int rc;
- spin_lock(&server->mid_lock);
- server->CurrentMid = 0;
- spin_unlock(&server->mid_lock);
+ spin_lock(&server->mid_counter_lock);
+ server->current_mid = 0;
+ spin_unlock(&server->mid_counter_lock);
rc = SMB2_negotiate(xid, ses, server);
- /* BB we probably don't need to retry with modern servers */
- if (rc == -EAGAIN)
- rc = -EHOSTDOWN;
return rc;
}
+static inline unsigned int
+prevent_zero_iosize(unsigned int size, const char *type)
+{
+ if (size == 0) {
+ cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n",
+ type, CIFS_MIN_DEFAULT_IOSIZE);
+ return CIFS_MIN_DEFAULT_IOSIZE;
+ }
+ return size;
+}
+
static unsigned int
smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
{
@@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -492,10 +500,13 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int wsize;
/* start with specified wsize, or default */
- wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE;
+ wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
+ struct smbdirect_socket_parameters *sp =
+ &server->smbd_conn->socket.parameters;
+
if (server->sign)
/*
* Account for SMB2 data transfer packet header and
@@ -503,18 +514,18 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
*/
wsize = min_t(unsigned int,
wsize,
- server->smbd_conn->max_fragmented_send_size -
+ sp->max_fragmented_send_size -
SMB2_READWRITE_PDU_HEADER_SIZE -
sizeof(struct smb2_transform_hdr));
else
wsize = min_t(unsigned int,
- wsize, server->smbd_conn->max_readwrite_size);
+ wsize, sp->max_read_write_size);
}
#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
- return wsize;
+ return prevent_zero_iosize(wsize, "w");
}
static unsigned int
@@ -524,13 +535,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
static unsigned int
@@ -540,10 +551,13 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
unsigned int rsize;
/* start with specified rsize, or default */
- rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE;
+ rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (server->rdma) {
+ struct smbdirect_socket_parameters *sp =
+ &server->smbd_conn->socket.parameters;
+
if (server->sign)
/*
* Account for SMB2 data transfer packet header and
@@ -551,19 +565,19 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
*/
rsize = min_t(unsigned int,
rsize,
- server->smbd_conn->max_fragmented_recv_size -
+ sp->max_fragmented_recv_size -
SMB2_READWRITE_PDU_HEADER_SIZE -
sizeof(struct smb2_transform_hdr));
else
rsize = min_t(unsigned int,
- rsize, server->smbd_conn->max_readwrite_size);
+ rsize, sp->max_read_write_size);
}
#endif
if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
- return rsize;
+ return prevent_zero_iosize(rsize, "r");
}
/*
@@ -758,6 +772,13 @@ next_iface:
bytes_left -= sizeof(*p);
break;
}
+ /* Validate that Next doesn't point beyond the buffer */
+ if (next > bytes_left) {
+ cifs_dbg(VFS, "%s: invalid Next pointer %zu > %zd\n",
+ __func__, next, bytes_left);
+ rc = -EINVAL;
+ goto out;
+ }
p = (struct network_interface_info_ioctl_rsp *)((u8 *)p+next);
bytes_left -= next;
}
@@ -769,7 +790,9 @@ next_iface:
}
/* Azure rounds the buffer size up 8, to a 16 byte boundary */
- if ((bytes_left > 8) || p->Next)
+ if ((bytes_left > 8) ||
+ (bytes_left >= offsetof(struct network_interface_info_ioctl_rsp, Next)
+ + sizeof(p->Next) && p->Next))
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
ses->iface_last_update = jiffies;
@@ -969,7 +992,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
if (islink)
rc = -EREMOTE;
}
- if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb &&
+ if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) &&
(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS))
rc = -EOPNOTSUPP;
goto out;
@@ -1001,6 +1024,7 @@ static int smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
if (!data->symlink_target)
return -ENOMEM;
}
+ data->contains_posix_file_info = false;
return SMB2_query_info(xid, tcon, fid->persistent_fid, fid->volatile_fid, &data->fi);
}
@@ -2483,7 +2507,7 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server)
spin_unlock(&server->req_lock);
wake_up(&server->request_q);
- trace_smb3_pend_credits(server->CurrentMid,
+ trace_smb3_pend_credits(server->current_mid,
server->conn_id, server->hostname, scredits,
le16_to_cpu(shdr->CreditRequest), in_flight);
cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n",
@@ -3525,8 +3549,6 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (rc == 0) {
netfs_resize_file(&cifsi->netfs, new_eof, true);
cifs_setsize(inode, new_eof);
- cifs_truncate_page(inode->i_mapping, inode->i_size);
- truncate_setsize(inode, new_eof);
}
goto out;
}
@@ -3904,22 +3926,22 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
static void
smb2_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
server->ops->set_oplock_level(cinode, oplock, 0, NULL);
}
static void
smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache);
+ __u16 epoch, bool *purge_cache);
static void
smb3_downgrade_oplock(struct TCP_Server_Info *server,
struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
unsigned int old_state = cinode->oplock;
- unsigned int old_epoch = cinode->epoch;
+ __u16 old_epoch = cinode->epoch;
unsigned int new_state;
if (epoch > old_epoch) {
@@ -3939,7 +3961,7 @@ smb3_downgrade_oplock(struct TCP_Server_Info *server,
static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
oplock &= 0xFF;
cinode->lease_granted = false;
@@ -3963,7 +3985,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
static void
smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
char message[5] = {0};
unsigned int new_oplock = 0;
@@ -4000,7 +4022,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
static void
smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
- unsigned int epoch, bool *purge_cache)
+ __u16 epoch, bool *purge_cache)
{
unsigned int old_oplock = cinode->oplock;
@@ -4062,7 +4084,7 @@ map_oplock_to_lease(u8 oplock)
}
static char *
-smb2_create_lease_buf(u8 *lease_key, u8 oplock)
+smb2_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags)
{
struct create_lease *buf;
@@ -4088,7 +4110,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock)
}
static char *
-smb3_create_lease_buf(u8 *lease_key, u8 oplock)
+smb3_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags)
{
struct create_lease_v2 *buf;
@@ -4098,6 +4120,9 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+ buf->lcontext.LeaseFlags = flags;
+ if (flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE)
+ memcpy(&buf->lcontext.ParentLeaseKey, parent_lease_key, SMB2_LEASE_KEY_SIZE);
buf->ccontext.DataOffset = cpu_to_le16(offsetof
(struct create_lease_v2, lcontext));
@@ -4114,7 +4139,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
}
static __u8
-smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
+smb2_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key)
{
struct create_lease *lc = (struct create_lease *)buf;
@@ -4125,7 +4150,7 @@ smb2_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
}
static __u8
-smb3_parse_lease_buf(void *buf, unsigned int *epoch, char *lease_key)
+smb3_parse_lease_buf(void *buf, __u16 *epoch, char *lease_key)
{
struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
@@ -4300,6 +4325,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 key[SMB3_ENC_DEC_KEY_SIZE];
struct aead_request *req;
u8 *iv;
+ DECLARE_CRYPTO_WAIT(wait);
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
void *creq;
size_t sensitive_size;
@@ -4350,7 +4376,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
- rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
+ aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+
+ rc = crypto_wait_req(enc ? crypto_aead_encrypt(req)
+ : crypto_aead_decrypt(req), &wait);
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
@@ -4466,7 +4496,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst,
for (int i = 1; i < num_rqst; i++) {
struct smb_rqst *old = &old_rq[i - 1];
struct smb_rqst *new = &new_rq[i];
- struct folio_queue *buffer;
+ struct folio_queue *buffer = NULL;
size_t size = iov_iter_count(&old->rq_iter);
orig_len += smb_rqst_len(server, old);
@@ -4548,9 +4578,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
return rc;
}
} else {
- if (unlikely(!server->secmech.dec))
- return -EIO;
-
+ rc = smb3_crypto_aead_allocate(server);
+ if (unlikely(rc))
+ return rc;
tfm = server->secmech.dec;
}
@@ -4784,22 +4814,22 @@ static void smb2_decrypt_offload(struct work_struct *work)
dw->server->ops->is_network_name_deleted(dw->buf,
dw->server);
- mid->callback(mid);
+ mid_execute_callback(mid);
} else {
spin_lock(&dw->server->srv_lock);
if (dw->server->tcpStatus == CifsNeedReconnect) {
- spin_lock(&dw->server->mid_lock);
+ spin_lock(&dw->server->mid_queue_lock);
mid->mid_state = MID_RETRY_NEEDED;
- spin_unlock(&dw->server->mid_lock);
+ spin_unlock(&dw->server->mid_queue_lock);
spin_unlock(&dw->server->srv_lock);
- mid->callback(mid);
+ mid_execute_callback(mid);
} else {
- spin_lock(&dw->server->mid_lock);
+ spin_lock(&dw->server->mid_queue_lock);
mid->mid_state = MID_REQUEST_SUBMITTED;
- mid->mid_flags &= ~(MID_DELETED);
+ mid->deleted_from_q = false;
list_add_tail(&mid->qhead,
&dw->server->pending_mid_q);
- spin_unlock(&dw->server->mid_lock);
+ spin_unlock(&dw->server->mid_queue_lock);
spin_unlock(&dw->server->srv_lock);
}
}
@@ -4964,6 +4994,10 @@ one_more:
next_buffer = (char *)cifs_buf_get();
else
next_buffer = (char *)cifs_small_buf_get();
+ if (!next_buffer) {
+ cifs_server_dbg(VFS, "No memory for (large) SMB response\n");
+ return -1;
+ }
memcpy(next_buffer, buf + next_cmd, pdu_length - next_cmd);
}
@@ -5077,6 +5111,7 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
{
struct TCP_Server_Info *server = tcon->ses->server;
struct cifs_open_parms oparms;
+ struct cifs_open_info_data idata;
struct cifs_io_parms io_parms = {};
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct cifs_fid fid;
@@ -5145,11 +5180,21 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
FILE_CREATE, CREATE_NOT_DIR |
CREATE_OPTION_SPECIAL, ACL_NO_MODE);
oparms.fid = &fid;
-
- rc = server->ops->open(xid, &oparms, &oplock, NULL);
+ idata.contains_posix_file_info = false;
+ rc = server->ops->open(xid, &oparms, &oplock, &idata);
if (rc)
goto out;
+ /*
+ * Check if the server honored ATTR_SYSTEM flag by CREATE_OPTION_SPECIAL
+ * option. If not then server does not support ATTR_SYSTEM and newly
+ * created file is not SFU compatible, which means that the call failed.
+ */
+ if (!(le32_to_cpu(idata.fi.Attributes) & ATTR_SYSTEM)) {
+ rc = -EOPNOTSUPP;
+ goto out_close;
+ }
+
if (type_len + data_len > 0) {
io_parms.pid = current->tgid;
io_parms.tcon = tcon;
@@ -5164,8 +5209,18 @@ int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
iov, ARRAY_SIZE(iov)-1);
}
+out_close:
server->ops->close(xid, tcon, &fid);
+ /*
+ * If CREATE was successful but either setting ATTR_SYSTEM failed or
+ * writing type/data information failed then remove the intermediate
+ * object created by CREATE. Otherwise intermediate empty object stay
+ * on the server.
+ */
+ if (rc)
+ server->ops->unlink(xid, tcon, full_path, cifs_sb, NULL);
+
out:
kfree(symname_utf16);
return rc;
@@ -5203,7 +5258,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
const char *full_path, umode_t mode, dev_t dev)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
- int rc;
+ int rc = -EOPNOTSUPP;
/*
* Check if mounted with mount parm 'sfu' mount parm.
@@ -5214,9 +5269,9 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
full_path, mode, dev);
- } else {
- rc = smb2_mknod_reparse(xid, inode, dentry, tcon,
- full_path, mode, dev);
+ } else if (CIFS_REPARSE_SUPPORT(tcon)) {
+ rc = mknod_reparse(xid, inode, dentry, tcon,
+ full_path, mode, dev);
}
return rc;
}
@@ -5271,10 +5326,10 @@ struct smb_version_operations smb20_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5374,10 +5429,10 @@ struct smb_version_operations smb21_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5481,10 +5536,10 @@ struct smb_version_operations smb30_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5597,10 +5652,10 @@ struct smb_version_operations smb311_operations = {
.unlink = smb2_unlink,
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
- .parse_reparse_point = smb2_parse_reparse_point,
+ .get_reparse_point_buffer = smb2_get_reparse_point_buffer,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
- .create_reparse_symlink = smb2_create_reparse_symlink,
+ .create_reparse_inode = smb2_create_reparse_inode,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 9f54596a6866..2df93a75e3b8 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -36,6 +36,7 @@
#include "smb2glob.h"
#include "cifspdu.h"
#include "cifs_spnego.h"
+#include "../common/smbdirect/smbdirect.h"
#include "smbdirect.h"
#include "trace.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -43,6 +44,7 @@
#endif
#include "cached_dir.h"
#include "compress.h"
+#include "fs_context.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -300,32 +302,23 @@ again:
mutex_lock(&ses->session_mutex);
/*
- * if this is called by delayed work, and the channel has been disabled
- * in parallel, the delayed work can continue to execute in parallel
- * there's a chance that this channel may not exist anymore
+ * Handle the case where a concurrent thread failed to negotiate or
+ * killed a channel.
*/
spin_lock(&server->srv_lock);
- if (server->tcpStatus == CifsExiting) {
+ switch (server->tcpStatus) {
+ case CifsExiting:
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
- goto out;
- }
-
- /*
- * Recheck after acquire mutex. If another thread is negotiating
- * and the server never sends an answer the socket will be closed
- * and tcpStatus set to reconnect.
- */
- if (server->tcpStatus == CifsNeedReconnect) {
+ return -EHOSTDOWN;
+ case CifsNeedReconnect:
spin_unlock(&server->srv_lock);
mutex_unlock(&ses->session_mutex);
-
- if (tcon->retry)
- goto again;
-
- rc = -EHOSTDOWN;
- goto out;
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ default:
+ break;
}
spin_unlock(&server->srv_lock);
@@ -350,43 +343,41 @@ again:
spin_unlock(&ses->ses_lock);
rc = cifs_negotiate_protocol(0, ses, server);
- if (!rc) {
- /*
- * if server stopped supporting multichannel
- * and the first channel reconnected, disable all the others.
- */
- if (ses->chan_count > 1 &&
- !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- rc = cifs_chan_skip_or_disable(ses, server,
- from_reconnect);
- if (rc) {
- mutex_unlock(&ses->session_mutex);
- goto out;
- }
- }
-
- rc = cifs_setup_session(0, ses, server, ses->local_nls);
- if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
- /*
- * Try alternate password for next reconnect (key rotation
- * could be enabled on the server e.g.) if an alternate
- * password is available and the current password is expired,
- * but do not swap on non pwd related errors like host down
- */
- if (ses->password2)
- swap(ses->password2, ses->password);
- }
-
- if ((rc == -EACCES) && !tcon->retry) {
- mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
- goto failed;
- } else if (rc) {
+ if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ if (!tcon->retry)
+ return -EHOSTDOWN;
+ goto again;
+ }
+ /*
+ * if server stopped supporting multichannel
+ * and the first channel reconnected, disable all the others.
+ */
+ if (ses->chan_count > 1 &&
+ !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ if (rc) {
mutex_unlock(&ses->session_mutex);
goto out;
}
- } else {
+ }
+
+ rc = cifs_setup_session(0, ses, server, ses->local_nls);
+ if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
+ /*
+ * Try alternate password for next reconnect (key rotation
+ * could be enabled on the server e.g.) if an alternate
+ * password is available and the current password is expired,
+ * but do not swap on non pwd related errors like host down
+ */
+ if (ses->password2)
+ swap(ses->password2, ses->password);
+ }
+ if (rc) {
mutex_unlock(&ses->session_mutex);
+ if (rc == -EACCES && !tcon->retry)
+ return -EHOSTDOWN;
goto out;
}
@@ -421,14 +412,23 @@ skip_sess_setup:
if (!rc &&
(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) &&
server->ops->query_server_interfaces) {
- mutex_unlock(&ses->session_mutex);
-
/*
- * query server network interfaces, in case they change
+ * query server network interfaces, in case they change.
+ * Also mark the session as pending this update while the query
+ * is in progress. This will be used to avoid calling
+ * smb2_reconnect recursively.
*/
+ ses->flags |= CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
xid = get_xid();
rc = server->ops->query_server_interfaces(xid, tcon, false);
free_xid(xid);
+ ses->flags &= ~CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES;
+
+ if (!tcon->ipc && !tcon->dummy)
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+
+ mutex_unlock(&ses->session_mutex);
if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
/*
@@ -448,11 +448,8 @@ skip_sess_setup:
if (ses->chan_max > ses->chan_count &&
ses->iface_count &&
!SERVER_IS_CHAN(server)) {
- if (ses->chan_count == 1) {
+ if (ses->chan_count == 1)
cifs_server_dbg(VFS, "supports multichannel now\n");
- queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
- (SMB_INTERFACE_POLL_INTERVAL * HZ));
- }
cifs_try_adding_channels(ses);
}
@@ -490,7 +487,6 @@ out:
case SMB2_IOCTL:
rc = -EAGAIN;
}
-failed:
return rc;
}
@@ -571,11 +567,18 @@ static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
void **request_buf, unsigned int *total_len)
{
- /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */
- if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) {
+ /*
+ * Skip reconnect in one of the following cases:
+ * 1. For FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs
+ * 2. For FSCTL_QUERY_NETWORK_INTERFACE_INFO IOCTL when called from
+ * smb2_reconnect (indicated by CIFS_SES_FLAG_SCALE_CHANNELS ses flag)
+ */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO ||
+ (opcode == FSCTL_QUERY_NETWORK_INTERFACE_INFO &&
+ (tcon->ses->flags & CIFS_SES_FLAGS_PENDING_QUERY_INTERFACES)))
return __smb2_plain_req_init(SMB2_IOCTL, tcon, server,
request_buf, total_len);
- }
+
return smb2_plain_req_init(SMB2_IOCTL, tcon, server,
request_buf, total_len);
}
@@ -1263,15 +1266,8 @@ SMB2_negotiate(const unsigned int xid,
cifs_server_dbg(VFS, "Missing expected negotiate contexts\n");
}
- if (server->cipher_type && !rc) {
- if (!SERVER_IS_CHAN(server)) {
- rc = smb3_crypto_aead_allocate(server);
- } else {
- /* For channels, just reuse the primary server crypto secmech. */
- server->secmech.enc = server->primary_server->secmech.enc;
- server->secmech.dec = server->primary_server->secmech.dec;
- }
- }
+ if (server->cipher_type && !rc)
+ rc = smb3_crypto_aead_allocate(server);
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -1429,7 +1425,7 @@ smb2_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
if (server->sec_ntlmssp &&
(global_secflags & CIFSSEC_MAY_NTLMSSP))
return RawNTLMSSP;
- if ((server->sec_kerberos || server->sec_mskerberos) &&
+ if ((server->sec_kerberos || server->sec_mskerberos || server->sec_iakerb) &&
(global_secflags & CIFSSEC_MAY_KRB5))
return Kerberos;
fallthrough;
@@ -2169,7 +2165,7 @@ tcon_exit:
tcon_error_exit:
if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME)
- cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
+ cifs_dbg(VFS | ONCE, "BAD_NETWORK_NAME: %s\n", tree);
goto tcon_exit;
}
@@ -2329,7 +2325,7 @@ parse_posix_ctxt(struct create_context *cc, struct smb2_file_all_info *info,
int smb2_parse_contexts(struct TCP_Server_Info *server,
struct kvec *rsp_iov,
- unsigned int *epoch,
+ __u16 *epoch,
char *lease_key, __u8 *oplock,
struct smb2_file_all_info *buf,
struct create_posix_rsp *posix)
@@ -2410,11 +2406,16 @@ static int
add_lease_context(struct TCP_Server_Info *server,
struct smb2_create_req *req,
struct kvec *iov,
- unsigned int *num_iovec, u8 *lease_key, __u8 *oplock)
+ unsigned int *num_iovec,
+ u8 *lease_key,
+ __u8 *oplock,
+ u8 *parent_lease_key,
+ __le32 flags)
{
unsigned int num = *num_iovec;
- iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock);
+ iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock,
+ parent_lease_key, flags);
if (iov[num].iov_base == NULL)
return -ENOMEM;
iov[num].iov_len = server->vals->create_lease_size;
@@ -2939,6 +2940,7 @@ replay_again:
req->CreateContextsOffset = cpu_to_le32(
sizeof(struct smb2_create_req) +
iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, iov[n_iov-1].iov_len);
pc_buf = iov[n_iov-1].iov_base;
}
@@ -2985,7 +2987,7 @@ replay_again:
/* Eventually save off posix specific response info and timestamps */
err_free_rsp_buf:
- free_rsp_buf(resp_buftype, rsp);
+ free_rsp_buf(resp_buftype, rsp_iov.iov_base);
kfree(pc_buf);
err_free_req:
cifs_small_buf_release(req);
@@ -3086,7 +3088,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
req->RequestedOplockLevel = *oplock; /* no srv lease support */
else {
rc = add_lease_context(server, req, iov, &n_iov,
- oparms->fid->lease_key, oplock);
+ oparms->fid->lease_key, oplock,
+ oparms->fid->parent_lease_key,
+ oparms->lease_flags);
if (rc)
return rc;
}
@@ -3928,12 +3932,10 @@ SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
void **data, u32 *plen, u32 extra_info)
{
- __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO |
- extra_info;
*plen = 0;
return query_info(xid, tcon, persistent_fid, volatile_fid,
- 0, SMB2_O_INFO_SECURITY, additional_info,
+ 0, SMB2_O_INFO_SECURITY, extra_info,
SMB2_MAX_BUFFER_SIZE, MIN_SEC_DESC_LEN, data, plen);
}
@@ -4103,6 +4105,20 @@ smb2_echo_callback(struct mid_q_entry *mid)
add_credits(server, &credits, CIFS_ECHO_OP);
}
+static void cifs_renegotiate_iosize(struct TCP_Server_Info *server,
+ struct cifs_tcon *tcon)
+{
+ struct cifs_sb_info *cifs_sb;
+
+ if (server == NULL || tcon == NULL)
+ return;
+
+ spin_lock(&tcon->sb_list_lock);
+ list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link)
+ cifs_negotiate_iosize(server, cifs_sb->ctx, tcon);
+ spin_unlock(&tcon->sb_list_lock);
+}
+
void smb2_reconnect_server(struct work_struct *work)
{
struct TCP_Server_Info *server = container_of(work,
@@ -4188,9 +4204,10 @@ void smb2_reconnect_server(struct work_struct *work)
list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) {
rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true);
- if (!rc)
+ if (!rc) {
+ cifs_renegotiate_iosize(server, tcon);
cifs_reopen_persistent_handles(tcon);
- else
+ } else
resched = true;
list_del_init(&tcon->rlist);
if (tcon->ipc)
@@ -4212,10 +4229,8 @@ void smb2_reconnect_server(struct work_struct *work)
}
goto done;
}
-
tcon->status = TID_GOOD;
- tcon->retry = false;
- tcon->need_reconnect = false;
+ tcon->dummy = true;
/* now reconnect sessions for necessary channels */
list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) {
@@ -4446,10 +4461,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a RDMA write, fill in and append
- * smbd_buffer_descriptor_v1 to the end of read request
+ * smbdirect_buffer_descriptor_v1 to the end of read request
*/
if (rdata && smb3_use_rdma_offload(io_parms)) {
- struct smbd_buffer_descriptor_v1 *v1;
+ struct smbdirect_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
rdata->mr = smbd_register_mr(server->smbd_conn, &rdata->subreq.io_iter,
@@ -4463,8 +4478,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->ReadChannelInfoOffset =
cpu_to_le16(offsetof(struct smb2_read_req, Buffer));
req->ReadChannelInfoLength =
- cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
- v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+ cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
+ v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
v1->offset = cpu_to_le64(rdata->mr->mr->iova);
v1->token = cpu_to_le32(rdata->mr->mr->rkey);
v1->length = cpu_to_le32(rdata->mr->mr->length);
@@ -4550,7 +4565,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_req_submitted);
+ goto do_retry;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_retry_needed);
+do_retry:
__set_bit(NETFS_SREQ_NEED_RETRY, &rdata->subreq.flags);
rdata->result = -EAGAIN;
if (server->sign && rdata->got_bytes)
@@ -4561,11 +4580,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
cifs_stats_bytes_read(tcon, rdata->got_bytes);
break;
case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_malformed);
credits.value = le16_to_cpu(shdr->CreditRequest);
credits.instance = server->reconnect_instance;
- fallthrough;
+ rdata->result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_unknown);
rdata->result = -EIO;
+ break;
}
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
@@ -4818,11 +4841,14 @@ smb2_writev_callback(struct mid_q_entry *mid)
switch (mid->mid_state) {
case MID_RESPONSE_RECEIVED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
result = smb2_check_receive(mid, server, 0);
- if (result != 0)
+ if (result != 0) {
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_bad);
break;
+ }
written = le32_to_cpu(rsp->DataLength);
/*
@@ -4844,14 +4870,23 @@ smb2_writev_callback(struct mid_q_entry *mid)
}
break;
case MID_REQUEST_SUBMITTED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_req_submitted);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
+ result = -EAGAIN;
+ break;
case MID_RETRY_NEEDED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_retry_needed);
+ __set_bit(NETFS_SREQ_NEED_RETRY, &wdata->subreq.flags);
result = -EAGAIN;
break;
case MID_RESPONSE_MALFORMED:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_malformed);
credits.value = le16_to_cpu(rsp->hdr.CreditRequest);
credits.instance = server->reconnect_instance;
- fallthrough;
+ result = -EIO;
+ break;
default:
+ trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_unknown);
result = -EIO;
break;
}
@@ -4891,8 +4926,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
server->credits, server->in_flight,
0, cifs_trace_rw_credits_write_response_clear);
wdata->credits.value = 0;
- trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress);
- cifs_write_subrequest_terminated(wdata, result ?: written, true);
+ cifs_write_subrequest_terminated(wdata, result ?: written);
release_mid(mid);
trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0,
server->credits, server->in_flight,
@@ -4972,10 +5006,10 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
#ifdef CONFIG_CIFS_SMB_DIRECT
/*
* If we want to do a server RDMA read, fill in and append
- * smbd_buffer_descriptor_v1 to the end of write request
+ * smbdirect_buffer_descriptor_v1 to the end of write request
*/
if (smb3_use_rdma_offload(io_parms)) {
- struct smbd_buffer_descriptor_v1 *v1;
+ struct smbdirect_buffer_descriptor_v1 *v1;
bool need_invalidate = server->dialect == SMB30_PROT_ID;
wdata->mr = smbd_register_mr(server->smbd_conn, &wdata->subreq.io_iter,
@@ -4994,8 +5028,8 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
req->WriteChannelInfoOffset =
cpu_to_le16(offsetof(struct smb2_write_req, Buffer));
req->WriteChannelInfoLength =
- cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1));
- v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0];
+ cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
+ v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
v1->offset = cpu_to_le64(wdata->mr->mr->iova);
v1->token = cpu_to_le32(wdata->mr->mr->rkey);
v1->length = cpu_to_le32(wdata->mr->mr->length);
@@ -5065,7 +5099,7 @@ out:
-(int)wdata->credits.value,
cifs_trace_rw_credits_write_response_clear);
add_credits_and_wake_if(wdata->server, &wdata->credits, 0);
- cifs_write_subrequest_terminated(wdata, rc, true);
+ cifs_write_subrequest_terminated(wdata, rc);
}
}
@@ -5921,71 +5955,6 @@ posix_qfsinf_exit:
}
int
-SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata)
-{
- struct smb_rqst rqst;
- struct smb2_query_info_rsp *rsp = NULL;
- struct kvec iov;
- struct kvec rsp_iov;
- int rc = 0;
- int resp_buftype;
- struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server;
- struct smb2_fs_full_size_info *info = NULL;
- int flags = 0;
- int retries = 0, cur_sleep = 1;
-
-replay_again:
- /* reinitialize for possible replay */
- flags = 0;
- server = cifs_pick_channel(ses);
-
- rc = build_qfs_info_req(&iov, tcon, server,
- FS_FULL_SIZE_INFORMATION,
- sizeof(struct smb2_fs_full_size_info),
- persistent_fid, volatile_fid);
- if (rc)
- return rc;
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- memset(&rqst, 0, sizeof(struct smb_rqst));
- rqst.rq_iov = &iov;
- rqst.rq_nvec = 1;
-
- if (retries)
- smb2_set_replay(server, &rqst);
-
- rc = cifs_send_recv(xid, ses, server,
- &rqst, &resp_buftype, flags, &rsp_iov);
- free_qfs_info_req(&iov);
- if (rc) {
- cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
- goto qfsinf_exit;
- }
- rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
-
- info = (struct smb2_fs_full_size_info *)(
- le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp);
- rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset),
- le32_to_cpu(rsp->OutputBufferLength), &rsp_iov,
- sizeof(struct smb2_fs_full_size_info));
- if (!rc)
- smb2_copy_fs_info_to_kstatfs(info, fsdata);
-
-qfsinf_exit:
- free_rsp_buf(resp_buftype, rsp_iov.iov_base);
-
- if (is_replayable_error(rc) &&
- smb2_should_replay(tcon, &retries, &cur_sleep))
- goto replay_again;
-
- return rc;
-}
-
-int
SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, int level)
{
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 09349fa8da03..6e805ece6a7b 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -54,7 +54,7 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
-struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+struct inode *smb2_create_reparse_inode(struct cifs_open_info_data *data,
struct super_block *sb,
const unsigned int xid,
struct cifs_tcon *tcon,
@@ -111,8 +111,9 @@ extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const unsigned char *path, char *pbuf,
unsigned int *pbytes_read);
+int smb2_fix_symlink_target_type(char **target, bool directory, struct cifs_sb_info *cifs_sb);
int smb2_parse_native_symlink(char **target, const char *buf, unsigned int len,
- bool unicode, bool relative,
+ bool relative,
const char *full_path,
struct cifs_sb_info *cifs_sb);
int smb2_parse_symlink_response(struct cifs_sb_info *cifs_sb,
@@ -258,9 +259,6 @@ extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon,
__u64 volatile_fid);
extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server);
void smb2_cancelled_close_fid(struct work_struct *work);
-extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
- u64 persistent_file_id, u64 volatile_file_id,
- struct kstatfs *FSData);
extern int SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct kstatfs *FSData);
@@ -282,7 +280,7 @@ extern enum securityEnum smb2_select_sectype(struct TCP_Server_Info *,
enum securityEnum);
int smb2_parse_contexts(struct TCP_Server_Info *server,
struct kvec *rsp_iov,
- unsigned int *epoch,
+ __u16 *epoch,
char *lease_key, __u8 *oplock,
struct smb2_file_all_info *buf,
struct create_posix_rsp *posix);
@@ -316,9 +314,6 @@ int smb311_posix_query_path_info(const unsigned int xid,
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
-int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
- struct dentry *dentry, struct cifs_tcon *tcon,
- const char *full_path, const char *symname);
int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev);
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 475b36c27f65..bc0e92eb2b64 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -771,6 +771,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr,
temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
memset(temp, 0, sizeof(struct mid_q_entry));
kref_init(&temp->refcount);
+ spin_lock_init(&temp->mid_lock);
temp->mid = le64_to_cpu(shdr->MessageId);
temp->credits = credits > 0 ? credits : 1;
temp->pid = current->pid;
@@ -840,9 +841,9 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server,
*mid = smb2_mid_entry_alloc(shdr, server);
if (*mid == NULL)
return -ENOMEM;
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_add_tail(&(*mid)->qhead, &server->pending_mid_q);
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
return 0;
}
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index b0b7254661e9..02d6db431fd4 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -7,32 +7,29 @@
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/folio_queue.h>
+#include "../common/smbdirect/smbdirect_pdu.h"
#include "smbdirect.h"
#include "cifs_debug.h"
#include "cifsproto.h"
#include "smb2proto.h"
-static struct smbd_response *get_empty_queue_buffer(
- struct smbd_connection *info);
-static struct smbd_response *get_receive_buffer(
+static struct smbdirect_recv_io *get_receive_buffer(
struct smbd_connection *info);
static void put_receive_buffer(
struct smbd_connection *info,
- struct smbd_response *response);
+ struct smbdirect_recv_io *response);
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf);
static void destroy_receive_buffers(struct smbd_connection *info);
-static void put_empty_packet(
- struct smbd_connection *info, struct smbd_response *response);
static void enqueue_reassembly(
struct smbd_connection *info,
- struct smbd_response *response, int data_length);
-static struct smbd_response *_get_first_reassembly(
+ struct smbdirect_recv_io *response, int data_length);
+static struct smbdirect_recv_io *_get_first_reassembly(
struct smbd_connection *info);
static int smbd_post_recv(
struct smbd_connection *info,
- struct smbd_response *response);
+ struct smbdirect_recv_io *response);
static int smbd_post_send_empty(struct smbd_connection *info);
@@ -50,9 +47,6 @@ struct smb_extract_to_rdma {
static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len,
struct smb_extract_to_rdma *rdma);
-/* SMBD version number */
-#define SMBD_V1 0x0100
-
/* Port numbers for SMBD transport */
#define SMB_PORT 445
#define SMBD_PORT 5445
@@ -165,10 +159,11 @@ static void smbd_disconnect_rdma_work(struct work_struct *work)
{
struct smbd_connection *info =
container_of(work, struct smbd_connection, disconnect_work);
+ struct smbdirect_socket *sc = &info->socket;
- if (info->transport_status == SMBD_CONNECTED) {
- info->transport_status = SMBD_DISCONNECTING;
- rdma_disconnect(info->id);
+ if (sc->status == SMBDIRECT_SOCKET_CONNECTED) {
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
+ rdma_disconnect(sc->rdma.cm_id);
}
}
@@ -182,9 +177,11 @@ static int smbd_conn_upcall(
struct rdma_cm_id *id, struct rdma_cm_event *event)
{
struct smbd_connection *info = id->context;
+ struct smbdirect_socket *sc = &info->socket;
+ const char *event_name = rdma_event_msg(event->event);
- log_rdma_event(INFO, "event=%d status=%d\n",
- event->event, event->status);
+ log_rdma_event(INFO, "event=%s status=%d\n",
+ event_name, event->status);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -194,45 +191,50 @@ static int smbd_conn_upcall(
break;
case RDMA_CM_EVENT_ADDR_ERROR:
+ log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
info->ri_rc = -EHOSTUNREACH;
complete(&info->ri_done);
break;
case RDMA_CM_EVENT_ROUTE_ERROR:
+ log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
info->ri_rc = -ENETUNREACH;
complete(&info->ri_done);
break;
case RDMA_CM_EVENT_ESTABLISHED:
- log_rdma_event(INFO, "connected event=%d\n", event->event);
- info->transport_status = SMBD_CONNECTED;
- wake_up_interruptible(&info->conn_wait);
+ log_rdma_event(INFO, "connected event=%s\n", event_name);
+ sc->status = SMBDIRECT_SOCKET_CONNECTED;
+ wake_up_interruptible(&info->status_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
case RDMA_CM_EVENT_UNREACHABLE:
case RDMA_CM_EVENT_REJECTED:
- log_rdma_event(INFO, "connecting failed event=%d\n", event->event);
- info->transport_status = SMBD_DISCONNECTED;
- wake_up_interruptible(&info->conn_wait);
+ log_rdma_event(ERR, "connecting failed event=%s\n", event_name);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ wake_up_interruptible(&info->status_wait);
break;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
case RDMA_CM_EVENT_DISCONNECTED:
/* This happens when we fail the negotiation */
- if (info->transport_status == SMBD_NEGOTIATE_FAILED) {
- info->transport_status = SMBD_DISCONNECTED;
- wake_up(&info->conn_wait);
+ if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) {
+ log_rdma_event(ERR, "event=%s during negotiation\n", event_name);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ wake_up(&info->status_wait);
break;
}
- info->transport_status = SMBD_DISCONNECTED;
- wake_up_interruptible(&info->disconn_wait);
- wake_up_interruptible(&info->wait_reassembly_queue);
+ sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+ wake_up_interruptible(&info->status_wait);
+ wake_up_interruptible(&sc->recv_io.reassembly.wait_queue);
wake_up_interruptible_all(&info->wait_send_queue);
break;
default:
+ log_rdma_event(ERR, "unexpected event=%s status=%d\n",
+ event_name, event->status);
break;
}
@@ -259,12 +261,12 @@ smbd_qp_async_error_upcall(struct ib_event *event, void *context)
}
}
-static inline void *smbd_request_payload(struct smbd_request *request)
+static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request)
{
return (void *)request->packet;
}
-static inline void *smbd_response_payload(struct smbd_response *response)
+static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response)
{
return (void *)response->packet;
}
@@ -273,33 +275,38 @@ static inline void *smbd_response_payload(struct smbd_response *response)
static void send_done(struct ib_cq *cq, struct ib_wc *wc)
{
int i;
- struct smbd_request *request =
- container_of(wc->wr_cqe, struct smbd_request, cqe);
+ struct smbdirect_send_io *request =
+ container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+ struct smbdirect_socket *sc = request->socket;
+ struct smbd_connection *info =
+ container_of(sc, struct smbd_connection, socket);
- log_rdma_send(INFO, "smbd_request 0x%p completed wc->status=%d\n",
+ log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%d\n",
request, wc->status);
- if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
- log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
- wc->status, wc->opcode);
- smbd_disconnect_rdma_connection(request->info);
- }
-
for (i = 0; i < request->num_sge; i++)
- ib_dma_unmap_single(request->info->id->device,
+ ib_dma_unmap_single(sc->ib.dev,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
- if (atomic_dec_and_test(&request->info->send_pending))
- wake_up(&request->info->wait_send_pending);
+ if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) {
+ log_rdma_send(ERR, "wc->status=%d wc->opcode=%d\n",
+ wc->status, wc->opcode);
+ mempool_free(request, sc->send_io.mem.pool);
+ smbd_disconnect_rdma_connection(info);
+ return;
+ }
+
+ if (atomic_dec_and_test(&info->send_pending))
+ wake_up(&info->wait_send_pending);
- wake_up(&request->info->wait_post_send);
+ wake_up(&info->wait_post_send);
- mempool_free(request, request->info->request_mempool);
+ mempool_free(request, sc->send_io.mem.pool);
}
-static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
+static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp)
{
log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n",
resp->min_version, resp->max_version,
@@ -315,18 +322,21 @@ static void dump_smbd_negotiate_resp(struct smbd_negotiate_resp *resp)
* return value: true if negotiation is a success, false if failed
*/
static bool process_negotiation_response(
- struct smbd_response *response, int packet_length)
+ struct smbdirect_recv_io *response, int packet_length)
{
- struct smbd_connection *info = response->info;
- struct smbd_negotiate_resp *packet = smbd_response_payload(response);
+ struct smbdirect_socket *sc = response->socket;
+ struct smbd_connection *info =
+ container_of(sc, struct smbd_connection, socket);
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
+ struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response);
- if (packet_length < sizeof(struct smbd_negotiate_resp)) {
+ if (packet_length < sizeof(struct smbdirect_negotiate_resp)) {
log_rdma_event(ERR,
"error: packet_length=%d\n", packet_length);
return false;
}
- if (le16_to_cpu(packet->negotiated_version) != SMBD_V1) {
+ if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) {
log_rdma_event(ERR, "error: negotiated_version=%x\n",
le16_to_cpu(packet->negotiated_version));
return false;
@@ -347,20 +357,20 @@ static bool process_negotiation_response(
atomic_set(&info->receive_credits, 0);
- if (le32_to_cpu(packet->preferred_send_size) > info->max_receive_size) {
+ if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) {
log_rdma_event(ERR, "error: preferred_send_size=%d\n",
le32_to_cpu(packet->preferred_send_size));
return false;
}
- info->max_receive_size = le32_to_cpu(packet->preferred_send_size);
+ sp->max_recv_size = le32_to_cpu(packet->preferred_send_size);
if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) {
log_rdma_event(ERR, "error: max_receive_size=%d\n",
le32_to_cpu(packet->max_receive_size));
return false;
}
- info->max_send_size = min_t(int, info->max_send_size,
- le32_to_cpu(packet->max_receive_size));
+ sp->max_send_size = min_t(u32, sp->max_send_size,
+ le32_to_cpu(packet->max_receive_size));
if (le32_to_cpu(packet->max_fragmented_size) <
SMBD_MIN_FRAGMENTED_SIZE) {
@@ -368,33 +378,34 @@ static bool process_negotiation_response(
le32_to_cpu(packet->max_fragmented_size));
return false;
}
- info->max_fragmented_send_size =
+ sp->max_fragmented_send_size =
le32_to_cpu(packet->max_fragmented_size);
info->rdma_readwrite_threshold =
- rdma_readwrite_threshold > info->max_fragmented_send_size ?
- info->max_fragmented_send_size :
+ rdma_readwrite_threshold > sp->max_fragmented_send_size ?
+ sp->max_fragmented_send_size :
rdma_readwrite_threshold;
- info->max_readwrite_size = min_t(u32,
+ sp->max_read_write_size = min_t(u32,
le32_to_cpu(packet->max_readwrite_size),
info->max_frmr_depth * PAGE_SIZE);
- info->max_frmr_depth = info->max_readwrite_size / PAGE_SIZE;
+ info->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
+ sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
return true;
}
static void smbd_post_send_credits(struct work_struct *work)
{
int ret = 0;
- int use_receive_queue = 1;
int rc;
- struct smbd_response *response;
+ struct smbdirect_recv_io *response;
struct smbd_connection *info =
container_of(work, struct smbd_connection,
post_send_credits_work);
+ struct smbdirect_socket *sc = &info->socket;
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
wake_up(&info->wait_receive_queues);
return;
}
@@ -402,20 +413,10 @@ static void smbd_post_send_credits(struct work_struct *work)
if (info->receive_credit_target >
atomic_read(&info->receive_credits)) {
while (true) {
- if (use_receive_queue)
- response = get_receive_buffer(info);
- else
- response = get_empty_queue_buffer(info);
- if (!response) {
- /* now switch to empty packet queue */
- if (use_receive_queue) {
- use_receive_queue = 0;
- continue;
- } else
- break;
- }
+ response = get_receive_buffer(info);
+ if (!response)
+ break;
- response->type = SMBD_TRANSFER_DATA;
response->first_segment = false;
rc = smbd_post_recv(info, response);
if (rc) {
@@ -448,20 +449,21 @@ static void smbd_post_send_credits(struct work_struct *work)
/* Called from softirq, when recv is done */
static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smbd_data_transfer *data_transfer;
- struct smbd_response *response =
- container_of(wc->wr_cqe, struct smbd_response, cqe);
- struct smbd_connection *info = response->info;
+ struct smbdirect_data_transfer *data_transfer;
+ struct smbdirect_recv_io *response =
+ container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+ struct smbdirect_socket *sc = response->socket;
+ struct smbd_connection *info =
+ container_of(sc, struct smbd_connection, socket);
int data_length = 0;
log_rdma_recv(INFO, "response=0x%p type=%d wc status=%d wc opcode %d byte_len=%d pkey_index=%u\n",
- response, response->type, wc->status, wc->opcode,
+ response, sc->recv_io.expected, wc->status, wc->opcode,
wc->byte_len, wc->pkey_index);
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
log_rdma_recv(INFO, "wc->status=%d opcode=%d\n",
wc->status, wc->opcode);
- smbd_disconnect_rdma_connection(info);
goto error;
}
@@ -471,43 +473,31 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
response->sge.length,
DMA_FROM_DEVICE);
- switch (response->type) {
+ switch (sc->recv_io.expected) {
/* SMBD negotiation response */
- case SMBD_NEGOTIATE_RESP:
- dump_smbd_negotiate_resp(smbd_response_payload(response));
- info->full_packet_received = true;
+ case SMBDIRECT_EXPECT_NEGOTIATE_REP:
+ dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response));
+ sc->recv_io.reassembly.full_packet_received = true;
info->negotiate_done =
process_negotiation_response(response, wc->byte_len);
+ put_receive_buffer(info, response);
complete(&info->negotiate_completion);
- break;
+ return;
/* SMBD data transfer packet */
- case SMBD_TRANSFER_DATA:
- data_transfer = smbd_response_payload(response);
+ case SMBDIRECT_EXPECT_DATA_TRANSFER:
+ data_transfer = smbdirect_recv_io_payload(response);
data_length = le32_to_cpu(data_transfer->data_length);
- /*
- * If this is a packet with data playload place the data in
- * reassembly queue and wake up the reading thread
- */
if (data_length) {
- if (info->full_packet_received)
+ if (sc->recv_io.reassembly.full_packet_received)
response->first_segment = true;
if (le32_to_cpu(data_transfer->remaining_data_length))
- info->full_packet_received = false;
+ sc->recv_io.reassembly.full_packet_received = false;
else
- info->full_packet_received = true;
-
- enqueue_reassembly(
- info,
- response,
- data_length);
- } else
- put_empty_packet(info, response);
-
- if (data_length)
- wake_up_interruptible(&info->wait_reassembly_queue);
+ sc->recv_io.reassembly.full_packet_received = true;
+ }
atomic_dec(&info->receive_credits);
info->receive_credit_target =
@@ -531,19 +521,35 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
/* Send a KEEP_ALIVE response right away if requested */
info->keep_alive_requested = KEEP_ALIVE_NONE;
if (le16_to_cpu(data_transfer->flags) &
- SMB_DIRECT_RESPONSE_REQUESTED) {
+ SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
info->keep_alive_requested = KEEP_ALIVE_PENDING;
}
+ /*
+ * If this is a packet with data playload place the data in
+ * reassembly queue and wake up the reading thread
+ */
+ if (data_length) {
+ enqueue_reassembly(info, response, data_length);
+ wake_up_interruptible(&sc->recv_io.reassembly.wait_queue);
+ } else
+ put_receive_buffer(info, response);
+
return;
- default:
- log_rdma_recv(ERR,
- "unexpected response type=%d\n", response->type);
+ case SMBDIRECT_EXPECT_NEGOTIATE_REQ:
+ /* Only server... */
+ break;
}
+ /*
+ * This is an internal error!
+ */
+ log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected);
+ WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER);
error:
put_receive_buffer(info, response);
+ smbd_disconnect_rdma_connection(info);
}
static struct rdma_cm_id *smbd_create_id(
@@ -635,32 +641,34 @@ static int smbd_ia_open(
struct smbd_connection *info,
struct sockaddr *dstaddr, int port)
{
+ struct smbdirect_socket *sc = &info->socket;
int rc;
- info->id = smbd_create_id(info, dstaddr, port);
- if (IS_ERR(info->id)) {
- rc = PTR_ERR(info->id);
+ sc->rdma.cm_id = smbd_create_id(info, dstaddr, port);
+ if (IS_ERR(sc->rdma.cm_id)) {
+ rc = PTR_ERR(sc->rdma.cm_id);
goto out1;
}
+ sc->ib.dev = sc->rdma.cm_id->device;
- if (!frwr_is_supported(&info->id->device->attrs)) {
+ if (!frwr_is_supported(&sc->ib.dev->attrs)) {
log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n");
log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
- info->id->device->attrs.device_cap_flags,
- info->id->device->attrs.max_fast_reg_page_list_len);
+ sc->ib.dev->attrs.device_cap_flags,
+ sc->ib.dev->attrs.max_fast_reg_page_list_len);
rc = -EPROTONOSUPPORT;
goto out2;
}
info->max_frmr_depth = min_t(int,
smbd_max_frmr_depth,
- info->id->device->attrs.max_fast_reg_page_list_len);
+ sc->ib.dev->attrs.max_fast_reg_page_list_len);
info->mr_type = IB_MR_TYPE_MEM_REG;
- if (info->id->device->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
+ if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
info->mr_type = IB_MR_TYPE_SG_GAPS;
- info->pd = ib_alloc_pd(info->id->device, 0);
- if (IS_ERR(info->pd)) {
- rc = PTR_ERR(info->pd);
+ sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
+ if (IS_ERR(sc->ib.pd)) {
+ rc = PTR_ERR(sc->ib.pd);
log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc);
goto out2;
}
@@ -668,8 +676,8 @@ static int smbd_ia_open(
return 0;
out2:
- rdma_destroy_id(info->id);
- info->id = NULL;
+ rdma_destroy_id(sc->rdma.cm_id);
+ sc->rdma.cm_id = NULL;
out1:
return rc;
@@ -683,41 +691,43 @@ out1:
*/
static int smbd_post_send_negotiate_req(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc = -ENOMEM;
- struct smbd_request *request;
- struct smbd_negotiate_req *packet;
+ struct smbdirect_send_io *request;
+ struct smbdirect_negotiate_req *packet;
- request = mempool_alloc(info->request_mempool, GFP_KERNEL);
+ request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
if (!request)
return rc;
- request->info = info;
+ request->socket = sc;
- packet = smbd_request_payload(request);
- packet->min_version = cpu_to_le16(SMBD_V1);
- packet->max_version = cpu_to_le16(SMBD_V1);
+ packet = smbdirect_send_io_payload(request);
+ packet->min_version = cpu_to_le16(SMBDIRECT_V1);
+ packet->max_version = cpu_to_le16(SMBDIRECT_V1);
packet->reserved = 0;
- packet->credits_requested = cpu_to_le16(info->send_credit_target);
- packet->preferred_send_size = cpu_to_le32(info->max_send_size);
- packet->max_receive_size = cpu_to_le32(info->max_receive_size);
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
+ packet->preferred_send_size = cpu_to_le32(sp->max_send_size);
+ packet->max_receive_size = cpu_to_le32(sp->max_recv_size);
packet->max_fragmented_size =
- cpu_to_le32(info->max_fragmented_recv_size);
+ cpu_to_le32(sp->max_fragmented_recv_size);
request->num_sge = 1;
request->sge[0].addr = ib_dma_map_single(
- info->id->device, (void *)packet,
+ sc->ib.dev, (void *)packet,
sizeof(*packet), DMA_TO_DEVICE);
- if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
+ if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
rc = -EIO;
goto dma_mapping_failed;
}
request->sge[0].length = sizeof(*packet);
- request->sge[0].lkey = info->pd->local_dma_lkey;
+ request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
ib_dma_sync_single_for_device(
- info->id->device, request->sge[0].addr,
+ sc->ib.dev, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
request->cqe.done = send_done;
@@ -734,20 +744,20 @@ static int smbd_post_send_negotiate_req(struct smbd_connection *info)
request->sge[0].length, request->sge[0].lkey);
atomic_inc(&info->send_pending);
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (!rc)
return 0;
/* if we reach here, post send failed */
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
atomic_dec(&info->send_pending);
- ib_dma_unmap_single(info->id->device, request->sge[0].addr,
+ ib_dma_unmap_single(sc->ib.dev, request->sge[0].addr,
request->sge[0].length, DMA_TO_DEVICE);
smbd_disconnect_rdma_connection(info);
dma_mapping_failed:
- mempool_free(request, info->request_mempool);
+ mempool_free(request, sc->send_io.mem.pool);
return rc;
}
@@ -774,10 +784,10 @@ static int manage_credits_prior_sending(struct smbd_connection *info)
/*
* Check if we need to send a KEEP_ALIVE message
* The idle connection timer triggers a KEEP_ALIVE message when expires
- * SMB_DIRECT_RESPONSE_REQUESTED is set in the message flag to have peer send
+ * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flag to have peer send
* back a response.
* return value:
- * 1 if SMB_DIRECT_RESPONSE_REQUESTED needs to be set
+ * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set
* 0: otherwise
*/
static int manage_keep_alive_before_sending(struct smbd_connection *info)
@@ -791,8 +801,10 @@ static int manage_keep_alive_before_sending(struct smbd_connection *info)
/* Post the send request */
static int smbd_post_send(struct smbd_connection *info,
- struct smbd_request *request)
+ struct smbdirect_send_io *request)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_send_wr send_wr;
int rc, i;
@@ -801,7 +813,7 @@ static int smbd_post_send(struct smbd_connection *info,
"rdma_request sge[%d] addr=0x%llx length=%u\n",
i, request->sge[i].addr, request->sge[i].length);
ib_dma_sync_single_for_device(
- info->id->device,
+ sc->ib.dev,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
@@ -816,7 +828,7 @@ static int smbd_post_send(struct smbd_connection *info,
send_wr.opcode = IB_WR_SEND;
send_wr.send_flags = IB_SEND_SIGNALED;
- rc = ib_post_send(info->id->qp, &send_wr, NULL);
+ rc = ib_post_send(sc->ib.qp, &send_wr, NULL);
if (rc) {
log_rdma_send(ERR, "ib_post_send failed rc=%d\n", rc);
smbd_disconnect_rdma_connection(info);
@@ -824,7 +836,7 @@ static int smbd_post_send(struct smbd_connection *info,
} else
/* Reset timer for idle connection after packet is sent */
mod_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
+ msecs_to_jiffies(sp->keepalive_interval_msec));
return rc;
}
@@ -833,22 +845,24 @@ static int smbd_post_send_iter(struct smbd_connection *info,
struct iov_iter *iter,
int *_remaining_data_length)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
int i, rc;
int header_length;
int data_length;
- struct smbd_request *request;
- struct smbd_data_transfer *packet;
+ struct smbdirect_send_io *request;
+ struct smbdirect_data_transfer *packet;
int new_credits = 0;
wait_credit:
/* Wait for send credits. A SMBD packet needs one credit */
rc = wait_event_interruptible(info->wait_send_queue,
atomic_read(&info->send_credits) > 0 ||
- info->transport_status != SMBD_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc)
goto err_wait_credit;
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_outgoing(ERR, "disconnected not sending on wait_credit\n");
rc = -EAGAIN;
goto err_wait_credit;
@@ -860,42 +874,44 @@ wait_credit:
wait_send_queue:
wait_event(info->wait_post_send,
- atomic_read(&info->send_pending) < info->send_credit_target ||
- info->transport_status != SMBD_CONNECTED);
+ atomic_read(&info->send_pending) < sp->send_credit_target ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_outgoing(ERR, "disconnected not sending on wait_send_queue\n");
rc = -EAGAIN;
goto err_wait_send_queue;
}
if (unlikely(atomic_inc_return(&info->send_pending) >
- info->send_credit_target)) {
+ sp->send_credit_target)) {
atomic_dec(&info->send_pending);
goto wait_send_queue;
}
- request = mempool_alloc(info->request_mempool, GFP_KERNEL);
+ request = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL);
if (!request) {
rc = -ENOMEM;
goto err_alloc;
}
- request->info = info;
+ request->socket = sc;
memset(request->sge, 0, sizeof(request->sge));
/* Fill in the data payload to find out how much data we can add */
if (iter) {
struct smb_extract_to_rdma extract = {
.nr_sge = 1,
- .max_sge = SMBDIRECT_MAX_SEND_SGE,
+ .max_sge = SMBDIRECT_SEND_IO_MAX_SGE,
.sge = request->sge,
- .device = info->id->device,
- .local_dma_lkey = info->pd->local_dma_lkey,
+ .device = sc->ib.dev,
+ .local_dma_lkey = sc->ib.pd->local_dma_lkey,
.direction = DMA_TO_DEVICE,
};
+ size_t payload_len = umin(*_remaining_data_length,
+ sp->max_send_size - sizeof(*packet));
- rc = smb_extract_iter_to_rdma(iter, *_remaining_data_length,
+ rc = smb_extract_iter_to_rdma(iter, payload_len,
&extract);
if (rc < 0)
goto err_dma;
@@ -908,8 +924,8 @@ wait_send_queue:
}
/* Fill in the packet header */
- packet = smbd_request_payload(request);
- packet->credits_requested = cpu_to_le16(info->send_credit_target);
+ packet = smbdirect_send_io_payload(request);
+ packet->credits_requested = cpu_to_le16(sp->send_credit_target);
new_credits = manage_credits_prior_sending(info);
atomic_add(new_credits, &info->receive_credits);
@@ -919,7 +935,7 @@ wait_send_queue:
packet->flags = 0;
if (manage_keep_alive_before_sending(info))
- packet->flags |= cpu_to_le16(SMB_DIRECT_RESPONSE_REQUESTED);
+ packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
packet->reserved = 0;
if (!data_length)
@@ -938,23 +954,23 @@ wait_send_queue:
le32_to_cpu(packet->remaining_data_length));
/* Map the packet to DMA */
- header_length = sizeof(struct smbd_data_transfer);
+ header_length = sizeof(struct smbdirect_data_transfer);
/* If this is a packet without payload, don't send padding */
if (!data_length)
- header_length = offsetof(struct smbd_data_transfer, padding);
+ header_length = offsetof(struct smbdirect_data_transfer, padding);
- request->sge[0].addr = ib_dma_map_single(info->id->device,
+ request->sge[0].addr = ib_dma_map_single(sc->ib.dev,
(void *)packet,
header_length,
DMA_TO_DEVICE);
- if (ib_dma_mapping_error(info->id->device, request->sge[0].addr)) {
+ if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) {
rc = -EIO;
request->sge[0].addr = 0;
goto err_dma;
}
request->sge[0].length = header_length;
- request->sge[0].lkey = info->pd->local_dma_lkey;
+ request->sge[0].lkey = sc->ib.pd->local_dma_lkey;
rc = smbd_post_send(info, request);
if (!rc)
@@ -963,11 +979,11 @@ wait_send_queue:
err_dma:
for (i = 0; i < request->num_sge; i++)
if (request->sge[i].addr)
- ib_dma_unmap_single(info->id->device,
+ ib_dma_unmap_single(sc->ib.dev,
request->sge[i].addr,
request->sge[i].length,
DMA_TO_DEVICE);
- mempool_free(request, info->request_mempool);
+ mempool_free(request, sc->send_io.mem.pool);
/* roll back receive credits and credits to be offered */
spin_lock(&info->lock_new_credits_offered);
@@ -1000,25 +1016,48 @@ static int smbd_post_send_empty(struct smbd_connection *info)
return smbd_post_send_iter(info, NULL, &remaining_data_length);
}
+static int smbd_post_send_full_iter(struct smbd_connection *info,
+ struct iov_iter *iter,
+ int *_remaining_data_length)
+{
+ int rc = 0;
+
+ /*
+ * smbd_post_send_iter() respects the
+ * negotiated max_send_size, so we need to
+ * loop until the full iter is posted
+ */
+
+ while (iov_iter_count(iter) > 0) {
+ rc = smbd_post_send_iter(info, iter, _remaining_data_length);
+ if (rc < 0)
+ break;
+ }
+
+ return rc;
+}
+
/*
* Post a receive request to the transport
* The remote peer can only send data when a receive request is posted
* The interaction is controlled by send/receive credit system
*/
static int smbd_post_recv(
- struct smbd_connection *info, struct smbd_response *response)
+ struct smbd_connection *info, struct smbdirect_recv_io *response)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct ib_recv_wr recv_wr;
int rc = -EIO;
response->sge.addr = ib_dma_map_single(
- info->id->device, response->packet,
- info->max_receive_size, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(info->id->device, response->sge.addr))
+ sc->ib.dev, response->packet,
+ sp->max_recv_size, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr))
return rc;
- response->sge.length = info->max_receive_size;
- response->sge.lkey = info->pd->local_dma_lkey;
+ response->sge.length = sp->max_recv_size;
+ response->sge.lkey = sc->ib.pd->local_dma_lkey;
response->cqe.done = recv_done;
@@ -1027,10 +1066,11 @@ static int smbd_post_recv(
recv_wr.sg_list = &response->sge;
recv_wr.num_sge = 1;
- rc = ib_post_recv(info->id->qp, &recv_wr, NULL);
+ rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL);
if (rc) {
- ib_dma_unmap_single(info->id->device, response->sge.addr,
+ ib_dma_unmap_single(sc->ib.dev, response->sge.addr,
response->sge.length, DMA_FROM_DEVICE);
+ response->sge.length = 0;
smbd_disconnect_rdma_connection(info);
log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc);
}
@@ -1041,10 +1081,11 @@ static int smbd_post_recv(
/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */
static int smbd_negotiate(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
int rc;
- struct smbd_response *response = get_receive_buffer(info);
+ struct smbdirect_recv_io *response = get_receive_buffer(info);
- response->type = SMBD_NEGOTIATE_RESP;
+ sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
rc = smbd_post_recv(info, response);
log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n",
rc, response->sge.addr,
@@ -1075,17 +1116,6 @@ static int smbd_negotiate(struct smbd_connection *info)
return rc;
}
-static void put_empty_packet(
- struct smbd_connection *info, struct smbd_response *response)
-{
- spin_lock(&info->empty_packet_queue_lock);
- list_add_tail(&response->list, &info->empty_packet_queue);
- info->count_empty_packet_queue++;
- spin_unlock(&info->empty_packet_queue_lock);
-
- queue_work(info->workqueue, &info->post_send_credits_work);
-}
-
/*
* Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1
* This is a queue for reassembling upper layer payload and present to upper
@@ -1098,12 +1128,14 @@ static void put_empty_packet(
*/
static void enqueue_reassembly(
struct smbd_connection *info,
- struct smbd_response *response,
+ struct smbdirect_recv_io *response,
int data_length)
{
- spin_lock(&info->reassembly_queue_lock);
- list_add_tail(&response->list, &info->reassembly_queue);
- info->reassembly_queue_length++;
+ struct smbdirect_socket *sc = &info->socket;
+
+ spin_lock(&sc->recv_io.reassembly.lock);
+ list_add_tail(&response->list, &sc->recv_io.reassembly.list);
+ sc->recv_io.reassembly.queue_length++;
/*
* Make sure reassembly_data_length is updated after list and
* reassembly_queue_length are updated. On the dequeue side
@@ -1111,8 +1143,8 @@ static void enqueue_reassembly(
* if reassembly_queue_length and list is up to date
*/
virt_wmb();
- info->reassembly_data_length += data_length;
- spin_unlock(&info->reassembly_queue_lock);
+ sc->recv_io.reassembly.data_length += data_length;
+ spin_unlock(&sc->recv_io.reassembly.lock);
info->count_reassembly_queue++;
info->count_enqueue_reassembly_queue++;
}
@@ -1122,34 +1154,16 @@ static void enqueue_reassembly(
* Caller is responsible for locking
* return value: the first entry if any, NULL if queue is empty
*/
-static struct smbd_response *_get_first_reassembly(struct smbd_connection *info)
-{
- struct smbd_response *ret = NULL;
-
- if (!list_empty(&info->reassembly_queue)) {
- ret = list_first_entry(
- &info->reassembly_queue,
- struct smbd_response, list);
- }
- return ret;
-}
-
-static struct smbd_response *get_empty_queue_buffer(
- struct smbd_connection *info)
+static struct smbdirect_recv_io *_get_first_reassembly(struct smbd_connection *info)
{
- struct smbd_response *ret = NULL;
- unsigned long flags;
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_recv_io *ret = NULL;
- spin_lock_irqsave(&info->empty_packet_queue_lock, flags);
- if (!list_empty(&info->empty_packet_queue)) {
+ if (!list_empty(&sc->recv_io.reassembly.list)) {
ret = list_first_entry(
- &info->empty_packet_queue,
- struct smbd_response, list);
- list_del(&ret->list);
- info->count_empty_packet_queue--;
+ &sc->recv_io.reassembly.list,
+ struct smbdirect_recv_io, list);
}
- spin_unlock_irqrestore(&info->empty_packet_queue_lock, flags);
-
return ret;
}
@@ -1159,21 +1173,22 @@ static struct smbd_response *get_empty_queue_buffer(
* pre-allocated in advance.
* return value: the receive buffer, NULL if none is available
*/
-static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
+static struct smbdirect_recv_io *get_receive_buffer(struct smbd_connection *info)
{
- struct smbd_response *ret = NULL;
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_recv_io *ret = NULL;
unsigned long flags;
- spin_lock_irqsave(&info->receive_queue_lock, flags);
- if (!list_empty(&info->receive_queue)) {
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ if (!list_empty(&sc->recv_io.free.list)) {
ret = list_first_entry(
- &info->receive_queue,
- struct smbd_response, list);
+ &sc->recv_io.free.list,
+ struct smbdirect_recv_io, list);
list_del(&ret->list);
info->count_receive_queue--;
info->count_get_receive_buffer++;
}
- spin_unlock_irqrestore(&info->receive_queue_lock, flags);
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
return ret;
}
@@ -1185,18 +1200,24 @@ static struct smbd_response *get_receive_buffer(struct smbd_connection *info)
* receive buffer is returned.
*/
static void put_receive_buffer(
- struct smbd_connection *info, struct smbd_response *response)
+ struct smbd_connection *info, struct smbdirect_recv_io *response)
{
+ struct smbdirect_socket *sc = &info->socket;
unsigned long flags;
- ib_dma_unmap_single(info->id->device, response->sge.addr,
- response->sge.length, DMA_FROM_DEVICE);
+ if (likely(response->sge.length != 0)) {
+ ib_dma_unmap_single(sc->ib.dev,
+ response->sge.addr,
+ response->sge.length,
+ DMA_FROM_DEVICE);
+ response->sge.length = 0;
+ }
- spin_lock_irqsave(&info->receive_queue_lock, flags);
- list_add_tail(&response->list, &info->receive_queue);
+ spin_lock_irqsave(&sc->recv_io.free.lock, flags);
+ list_add_tail(&response->list, &sc->recv_io.free.list);
info->count_receive_queue++;
info->count_put_receive_buffer++;
- spin_unlock_irqrestore(&info->receive_queue_lock, flags);
+ spin_unlock_irqrestore(&sc->recv_io.free.lock, flags);
queue_work(info->workqueue, &info->post_send_credits_work);
}
@@ -1204,58 +1225,54 @@ static void put_receive_buffer(
/* Preallocate all receive buffer on transport establishment */
static int allocate_receive_buffers(struct smbd_connection *info, int num_buf)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_recv_io *response;
int i;
- struct smbd_response *response;
- INIT_LIST_HEAD(&info->reassembly_queue);
- spin_lock_init(&info->reassembly_queue_lock);
- info->reassembly_data_length = 0;
- info->reassembly_queue_length = 0;
+ INIT_LIST_HEAD(&sc->recv_io.reassembly.list);
+ spin_lock_init(&sc->recv_io.reassembly.lock);
+ sc->recv_io.reassembly.data_length = 0;
+ sc->recv_io.reassembly.queue_length = 0;
- INIT_LIST_HEAD(&info->receive_queue);
- spin_lock_init(&info->receive_queue_lock);
+ INIT_LIST_HEAD(&sc->recv_io.free.list);
+ spin_lock_init(&sc->recv_io.free.lock);
info->count_receive_queue = 0;
- INIT_LIST_HEAD(&info->empty_packet_queue);
- spin_lock_init(&info->empty_packet_queue_lock);
- info->count_empty_packet_queue = 0;
-
init_waitqueue_head(&info->wait_receive_queues);
for (i = 0; i < num_buf; i++) {
- response = mempool_alloc(info->response_mempool, GFP_KERNEL);
+ response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL);
if (!response)
goto allocate_failed;
- response->info = info;
- list_add_tail(&response->list, &info->receive_queue);
+ response->socket = sc;
+ response->sge.length = 0;
+ list_add_tail(&response->list, &sc->recv_io.free.list);
info->count_receive_queue++;
}
return 0;
allocate_failed:
- while (!list_empty(&info->receive_queue)) {
+ while (!list_empty(&sc->recv_io.free.list)) {
response = list_first_entry(
- &info->receive_queue,
- struct smbd_response, list);
+ &sc->recv_io.free.list,
+ struct smbdirect_recv_io, list);
list_del(&response->list);
info->count_receive_queue--;
- mempool_free(response, info->response_mempool);
+ mempool_free(response, sc->recv_io.mem.pool);
}
return -ENOMEM;
}
static void destroy_receive_buffers(struct smbd_connection *info)
{
- struct smbd_response *response;
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_recv_io *response;
while ((response = get_receive_buffer(info)))
- mempool_free(response, info->response_mempool);
-
- while ((response = get_empty_queue_buffer(info)))
- mempool_free(response, info->response_mempool);
+ mempool_free(response, sc->recv_io.mem.pool);
}
/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */
@@ -1264,6 +1281,8 @@ static void idle_connection_timer(struct work_struct *work)
struct smbd_connection *info = container_of(
work, struct smbd_connection,
idle_timer_work.work);
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
if (info->keep_alive_requested != KEEP_ALIVE_NONE) {
log_keep_alive(ERR,
@@ -1278,7 +1297,7 @@ static void idle_connection_timer(struct work_struct *work)
/* Setup the next idle timeout work */
queue_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
+ msecs_to_jiffies(sp->keepalive_interval_msec));
}
/*
@@ -1289,54 +1308,54 @@ static void idle_connection_timer(struct work_struct *work)
void smbd_destroy(struct TCP_Server_Info *server)
{
struct smbd_connection *info = server->smbd_conn;
- struct smbd_response *response;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
+ struct smbdirect_recv_io *response;
unsigned long flags;
if (!info) {
log_rdma_event(INFO, "rdma session already destroyed\n");
return;
}
+ sc = &info->socket;
+ sp = &sc->parameters;
log_rdma_event(INFO, "destroying rdma session\n");
- if (info->transport_status != SMBD_DISCONNECTED) {
- rdma_disconnect(server->smbd_conn->id);
+ if (sc->status != SMBDIRECT_SOCKET_DISCONNECTED) {
+ rdma_disconnect(sc->rdma.cm_id);
log_rdma_event(INFO, "wait for transport being disconnected\n");
wait_event_interruptible(
- info->disconn_wait,
- info->transport_status == SMBD_DISCONNECTED);
+ info->status_wait,
+ sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
}
log_rdma_event(INFO, "destroying qp\n");
- ib_drain_qp(info->id->qp);
- rdma_destroy_qp(info->id);
+ ib_drain_qp(sc->ib.qp);
+ rdma_destroy_qp(sc->rdma.cm_id);
+ sc->ib.qp = NULL;
log_rdma_event(INFO, "cancelling idle timer\n");
cancel_delayed_work_sync(&info->idle_timer_work);
- log_rdma_event(INFO, "wait for all send posted to IB to finish\n");
- wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0);
-
/* It's not possible for upper layer to get to reassembly */
log_rdma_event(INFO, "drain the reassembly queue\n");
do {
- spin_lock_irqsave(&info->reassembly_queue_lock, flags);
+ spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
response = _get_first_reassembly(info);
if (response) {
list_del(&response->list);
spin_unlock_irqrestore(
- &info->reassembly_queue_lock, flags);
+ &sc->recv_io.reassembly.lock, flags);
put_receive_buffer(info, response);
} else
spin_unlock_irqrestore(
- &info->reassembly_queue_lock, flags);
+ &sc->recv_io.reassembly.lock, flags);
} while (response);
- info->reassembly_data_length = 0;
+ sc->recv_io.reassembly.data_length = 0;
log_rdma_event(INFO, "free receive buffers\n");
wait_event(info->wait_receive_queues,
- info->count_receive_queue + info->count_empty_packet_queue
- == info->receive_credit_max);
+ info->count_receive_queue == sp->recv_credit_max);
destroy_receive_buffers(info);
/*
@@ -1355,19 +1374,19 @@ void smbd_destroy(struct TCP_Server_Info *server)
}
destroy_mr_list(info);
- ib_free_cq(info->send_cq);
- ib_free_cq(info->recv_cq);
- ib_dealloc_pd(info->pd);
- rdma_destroy_id(info->id);
+ ib_free_cq(sc->ib.send_cq);
+ ib_free_cq(sc->ib.recv_cq);
+ ib_dealloc_pd(sc->ib.pd);
+ rdma_destroy_id(sc->rdma.cm_id);
/* free mempools */
- mempool_destroy(info->request_mempool);
- kmem_cache_destroy(info->request_cache);
+ mempool_destroy(sc->send_io.mem.pool);
+ kmem_cache_destroy(sc->send_io.mem.cache);
- mempool_destroy(info->response_mempool);
- kmem_cache_destroy(info->response_cache);
+ mempool_destroy(sc->recv_io.mem.pool);
+ kmem_cache_destroy(sc->recv_io.mem.cache);
- info->transport_status = SMBD_DESTROYED;
+ sc->status = SMBDIRECT_SOCKET_DESTROYED;
destroy_workqueue(info->workqueue);
log_rdma_event(INFO, "rdma session destroyed\n");
@@ -1392,7 +1411,7 @@ int smbd_reconnect(struct TCP_Server_Info *server)
* This is possible if transport is disconnected and we haven't received
* notification from RDMA, but upper layer has detected timeout
*/
- if (server->smbd_conn->transport_status == SMBD_CONNECTED) {
+ if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) {
log_rdma_event(INFO, "disconnecting transport\n");
smbd_destroy(server);
}
@@ -1413,50 +1432,62 @@ create_conn:
static void destroy_caches_and_workqueue(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
+
destroy_receive_buffers(info);
destroy_workqueue(info->workqueue);
- mempool_destroy(info->response_mempool);
- kmem_cache_destroy(info->response_cache);
- mempool_destroy(info->request_mempool);
- kmem_cache_destroy(info->request_cache);
+ mempool_destroy(sc->recv_io.mem.pool);
+ kmem_cache_destroy(sc->recv_io.mem.cache);
+ mempool_destroy(sc->send_io.mem.pool);
+ kmem_cache_destroy(sc->send_io.mem.cache);
}
#define MAX_NAME_LEN 80
static int allocate_caches_and_workqueue(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
char name[MAX_NAME_LEN];
int rc;
- scnprintf(name, MAX_NAME_LEN, "smbd_request_%p", info);
- info->request_cache =
+ if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer)))
+ return -ENOMEM;
+
+ scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", info);
+ sc->send_io.mem.cache =
kmem_cache_create(
name,
- sizeof(struct smbd_request) +
- sizeof(struct smbd_data_transfer),
+ sizeof(struct smbdirect_send_io) +
+ sizeof(struct smbdirect_data_transfer),
0, SLAB_HWCACHE_ALIGN, NULL);
- if (!info->request_cache)
+ if (!sc->send_io.mem.cache)
return -ENOMEM;
- info->request_mempool =
- mempool_create(info->send_credit_target, mempool_alloc_slab,
- mempool_free_slab, info->request_cache);
- if (!info->request_mempool)
+ sc->send_io.mem.pool =
+ mempool_create(sp->send_credit_target, mempool_alloc_slab,
+ mempool_free_slab, sc->send_io.mem.cache);
+ if (!sc->send_io.mem.pool)
goto out1;
- scnprintf(name, MAX_NAME_LEN, "smbd_response_%p", info);
- info->response_cache =
- kmem_cache_create(
- name,
- sizeof(struct smbd_response) +
- info->max_receive_size,
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!info->response_cache)
+ scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", info);
+
+ struct kmem_cache_args response_args = {
+ .align = __alignof__(struct smbdirect_recv_io),
+ .useroffset = (offsetof(struct smbdirect_recv_io, packet) +
+ sizeof(struct smbdirect_data_transfer)),
+ .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer),
+ };
+ sc->recv_io.mem.cache =
+ kmem_cache_create(name,
+ sizeof(struct smbdirect_recv_io) + sp->max_recv_size,
+ &response_args, SLAB_HWCACHE_ALIGN);
+ if (!sc->recv_io.mem.cache)
goto out2;
- info->response_mempool =
- mempool_create(info->receive_credit_max, mempool_alloc_slab,
- mempool_free_slab, info->response_cache);
- if (!info->response_mempool)
+ sc->recv_io.mem.pool =
+ mempool_create(sp->recv_credit_max, mempool_alloc_slab,
+ mempool_free_slab, sc->recv_io.mem.cache);
+ if (!sc->recv_io.mem.pool)
goto out3;
scnprintf(name, MAX_NAME_LEN, "smbd_%p", info);
@@ -1464,7 +1495,7 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
if (!info->workqueue)
goto out4;
- rc = allocate_receive_buffers(info, info->receive_credit_max);
+ rc = allocate_receive_buffers(info, sp->recv_credit_max);
if (rc) {
log_rdma_event(ERR, "failed to allocate receive buffers\n");
goto out5;
@@ -1475,13 +1506,13 @@ static int allocate_caches_and_workqueue(struct smbd_connection *info)
out5:
destroy_workqueue(info->workqueue);
out4:
- mempool_destroy(info->response_mempool);
+ mempool_destroy(sc->recv_io.mem.pool);
out3:
- kmem_cache_destroy(info->response_cache);
+ kmem_cache_destroy(sc->recv_io.mem.cache);
out2:
- mempool_destroy(info->request_mempool);
+ mempool_destroy(sc->send_io.mem.pool);
out1:
- kmem_cache_destroy(info->request_cache);
+ kmem_cache_destroy(sc->send_io.mem.cache);
return -ENOMEM;
}
@@ -1491,6 +1522,8 @@ static struct smbd_connection *_smbd_get_connection(
{
int rc;
struct smbd_connection *info;
+ struct smbdirect_socket *sc;
+ struct smbdirect_socket_parameters *sp;
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr;
@@ -1500,101 +1533,102 @@ static struct smbd_connection *_smbd_get_connection(
info = kzalloc(sizeof(struct smbd_connection), GFP_KERNEL);
if (!info)
return NULL;
+ sc = &info->socket;
+ sp = &sc->parameters;
- info->transport_status = SMBD_CONNECTING;
+ sc->status = SMBDIRECT_SOCKET_CONNECTING;
rc = smbd_ia_open(info, dstaddr, port);
if (rc) {
log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc);
goto create_id_failed;
}
- if (smbd_send_credit_target > info->id->device->attrs.max_cqe ||
- smbd_send_credit_target > info->id->device->attrs.max_qp_wr) {
+ if (smbd_send_credit_target > sc->ib.dev->attrs.max_cqe ||
+ smbd_send_credit_target > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering send_credit_target = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_send_credit_target,
- info->id->device->attrs.max_cqe,
- info->id->device->attrs.max_qp_wr);
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- if (smbd_receive_credit_max > info->id->device->attrs.max_cqe ||
- smbd_receive_credit_max > info->id->device->attrs.max_qp_wr) {
+ if (smbd_receive_credit_max > sc->ib.dev->attrs.max_cqe ||
+ smbd_receive_credit_max > sc->ib.dev->attrs.max_qp_wr) {
log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n",
smbd_receive_credit_max,
- info->id->device->attrs.max_cqe,
- info->id->device->attrs.max_qp_wr);
+ sc->ib.dev->attrs.max_cqe,
+ sc->ib.dev->attrs.max_qp_wr);
goto config_failed;
}
- info->receive_credit_max = smbd_receive_credit_max;
- info->send_credit_target = smbd_send_credit_target;
- info->max_send_size = smbd_max_send_size;
- info->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
- info->max_receive_size = smbd_max_receive_size;
- info->keep_alive_interval = smbd_keep_alive_interval;
+ sp->recv_credit_max = smbd_receive_credit_max;
+ sp->send_credit_target = smbd_send_credit_target;
+ sp->max_send_size = smbd_max_send_size;
+ sp->max_fragmented_recv_size = smbd_max_fragmented_recv_size;
+ sp->max_recv_size = smbd_max_receive_size;
+ sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000;
- if (info->id->device->attrs.max_send_sge < SMBDIRECT_MAX_SEND_SGE ||
- info->id->device->attrs.max_recv_sge < SMBDIRECT_MAX_RECV_SGE) {
+ if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE ||
+ sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) {
log_rdma_event(ERR,
"device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
IB_DEVICE_NAME_MAX,
- info->id->device->name,
- info->id->device->attrs.max_send_sge,
- info->id->device->attrs.max_recv_sge);
+ sc->ib.dev->name,
+ sc->ib.dev->attrs.max_send_sge,
+ sc->ib.dev->attrs.max_recv_sge);
goto config_failed;
}
- info->send_cq = NULL;
- info->recv_cq = NULL;
- info->send_cq =
- ib_alloc_cq_any(info->id->device, info,
- info->send_credit_target, IB_POLL_SOFTIRQ);
- if (IS_ERR(info->send_cq)) {
- info->send_cq = NULL;
+ sc->ib.send_cq =
+ ib_alloc_cq_any(sc->ib.dev, info,
+ sp->send_credit_target, IB_POLL_SOFTIRQ);
+ if (IS_ERR(sc->ib.send_cq)) {
+ sc->ib.send_cq = NULL;
goto alloc_cq_failed;
}
- info->recv_cq =
- ib_alloc_cq_any(info->id->device, info,
- info->receive_credit_max, IB_POLL_SOFTIRQ);
- if (IS_ERR(info->recv_cq)) {
- info->recv_cq = NULL;
+ sc->ib.recv_cq =
+ ib_alloc_cq_any(sc->ib.dev, info,
+ sp->recv_credit_max, IB_POLL_SOFTIRQ);
+ if (IS_ERR(sc->ib.recv_cq)) {
+ sc->ib.recv_cq = NULL;
goto alloc_cq_failed;
}
memset(&qp_attr, 0, sizeof(qp_attr));
qp_attr.event_handler = smbd_qp_async_error_upcall;
qp_attr.qp_context = info;
- qp_attr.cap.max_send_wr = info->send_credit_target;
- qp_attr.cap.max_recv_wr = info->receive_credit_max;
- qp_attr.cap.max_send_sge = SMBDIRECT_MAX_SEND_SGE;
- qp_attr.cap.max_recv_sge = SMBDIRECT_MAX_RECV_SGE;
+ qp_attr.cap.max_send_wr = sp->send_credit_target;
+ qp_attr.cap.max_recv_wr = sp->recv_credit_max;
+ qp_attr.cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+ qp_attr.cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
qp_attr.cap.max_inline_data = 0;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
qp_attr.qp_type = IB_QPT_RC;
- qp_attr.send_cq = info->send_cq;
- qp_attr.recv_cq = info->recv_cq;
+ qp_attr.send_cq = sc->ib.send_cq;
+ qp_attr.recv_cq = sc->ib.recv_cq;
qp_attr.port_num = ~0;
- rc = rdma_create_qp(info->id, info->pd, &qp_attr);
+ rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
if (rc) {
log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc);
goto create_qp_failed;
}
+ sc->ib.qp = sc->rdma.cm_id->qp;
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = 0;
conn_param.responder_resources =
- min(info->id->device->attrs.max_qp_rd_atom,
+ min(sc->ib.dev->attrs.max_qp_rd_atom,
SMBD_CM_RESPONDER_RESOURCES);
info->responder_resources = conn_param.responder_resources;
log_rdma_mr(INFO, "responder_resources=%d\n",
info->responder_resources);
/* Need to send IRD/ORD in private data for iWARP */
- info->id->device->ops.get_port_immutable(
- info->id->device, info->id->port_num, &port_immutable);
+ sc->ib.dev->ops.get_port_immutable(
+ sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable);
if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) {
ird_ord_hdr[0] = info->responder_resources;
ird_ord_hdr[1] = 1;
@@ -1612,19 +1646,20 @@ static struct smbd_connection *_smbd_get_connection(
log_rdma_event(INFO, "connecting to IP %pI4 port %d\n",
&addr_in->sin_addr, port);
- init_waitqueue_head(&info->conn_wait);
- init_waitqueue_head(&info->disconn_wait);
- init_waitqueue_head(&info->wait_reassembly_queue);
- rc = rdma_connect(info->id, &conn_param);
+ init_waitqueue_head(&info->status_wait);
+ init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
+ rc = rdma_connect(sc->rdma.cm_id, &conn_param);
if (rc) {
log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc);
goto rdma_connect_failed;
}
- wait_event_interruptible(
- info->conn_wait, info->transport_status != SMBD_CONNECTING);
+ wait_event_interruptible_timeout(
+ info->status_wait,
+ sc->status != SMBDIRECT_SOCKET_CONNECTING,
+ msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_rdma_event(ERR, "rdma_connect failed port=%d\n", port);
goto rdma_connect_failed;
}
@@ -1640,7 +1675,7 @@ static struct smbd_connection *_smbd_get_connection(
init_waitqueue_head(&info->wait_send_queue);
INIT_DELAYED_WORK(&info->idle_timer_work, idle_connection_timer);
queue_delayed_work(info->workqueue, &info->idle_timer_work,
- info->keep_alive_interval*HZ);
+ msecs_to_jiffies(sp->keepalive_interval_msec));
init_waitqueue_head(&info->wait_send_pending);
atomic_set(&info->send_pending, 0);
@@ -1675,26 +1710,25 @@ allocate_mr_failed:
negotiation_failed:
cancel_delayed_work_sync(&info->idle_timer_work);
destroy_caches_and_workqueue(info);
- info->transport_status = SMBD_NEGOTIATE_FAILED;
- init_waitqueue_head(&info->conn_wait);
- rdma_disconnect(info->id);
- wait_event(info->conn_wait,
- info->transport_status == SMBD_DISCONNECTED);
+ sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+ rdma_disconnect(sc->rdma.cm_id);
+ wait_event(info->status_wait,
+ sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
allocate_cache_failed:
rdma_connect_failed:
- rdma_destroy_qp(info->id);
+ rdma_destroy_qp(sc->rdma.cm_id);
create_qp_failed:
alloc_cq_failed:
- if (info->send_cq)
- ib_free_cq(info->send_cq);
- if (info->recv_cq)
- ib_free_cq(info->recv_cq);
+ if (sc->ib.send_cq)
+ ib_free_cq(sc->ib.send_cq);
+ if (sc->ib.recv_cq)
+ ib_free_cq(sc->ib.recv_cq);
config_failed:
- ib_dealloc_pd(info->pd);
- rdma_destroy_id(info->id);
+ ib_dealloc_pd(sc->ib.pd);
+ rdma_destroy_id(sc->rdma.cm_id);
create_id_failed:
kfree(info);
@@ -1719,36 +1753,41 @@ try_again:
}
/*
- * Receive data from receive reassembly queue
+ * Receive data from the transport's receive reassembly queue
* All the incoming data packets are placed in reassembly queue
- * buf: the buffer to read data into
+ * iter: the buffer to read data into
* size: the length of data to read
* return value: actual data read
- * Note: this implementation copies the data from reassebmly queue to receive
+ *
+ * Note: this implementation copies the data from reassembly queue to receive
* buffers used by upper layer. This is not the optimal code path. A better way
* to do it is to not have upper layer allocate its receive buffers but rather
* borrow the buffer from reassembly queue, and return it after data is
* consumed. But this will require more changes to upper layer code, and also
* need to consider packet boundaries while they still being reassembled.
*/
-static int smbd_recv_buf(struct smbd_connection *info, char *buf,
- unsigned int size)
+int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
{
- struct smbd_response *response;
- struct smbd_data_transfer *data_transfer;
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_recv_io *response;
+ struct smbdirect_data_transfer *data_transfer;
+ size_t size = iov_iter_count(&msg->msg_iter);
int to_copy, to_read, data_read, offset;
u32 data_length, remaining_data_length, data_offset;
int rc;
+ if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE))
+ return -EINVAL; /* It's a bug in upper layer to get there */
+
again:
/*
* No need to hold the reassembly queue lock all the time as we are
* the only one reading from the front of the queue. The transport
* may add more entries to the back of the queue at the same time
*/
- log_read(INFO, "size=%d info->reassembly_data_length=%d\n", size,
- info->reassembly_data_length);
- if (info->reassembly_data_length >= size) {
+ log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size,
+ sc->recv_io.reassembly.data_length);
+ if (sc->recv_io.reassembly.data_length >= size) {
int queue_length;
int queue_removed = 0;
@@ -1760,13 +1799,13 @@ again:
* updated in SOFTIRQ as more data is received
*/
virt_rmb();
- queue_length = info->reassembly_queue_length;
+ queue_length = sc->recv_io.reassembly.queue_length;
data_read = 0;
to_read = size;
- offset = info->first_entry_offset;
+ offset = sc->recv_io.reassembly.first_entry_offset;
while (data_read < size) {
response = _get_first_reassembly(info);
- data_transfer = smbd_response_payload(response);
+ data_transfer = smbdirect_recv_io_payload(response);
data_length = le32_to_cpu(data_transfer->data_length);
remaining_data_length =
le32_to_cpu(
@@ -1784,7 +1823,10 @@ again:
if (response->first_segment && size == 4) {
unsigned int rfc1002_len =
data_length + remaining_data_length;
- *((__be32 *)buf) = cpu_to_be32(rfc1002_len);
+ __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
+ if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
+ &msg->msg_iter) != sizeof(rfc1002_hdr))
+ return -EFAULT;
data_read = 4;
response->first_segment = false;
log_read(INFO, "returning rfc1002 length %d\n",
@@ -1793,10 +1835,9 @@ again:
}
to_copy = min_t(int, data_length - offset, to_read);
- memcpy(
- buf + data_read,
- (char *)data_transfer + data_offset + offset,
- to_copy);
+ if (copy_to_iter((char *)data_transfer + data_offset + offset,
+ to_copy, &msg->msg_iter) != to_copy)
+ return -EFAULT;
/* move on to the next buffer? */
if (to_copy == data_length - offset) {
@@ -1809,10 +1850,10 @@ again:
list_del(&response->list);
else {
spin_lock_irq(
- &info->reassembly_queue_lock);
+ &sc->recv_io.reassembly.lock);
list_del(&response->list);
spin_unlock_irq(
- &info->reassembly_queue_lock);
+ &sc->recv_io.reassembly.lock);
}
queue_removed++;
info->count_reassembly_queue--;
@@ -1831,29 +1872,29 @@ again:
to_read, data_read, offset);
}
- spin_lock_irq(&info->reassembly_queue_lock);
- info->reassembly_data_length -= data_read;
- info->reassembly_queue_length -= queue_removed;
- spin_unlock_irq(&info->reassembly_queue_lock);
+ spin_lock_irq(&sc->recv_io.reassembly.lock);
+ sc->recv_io.reassembly.data_length -= data_read;
+ sc->recv_io.reassembly.queue_length -= queue_removed;
+ spin_unlock_irq(&sc->recv_io.reassembly.lock);
- info->first_entry_offset = offset;
+ sc->recv_io.reassembly.first_entry_offset = offset;
log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n",
- data_read, info->reassembly_data_length,
- info->first_entry_offset);
+ data_read, sc->recv_io.reassembly.data_length,
+ sc->recv_io.reassembly.first_entry_offset);
read_rfc1002_done:
return data_read;
}
log_read(INFO, "wait_event on more data\n");
rc = wait_event_interruptible(
- info->wait_reassembly_queue,
- info->reassembly_data_length >= size ||
- info->transport_status != SMBD_CONNECTED);
+ sc->recv_io.reassembly.wait_queue,
+ sc->recv_io.reassembly.data_length >= size ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
/* Don't return any data if interrupted */
if (rc)
return rc;
- if (info->transport_status != SMBD_CONNECTED) {
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
log_read(ERR, "disconnected\n");
return -ECONNABORTED;
}
@@ -1862,89 +1903,6 @@ read_rfc1002_done:
}
/*
- * Receive a page from receive reassembly queue
- * page: the page to read data into
- * to_read: the length of data to read
- * return value: actual data read
- */
-static int smbd_recv_page(struct smbd_connection *info,
- struct page *page, unsigned int page_offset,
- unsigned int to_read)
-{
- int ret;
- char *to_address;
- void *page_address;
-
- /* make sure we have the page ready for read */
- ret = wait_event_interruptible(
- info->wait_reassembly_queue,
- info->reassembly_data_length >= to_read ||
- info->transport_status != SMBD_CONNECTED);
- if (ret)
- return ret;
-
- /* now we can read from reassembly queue and not sleep */
- page_address = kmap_atomic(page);
- to_address = (char *) page_address + page_offset;
-
- log_read(INFO, "reading from page=%p address=%p to_read=%d\n",
- page, to_address, to_read);
-
- ret = smbd_recv_buf(info, to_address, to_read);
- kunmap_atomic(page_address);
-
- return ret;
-}
-
-/*
- * Receive data from transport
- * msg: a msghdr point to the buffer, can be ITER_KVEC or ITER_BVEC
- * return: total bytes read, or 0. SMB Direct will not do partial read.
- */
-int smbd_recv(struct smbd_connection *info, struct msghdr *msg)
-{
- char *buf;
- struct page *page;
- unsigned int to_read, page_offset;
- int rc;
-
- if (iov_iter_rw(&msg->msg_iter) == WRITE) {
- /* It's a bug in upper layer to get there */
- cifs_dbg(VFS, "Invalid msg iter dir %u\n",
- iov_iter_rw(&msg->msg_iter));
- rc = -EINVAL;
- goto out;
- }
-
- switch (iov_iter_type(&msg->msg_iter)) {
- case ITER_KVEC:
- buf = msg->msg_iter.kvec->iov_base;
- to_read = msg->msg_iter.kvec->iov_len;
- rc = smbd_recv_buf(info, buf, to_read);
- break;
-
- case ITER_BVEC:
- page = msg->msg_iter.bvec->bv_page;
- page_offset = msg->msg_iter.bvec->bv_offset;
- to_read = msg->msg_iter.bvec->bv_len;
- rc = smbd_recv_page(info, page, page_offset, to_read);
- break;
-
- default:
- /* It's a bug in upper layer to get there */
- cifs_dbg(VFS, "Invalid msg type %d\n",
- iov_iter_type(&msg->msg_iter));
- rc = -EINVAL;
- }
-
-out:
- /* SMBDirect will read it all or nothing */
- if (rc > 0)
- msg->msg_iter.count = 0;
- return rc;
-}
-
-/*
* Send data to transport
* Each rqst is transported as a SMBDirect payload
* rqst: the data to write
@@ -1954,12 +1912,14 @@ int smbd_send(struct TCP_Server_Info *server,
int num_rqst, struct smb_rqst *rqst_array)
{
struct smbd_connection *info = server->smbd_conn;
+ struct smbdirect_socket *sc = &info->socket;
+ struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smb_rqst *rqst;
struct iov_iter iter;
unsigned int remaining_data_length, klen;
int rc, i, rqst_idx;
- if (info->transport_status != SMBD_CONNECTED)
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -EAGAIN;
/*
@@ -1971,10 +1931,10 @@ int smbd_send(struct TCP_Server_Info *server,
for (i = 0; i < num_rqst; i++)
remaining_data_length += smb_rqst_len(server, &rqst_array[i]);
- if (unlikely(remaining_data_length > info->max_fragmented_send_size)) {
+ if (unlikely(remaining_data_length > sp->max_fragmented_send_size)) {
/* assertion: payload never exceeds negotiated maximum */
log_write(ERR, "payload size %d > max size %d\n",
- remaining_data_length, info->max_fragmented_send_size);
+ remaining_data_length, sp->max_fragmented_send_size);
return -EINVAL;
}
@@ -2000,14 +1960,14 @@ int smbd_send(struct TCP_Server_Info *server,
klen += rqst->rq_iov[i].iov_len;
iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen);
- rc = smbd_post_send_iter(info, &iter, &remaining_data_length);
+ rc = smbd_post_send_full_iter(info, &iter, &remaining_data_length);
if (rc < 0)
break;
if (iov_iter_count(&rqst->rq_iter) > 0) {
/* And then the data pages if there are any */
- rc = smbd_post_send_iter(info, &rqst->rq_iter,
- &remaining_data_length);
+ rc = smbd_post_send_full_iter(info, &rqst->rq_iter,
+ &remaining_data_length);
if (rc < 0)
break;
}
@@ -2022,7 +1982,11 @@ int smbd_send(struct TCP_Server_Info *server,
*/
wait_event(info->wait_send_pending,
- atomic_read(&info->send_pending) == 0);
+ atomic_read(&info->send_pending) == 0 ||
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
+
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0)
+ rc = -EAGAIN;
return rc;
}
@@ -2053,6 +2017,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
{
struct smbd_connection *info =
container_of(work, struct smbd_connection, mr_recovery_work);
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *smbdirect_mr;
int rc;
@@ -2070,7 +2035,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
}
smbdirect_mr->mr = ib_alloc_mr(
- info->pd, info->mr_type,
+ sc->ib.pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
@@ -2099,12 +2064,13 @@ static void smbd_mr_recovery_work(struct work_struct *work)
static void destroy_mr_list(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *mr, *tmp;
cancel_work_sync(&info->mr_recovery_work);
list_for_each_entry_safe(mr, tmp, &info->mr_list, list) {
if (mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(info->id->device, mr->sgt.sgl,
+ ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl,
mr->sgt.nents, mr->dir);
ib_dereg_mr(mr->mr);
kfree(mr->sgt.sgl);
@@ -2121,6 +2087,7 @@ static void destroy_mr_list(struct smbd_connection *info)
*/
static int allocate_mr_list(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
int i;
struct smbd_mr *smbdirect_mr, *tmp;
@@ -2136,7 +2103,7 @@ static int allocate_mr_list(struct smbd_connection *info)
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
goto cleanup_entries;
- smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
+ smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n",
@@ -2181,20 +2148,20 @@ cleanup_entries:
*/
static struct smbd_mr *get_mr(struct smbd_connection *info)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *ret;
int rc;
again:
rc = wait_event_interruptible(info->wait_mr,
atomic_read(&info->mr_ready_count) ||
- info->transport_status != SMBD_CONNECTED);
+ sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (rc) {
log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc);
return NULL;
}
- if (info->transport_status != SMBD_CONNECTED) {
- log_rdma_mr(ERR, "info->transport_status=%x\n",
- info->transport_status);
+ if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+ log_rdma_mr(ERR, "sc->status=%x\n", sc->status);
return NULL;
}
@@ -2247,6 +2214,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
struct iov_iter *iter,
bool writing, bool need_invalidate)
{
+ struct smbdirect_socket *sc = &info->socket;
struct smbd_mr *smbdirect_mr;
int rc, num_pages;
enum dma_data_direction dir;
@@ -2276,7 +2244,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
num_pages, iov_iter_count(iter), info->max_frmr_depth);
smbd_iter_to_mr(info, iter, &smbdirect_mr->sgt, info->max_frmr_depth);
- rc = ib_dma_map_sg(info->id->device, smbdirect_mr->sgt.sgl,
+ rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, dir);
if (!rc) {
log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n",
@@ -2312,7 +2280,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
* on IB_WR_REG_MR. Hardware enforces a barrier and order of execution
* on the next ib_post_send when we actually send I/O to remote peer
*/
- rc = ib_post_send(info->id->qp, &reg_wr->wr, NULL);
+ rc = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
if (!rc)
return smbdirect_mr;
@@ -2321,7 +2289,7 @@ struct smbd_mr *smbd_register_mr(struct smbd_connection *info,
/* If all failed, attempt to recover this MR by setting it MR_ERROR*/
map_mr_error:
- ib_dma_unmap_sg(info->id->device, smbdirect_mr->sgt.sgl,
+ ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents, smbdirect_mr->dir);
dma_map_error:
@@ -2359,6 +2327,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
{
struct ib_send_wr *wr;
struct smbd_connection *info = smbdirect_mr->conn;
+ struct smbdirect_socket *sc = &info->socket;
int rc = 0;
if (smbdirect_mr->need_invalidate) {
@@ -2372,7 +2341,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
wr->send_flags = IB_SEND_SIGNALED;
init_completion(&smbdirect_mr->invalidate_done);
- rc = ib_post_send(info->id->qp, wr, NULL);
+ rc = ib_post_send(sc->ib.qp, wr, NULL);
if (rc) {
log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc);
smbd_disconnect_rdma_connection(info);
@@ -2389,7 +2358,7 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
if (smbdirect_mr->state == MR_INVALIDATED) {
ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgt.sgl,
+ sc->ib.dev, smbdirect_mr->sgt.sgl,
smbdirect_mr->sgt.nents,
smbdirect_mr->dir);
smbdirect_mr->state = MR_READY;
@@ -2552,13 +2521,14 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
size_t fsize = folioq_folio_size(folioq, slot);
if (offset < fsize) {
- size_t part = umin(maxsize - ret, fsize - offset);
+ size_t part = umin(maxsize, fsize - offset);
if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part))
return -EIO;
offset += part;
ret += part;
+ maxsize -= part;
}
if (offset >= fsize) {
@@ -2573,7 +2543,7 @@ static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter,
slot = 0;
}
}
- } while (rdma->nr_sge < rdma->max_sge || maxsize > 0);
+ } while (rdma->nr_sge < rdma->max_sge && maxsize > 0);
iter->folioq = folioq;
iter->folioq_slot = slot;
diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h
index c08e3665150d..e45aa9ddd71d 100644
--- a/fs/smb/client/smbdirect.h
+++ b/fs/smb/client/smbdirect.h
@@ -15,6 +15,9 @@
#include <rdma/rdma_cm.h>
#include <linux/mempool.h>
+#include "../common/smbdirect/smbdirect.h"
+#include "../common/smbdirect/smbdirect_socket.h"
+
extern int rdma_readwrite_threshold;
extern int smbd_max_frmr_depth;
extern int smbd_keep_alive_interval;
@@ -30,16 +33,6 @@ enum keep_alive_status {
KEEP_ALIVE_SENT,
};
-enum smbd_connection_status {
- SMBD_CREATED,
- SMBD_CONNECTING,
- SMBD_CONNECTED,
- SMBD_NEGOTIATE_FAILED,
- SMBD_DISCONNECTING,
- SMBD_DISCONNECTED,
- SMBD_DESTROYED
-};
-
/*
* The context for the SMBDirect transport
* Everything related to the transport is here. It has several logical parts
@@ -50,18 +43,11 @@ enum smbd_connection_status {
* 5. mempools for allocating packets
*/
struct smbd_connection {
- enum smbd_connection_status transport_status;
+ struct smbdirect_socket socket;
- /* RDMA related */
- struct rdma_cm_id *id;
- struct ib_qp_init_attr qp_attr;
- struct ib_pd *pd;
- struct ib_cq *send_cq, *recv_cq;
- struct ib_device_attr dev_attr;
int ri_rc;
struct completion ri_done;
- wait_queue_head_t conn_wait;
- wait_queue_head_t disconn_wait;
+ wait_queue_head_t status_wait;
struct completion negotiate_completion;
bool negotiate_done;
@@ -72,21 +58,12 @@ struct smbd_connection {
spinlock_t lock_new_credits_offered;
int new_credits_offered;
- /* Connection parameters defined in [MS-SMBD] 3.1.1.1 */
- int receive_credit_max;
- int send_credit_target;
- int max_send_size;
- int max_fragmented_recv_size;
- int max_fragmented_send_size;
- int max_receive_size;
- int keep_alive_interval;
- int max_readwrite_size;
+ /* dynamic connection parameters defined in [MS-SMBD] 3.1.1.1 */
enum keep_alive_status keep_alive_requested;
int protocol;
atomic_t send_credits;
atomic_t receive_credits;
int receive_credit_target;
- int fragment_reassembly_remaining;
/* Memory registrations */
/* Maximum number of RDMA read/write outstanding on this connection */
@@ -117,52 +94,16 @@ struct smbd_connection {
wait_queue_head_t wait_post_send;
/* Receive queue */
- struct list_head receive_queue;
int count_receive_queue;
- spinlock_t receive_queue_lock;
-
- struct list_head empty_packet_queue;
- int count_empty_packet_queue;
- spinlock_t empty_packet_queue_lock;
-
wait_queue_head_t wait_receive_queues;
- /* Reassembly queue */
- struct list_head reassembly_queue;
- spinlock_t reassembly_queue_lock;
- wait_queue_head_t wait_reassembly_queue;
-
- /* total data length of reassembly queue */
- int reassembly_data_length;
- int reassembly_queue_length;
- /* the offset to first buffer in reassembly queue */
- int first_entry_offset;
-
bool send_immediate;
wait_queue_head_t wait_send_queue;
- /*
- * Indicate if we have received a full packet on the connection
- * This is used to identify the first SMBD packet of a assembled
- * payload (SMB packet) in reassembly queue so we can return a
- * RFC1002 length to upper layer to indicate the length of the SMB
- * packet received
- */
- bool full_packet_received;
-
struct workqueue_struct *workqueue;
struct delayed_work idle_timer_work;
- /* Memory pool for preallocating buffers */
- /* request pool for RDMA send */
- struct kmem_cache *request_cache;
- mempool_t *request_mempool;
-
- /* response pool for RDMA receive */
- struct kmem_cache *response_cache;
- mempool_t *response_mempool;
-
/* for debug purposes */
unsigned int count_get_receive_buffer;
unsigned int count_put_receive_buffer;
@@ -172,96 +113,6 @@ struct smbd_connection {
unsigned int count_send_empty;
};
-enum smbd_message_type {
- SMBD_NEGOTIATE_RESP,
- SMBD_TRANSFER_DATA,
-};
-
-#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001
-
-/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
-struct smbd_negotiate_req {
- __le16 min_version;
- __le16 max_version;
- __le16 reserved;
- __le16 credits_requested;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */
-struct smbd_negotiate_resp {
- __le16 min_version;
- __le16 max_version;
- __le16 negotiated_version;
- __le16 reserved;
- __le16 credits_requested;
- __le16 credits_granted;
- __le32 status;
- __le32 max_readwrite_size;
- __le32 preferred_send_size;
- __le32 max_receive_size;
- __le32 max_fragmented_size;
-} __packed;
-
-/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */
-struct smbd_data_transfer {
- __le16 credits_requested;
- __le16 credits_granted;
- __le16 flags;
- __le16 reserved;
- __le32 remaining_data_length;
- __le32 data_offset;
- __le32 data_length;
- __le32 padding;
- __u8 buffer[];
-} __packed;
-
-/* The packet fields for a registered RDMA buffer */
-struct smbd_buffer_descriptor_v1 {
- __le64 offset;
- __le32 token;
- __le32 length;
-} __packed;
-
-/* Maximum number of SGEs used by smbdirect.c in any send work request */
-#define SMBDIRECT_MAX_SEND_SGE 6
-
-/* The context for a SMBD request */
-struct smbd_request {
- struct smbd_connection *info;
- struct ib_cqe cqe;
-
- /* the SGE entries for this work request */
- struct ib_sge sge[SMBDIRECT_MAX_SEND_SGE];
- int num_sge;
-
- /* SMBD packet header follows this structure */
- u8 packet[];
-};
-
-/* Maximum number of SGEs used by smbdirect.c in any receive work request */
-#define SMBDIRECT_MAX_RECV_SGE 1
-
-/* The context for a SMBD response */
-struct smbd_response {
- struct smbd_connection *info;
- struct ib_cqe cqe;
- struct ib_sge sge;
-
- enum smbd_message_type type;
-
- /* Link to receive queue or reassembly queue */
- struct list_head list;
-
- /* Indicate if this is the 1st packet of a payload */
- bool first_segment;
-
- /* SMBD packet header and payload follows this structure */
- u8 packet[];
-};
-
/* Create a SMBDirect session */
struct smbd_connection *smbd_get_connection(
struct TCP_Server_Info *server, struct sockaddr *dstaddr);
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 52bcb55d9952..93e5b2bb9f28 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -140,7 +140,7 @@ DECLARE_EVENT_CLASS(smb3_rw_err_class,
__entry->len = len;
__entry->rc = rc;
),
- TP_printk("\tR=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ TP_printk("R=%08x[%x] xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
__entry->rreq_debug_id, __entry->rreq_debug_index,
__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
__entry->offset, __entry->len, __entry->rc)
@@ -190,7 +190,7 @@ DECLARE_EVENT_CLASS(smb3_other_err_class,
__entry->len = len;
__entry->rc = rc;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx offset=0x%llx len=0x%x rc=%d",
__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
__entry->offset, __entry->len, __entry->rc)
)
@@ -247,7 +247,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_err_class,
__entry->len = len;
__entry->rc = rc;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x rc=%d",
__entry->xid, __entry->sesid, __entry->tid, __entry->target_fid,
__entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len, __entry->rc)
)
@@ -298,7 +298,7 @@ DECLARE_EVENT_CLASS(smb3_copy_range_done_class,
__entry->target_offset = target_offset;
__entry->len = len;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x source fid=0x%llx source offset=0x%llx target fid=0x%llx target offset=0x%llx len=0x%x",
__entry->xid, __entry->sesid, __entry->tid, __entry->target_fid,
__entry->src_offset, __entry->target_fid, __entry->target_offset, __entry->len)
)
@@ -482,7 +482,7 @@ DECLARE_EVENT_CLASS(smb3_fd_class,
__entry->tid = tid;
__entry->sesid = sesid;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx",
__entry->xid, __entry->sesid, __entry->tid, __entry->fid)
)
@@ -521,7 +521,7 @@ DECLARE_EVENT_CLASS(smb3_fd_err_class,
__entry->sesid = sesid;
__entry->rc = rc;
),
- TP_printk("\txid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
+ TP_printk("xid=%u sid=0x%llx tid=0x%x fid=0x%llx rc=%d",
__entry->xid, __entry->sesid, __entry->tid, __entry->fid,
__entry->rc)
)
@@ -794,7 +794,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_err_class,
__entry->status = status;
__entry->rc = rc;
),
- TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
+ TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu status=0x%x rc=%d",
__entry->sesid, __entry->tid, __entry->cmd, __entry->mid,
__entry->status, __entry->rc)
)
@@ -829,7 +829,7 @@ DECLARE_EVENT_CLASS(smb3_cmd_done_class,
__entry->cmd = cmd;
__entry->mid = mid;
),
- TP_printk("\tsid=0x%llx tid=0x%x cmd=%u mid=%llu",
+ TP_printk("sid=0x%llx tid=0x%x cmd=%u mid=%llu",
__entry->sesid, __entry->tid,
__entry->cmd, __entry->mid)
)
@@ -867,7 +867,7 @@ DECLARE_EVENT_CLASS(smb3_mid_class,
__entry->when_sent = when_sent;
__entry->when_received = when_received;
),
- TP_printk("\tcmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu",
+ TP_printk("cmd=%u mid=%llu pid=%u, when_sent=%lu when_rcv=%lu",
__entry->cmd, __entry->mid, __entry->pid, __entry->when_sent,
__entry->when_received)
)
@@ -898,7 +898,7 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class,
__assign_str(func_name);
__entry->rc = rc;
),
- TP_printk("\t%s: xid=%u rc=%d",
+ TP_printk("%s: xid=%u rc=%d",
__get_str(func_name), __entry->xid, __entry->rc)
)
@@ -924,7 +924,7 @@ DECLARE_EVENT_CLASS(smb3_sync_err_class,
__entry->ino = ino;
__entry->rc = rc;
),
- TP_printk("\tino=%lu rc=%d",
+ TP_printk("ino=%lu rc=%d",
__entry->ino, __entry->rc)
)
@@ -950,7 +950,7 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class,
__entry->xid = xid;
__assign_str(func_name);
),
- TP_printk("\t%s: xid=%u",
+ TP_printk("%s: xid=%u",
__get_str(func_name), __entry->xid)
)
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 0dc80959ce48..a61ba7f3fb86 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -30,9 +30,6 @@
#include "smbdirect.h"
#include "compress.h"
-/* Max number of iovectors we can use off the stack when sending requests. */
-#define CIFS_MAX_IOV_SIZE 8
-
void
cifs_wake_up_task(struct mid_q_entry *mid)
{
@@ -41,42 +38,6 @@ cifs_wake_up_task(struct mid_q_entry *mid)
wake_up_process(mid->callback_data);
}
-static struct mid_q_entry *
-alloc_mid(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
-{
- struct mid_q_entry *temp;
-
- if (server == NULL) {
- cifs_dbg(VFS, "%s: null TCP session\n", __func__);
- return NULL;
- }
-
- temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
- memset(temp, 0, sizeof(struct mid_q_entry));
- kref_init(&temp->refcount);
- temp->mid = get_mid(smb_buffer);
- temp->pid = current->pid;
- temp->command = cpu_to_le16(smb_buffer->Command);
- cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
- /* easier to use jiffies */
- /* when mid allocated can be before when sent */
- temp->when_alloc = jiffies;
- temp->server = server;
-
- /*
- * The default is for the mid to be synchronous, so the
- * default callback just wakes up the current task.
- */
- get_task_struct(current);
- temp->creator = current;
- temp->callback = cifs_wake_up_task;
- temp->callback_data = current;
-
- atomic_inc(&mid_count);
- temp->mid_state = MID_REQUEST_ALLOCATED;
- return temp;
-}
-
void __release_mid(struct kref *refcount)
{
struct mid_q_entry *midEntry =
@@ -89,7 +50,7 @@ void __release_mid(struct kref *refcount)
#endif
struct TCP_Server_Info *server = midEntry->server;
- if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
+ if (midEntry->resp_buf && (midEntry->wait_cancelled) &&
(midEntry->mid_state == MID_RESPONSE_RECEIVED ||
midEntry->mid_state == MID_RESPONSE_READY) &&
server->ops->handle_cancelled_mid)
@@ -160,12 +121,12 @@ void __release_mid(struct kref *refcount)
void
delete_mid(struct mid_q_entry *mid)
{
- spin_lock(&mid->server->mid_lock);
- if (!(mid->mid_flags & MID_DELETED)) {
+ spin_lock(&mid->server->mid_queue_lock);
+ if (mid->deleted_from_q == false) {
list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
+ mid->deleted_from_q = true;
}
- spin_unlock(&mid->server->mid_lock);
+ spin_unlock(&mid->server->mid_queue_lock);
release_mid(mid);
}
@@ -179,7 +140,7 @@ delete_mid(struct mid_q_entry *mid)
* Our basic "send data to server" function. Should be called with srv_mutex
* held. The caller is responsible for handling the results.
*/
-static int
+int
smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
size_t *sent)
{
@@ -269,9 +230,8 @@ smb_rqst_len(struct TCP_Server_Info *server, struct smb_rqst *rqst)
return buflen;
}
-static int
-__smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
- struct smb_rqst *rqst)
+int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
+ struct smb_rqst *rqst)
{
int rc;
struct kvec *iov;
@@ -397,7 +357,7 @@ unmask:
* socket so the server throws away the partial SMB
*/
cifs_signal_cifsd_for_reconnect(server, false);
- trace_smb3_partial_send_reconnect(server->CurrentMid,
+ trace_smb3_partial_send_reconnect(server->current_mid,
server->conn_id, server->hostname);
}
smbd_done:
@@ -456,22 +416,6 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
-int
-smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
- unsigned int smb_buf_length)
-{
- struct kvec iov[2];
- struct smb_rqst rqst = { .rq_iov = iov,
- .rq_nvec = 2 };
-
- iov[0].iov_base = smb_buffer;
- iov[0].iov_len = 4;
- iov[1].iov_base = (char *)smb_buffer + 4;
- iov[1].iov_len = smb_buf_length;
-
- return __smb_send_rqst(server, 1, &rqst);
-}
-
static int
wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
const int timeout, const int flags,
@@ -509,7 +453,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_nblk_credits(server->CurrentMid,
+ trace_smb3_nblk_credits(server->current_mid,
server->conn_id, server->hostname, scredits, -1, in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
__func__, 1, scredits);
@@ -542,7 +486,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_credit_timeout(server->CurrentMid,
+ trace_smb3_credit_timeout(server->current_mid,
server->conn_id, server->hostname, scredits,
num_credits, in_flight);
cifs_server_dbg(VFS, "wait timed out after %d ms\n",
@@ -585,7 +529,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
spin_unlock(&server->req_lock);
trace_smb3_credit_timeout(
- server->CurrentMid,
+ server->current_mid,
server->conn_id, server->hostname,
scredits, num_credits, in_flight);
cifs_server_dbg(VFS, "wait timed out after %d ms\n",
@@ -615,7 +559,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
in_flight = server->in_flight;
spin_unlock(&server->req_lock);
- trace_smb3_waitff_credits(server->CurrentMid,
+ trace_smb3_waitff_credits(server->current_mid,
server->conn_id, server->hostname, scredits,
-(num_credits), in_flight);
cifs_dbg(FYI, "%s: remove %u credits total=%d\n",
@@ -626,9 +570,8 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
return 0;
}
-static int
-wait_for_free_request(struct TCP_Server_Info *server, const int flags,
- unsigned int *instance)
+int wait_for_free_request(struct TCP_Server_Info *server, const int flags,
+ unsigned int *instance)
{
return wait_for_free_credits(server, 1, -1, flags,
instance);
@@ -666,7 +609,7 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num,
*/
if (server->in_flight == 0) {
spin_unlock(&server->req_lock);
- trace_smb3_insufficient_credits(server->CurrentMid,
+ trace_smb3_insufficient_credits(server->current_mid,
server->conn_id, server->hostname, scredits,
num, in_flight);
cifs_dbg(FYI, "%s: %d requests in flight, needed %d total=%d\n",
@@ -690,40 +633,7 @@ cifs_wait_mtu_credits(struct TCP_Server_Info *server, size_t size,
return 0;
}
-static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
- struct mid_q_entry **ppmidQ)
-{
- spin_lock(&ses->ses_lock);
- if (ses->ses_status == SES_NEW) {
- if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
- (in_buf->Command != SMB_COM_NEGOTIATE)) {
- spin_unlock(&ses->ses_lock);
- return -EAGAIN;
- }
- /* else ok - we are setting up session */
- }
-
- if (ses->ses_status == SES_EXITING) {
- /* check if SMB session is bad because we are setting it up */
- if (in_buf->Command != SMB_COM_LOGOFF_ANDX) {
- spin_unlock(&ses->ses_lock);
- return -EAGAIN;
- }
- /* else ok - we are shutting down session */
- }
- spin_unlock(&ses->ses_lock);
-
- *ppmidQ = alloc_mid(in_buf, ses->server);
- if (*ppmidQ == NULL)
- return -ENOMEM;
- spin_lock(&ses->server->mid_lock);
- list_add_tail(&(*ppmidQ)->qhead, &ses->server->pending_mid_q);
- spin_unlock(&ses->server->mid_lock);
- return 0;
-}
-
-static int
-wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
+int wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
{
int error;
@@ -737,34 +647,6 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
return 0;
}
-struct mid_q_entry *
-cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
-{
- int rc;
- struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
- struct mid_q_entry *mid;
-
- if (rqst->rq_iov[0].iov_len != 4 ||
- rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
- return ERR_PTR(-EIO);
-
- /* enable signing if server requires it */
- if (server->sign)
- hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
-
- mid = alloc_mid(hdr, server);
- if (mid == NULL)
- return ERR_PTR(-ENOMEM);
-
- rc = cifs_sign_rqst(rqst, server, &mid->sequence_number);
- if (rc) {
- release_mid(mid);
- return ERR_PTR(rc);
- }
-
- return mid;
-}
-
/*
* Send a SMB request and set the callback function in the mid to handle
* the result. Caller is responsible for dealing with timeouts.
@@ -819,9 +701,9 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
mid->mid_state = MID_REQUEST_SUBMITTED;
/* put it on the pending_mid_q */
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
list_add_tail(&mid->qhead, &server->pending_mid_q);
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
/*
* Need to store the time in mid before calling I/O. For call_async,
@@ -845,45 +727,17 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
return rc;
}
-/*
- *
- * Send an SMB Request. No response info (other than return code)
- * needs to be parsed.
- *
- * flags indicate the type of request buffer and how long to wait
- * and whether to log NT STATUS code (error) before mapping it to POSIX error
- *
- */
-int
-SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
- char *in_buf, int flags)
-{
- int rc;
- struct kvec iov[1];
- struct kvec rsp_iov;
- int resp_buf_type;
-
- iov[0].iov_base = in_buf;
- iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
- flags |= CIFS_NO_RSP_BUF;
- rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
- cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
-
- return rc;
-}
-
-static int
-cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
+int cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
{
int rc = 0;
cifs_dbg(FYI, "%s: cmd=%d mid=%llu state=%d\n",
__func__, le16_to_cpu(mid->command), mid->mid, mid->mid_state);
- spin_lock(&server->mid_lock);
+ spin_lock(&server->mid_queue_lock);
switch (mid->mid_state) {
case MID_RESPONSE_READY:
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
return rc;
case MID_RETRY_NEEDED:
rc = -EAGAIN;
@@ -894,86 +748,27 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
case MID_SHUTDOWN:
rc = -EHOSTDOWN;
break;
+ case MID_RC:
+ rc = mid->mid_rc;
+ break;
default:
- if (!(mid->mid_flags & MID_DELETED)) {
+ if (mid->deleted_from_q == false) {
list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
+ mid->deleted_from_q = true;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
__func__, mid->mid, mid->mid_state);
rc = -EIO;
goto sync_mid_done;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&server->mid_queue_lock);
sync_mid_done:
release_mid(mid);
return rc;
}
-static inline int
-send_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
- struct mid_q_entry *mid)
-{
- return server->ops->send_cancel ?
- server->ops->send_cancel(server, rqst, mid) : 0;
-}
-
-int
-cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
- bool log_error)
-{
- unsigned int len = get_rfc1002_length(mid->resp_buf) + 4;
-
- dump_smb(mid->resp_buf, min_t(u32, 92, len));
-
- /* convert the length into a more usable form */
- if (server->sign) {
- struct kvec iov[2];
- int rc = 0;
- struct smb_rqst rqst = { .rq_iov = iov,
- .rq_nvec = 2 };
-
- iov[0].iov_base = mid->resp_buf;
- iov[0].iov_len = 4;
- iov[1].iov_base = (char *)mid->resp_buf + 4;
- iov[1].iov_len = len - 4;
- /* FIXME: add code to kill session */
- rc = cifs_verify_signature(&rqst, server,
- mid->sequence_number);
- if (rc)
- cifs_server_dbg(VFS, "SMB signature verification returned error = %d\n",
- rc);
- }
-
- /* BB special case reconnect tid and uid here? */
- return map_and_check_smb_error(mid, log_error);
-}
-
-struct mid_q_entry *
-cifs_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *ignored,
- struct smb_rqst *rqst)
-{
- int rc;
- struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
- struct mid_q_entry *mid;
-
- if (rqst->rq_iov[0].iov_len != 4 ||
- rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
- return ERR_PTR(-EIO);
-
- rc = allocate_mid(ses, hdr, &mid);
- if (rc)
- return ERR_PTR(rc);
- rc = cifs_sign_rqst(rqst, ses->server, &mid->sequence_number);
- if (rc) {
- delete_mid(mid);
- return ERR_PTR(rc);
- }
- return mid;
-}
-
static void
cifs_compound_callback(struct mid_q_entry *mid)
{
@@ -1015,14 +810,16 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
uint index = 0;
unsigned int min_in_flight = UINT_MAX, max_in_flight = 0;
struct TCP_Server_Info *server = NULL;
- int i;
+ int i, start, cur;
if (!ses)
return NULL;
spin_lock(&ses->chan_lock);
+ start = atomic_inc_return(&ses->chan_seq);
for (i = 0; i < ses->chan_count; i++) {
- server = ses->chans[i].server;
+ cur = (start + i) % ses->chan_count;
+ server = ses->chans[cur].server;
if (!server || server->terminate)
continue;
@@ -1039,17 +836,15 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
*/
if (server->in_flight < min_in_flight) {
min_in_flight = server->in_flight;
- index = i;
+ index = cur;
}
if (server->in_flight > max_in_flight)
max_in_flight = server->in_flight;
}
/* if all channels are equally loaded, fall back to round-robin */
- if (min_in_flight == max_in_flight) {
- index = (uint)atomic_inc_return(&ses->chan_seq);
- index %= ses->chan_count;
- }
+ if (min_in_flight == max_in_flight)
+ index = (uint)start % ses->chan_count;
server = ses->chans[index].server;
spin_unlock(&ses->chan_lock);
@@ -1210,15 +1005,14 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
cifs_server_dbg(FYI, "Cancelling wait for mid %llu cmd: %d\n",
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(server, &rqst[i], midQ[i]);
- spin_lock(&server->mid_lock);
- midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
- if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED ||
- midQ[i]->mid_state == MID_RESPONSE_RECEIVED) {
+ spin_lock(&midQ[i]->mid_lock);
+ midQ[i]->wait_cancelled = true;
+ if (midQ[i]->callback) {
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
}
- spin_unlock(&server->mid_lock);
+ spin_unlock(&midQ[i]->mid_lock);
}
}
@@ -1301,344 +1095,6 @@ cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
rqst, resp_buf_type, resp_iov);
}
-int
-SendReceive2(const unsigned int xid, struct cifs_ses *ses,
- struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
- const int flags, struct kvec *resp_iov)
-{
- struct smb_rqst rqst;
- struct kvec s_iov[CIFS_MAX_IOV_SIZE], *new_iov;
- int rc;
-
- if (n_vec + 1 > CIFS_MAX_IOV_SIZE) {
- new_iov = kmalloc_array(n_vec + 1, sizeof(struct kvec),
- GFP_KERNEL);
- if (!new_iov) {
- /* otherwise cifs_send_recv below sets resp_buf_type */
- *resp_buf_type = CIFS_NO_BUFFER;
- return -ENOMEM;
- }
- } else
- new_iov = s_iov;
-
- /* 1st iov is a RFC1001 length followed by the rest of the packet */
- memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec));
-
- new_iov[0].iov_base = new_iov[1].iov_base;
- new_iov[0].iov_len = 4;
- new_iov[1].iov_base += 4;
- new_iov[1].iov_len -= 4;
-
- memset(&rqst, 0, sizeof(struct smb_rqst));
- rqst.rq_iov = new_iov;
- rqst.rq_nvec = n_vec + 1;
-
- rc = cifs_send_recv(xid, ses, ses->server,
- &rqst, resp_buf_type, flags, resp_iov);
- if (n_vec + 1 > CIFS_MAX_IOV_SIZE)
- kfree(new_iov);
- return rc;
-}
-
-int
-SendReceive(const unsigned int xid, struct cifs_ses *ses,
- struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned, const int flags)
-{
- int rc = 0;
- struct mid_q_entry *midQ;
- unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
- struct kvec iov = { .iov_base = in_buf, .iov_len = len };
- struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
- struct cifs_credits credits = { .value = 1, .instance = 0 };
- struct TCP_Server_Info *server;
-
- if (ses == NULL) {
- cifs_dbg(VFS, "Null smb session\n");
- return -EIO;
- }
- server = ses->server;
- if (server == NULL) {
- cifs_dbg(VFS, "Null tcp session\n");
- return -EIO;
- }
-
- spin_lock(&server->srv_lock);
- if (server->tcpStatus == CifsExiting) {
- spin_unlock(&server->srv_lock);
- return -ENOENT;
- }
- spin_unlock(&server->srv_lock);
-
- /* Ensure that we do not send more than 50 overlapping requests
- to the same server. We may make this configurable later or
- use ses->maxReq */
-
- if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cifs_server_dbg(VFS, "Invalid length, greater than maximum frame, %d\n",
- len);
- return -EIO;
- }
-
- rc = wait_for_free_request(server, flags, &credits.instance);
- if (rc)
- return rc;
-
- /* make sure that we sign in the same order that we send on this socket
- and avoid races inside tcp sendmsg code that could cause corruption
- of smb data */
-
- cifs_server_lock(server);
-
- rc = allocate_mid(ses, in_buf, &midQ);
- if (rc) {
- cifs_server_unlock(server);
- /* Update # of requests on wire to server */
- add_credits(server, &credits, 0);
- return rc;
- }
-
- rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
- if (rc) {
- cifs_server_unlock(server);
- goto out;
- }
-
- midQ->mid_state = MID_REQUEST_SUBMITTED;
-
- rc = smb_send(server, in_buf, len);
- cifs_save_when_sent(midQ);
-
- if (rc < 0)
- server->sequence_number -= 2;
-
- cifs_server_unlock(server);
-
- if (rc < 0)
- goto out;
-
- rc = wait_for_response(server, midQ);
- if (rc != 0) {
- send_cancel(server, &rqst, midQ);
- spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) {
- /* no longer considered to be "in-flight" */
- midQ->callback = release_mid;
- spin_unlock(&server->mid_lock);
- add_credits(server, &credits, 0);
- return rc;
- }
- spin_unlock(&server->mid_lock);
- }
-
- rc = cifs_sync_mid_result(midQ, server);
- if (rc != 0) {
- add_credits(server, &credits, 0);
- return rc;
- }
-
- if (!midQ->resp_buf || !out_buf ||
- midQ->mid_state != MID_RESPONSE_READY) {
- rc = -EIO;
- cifs_server_dbg(VFS, "Bad MID state?\n");
- goto out;
- }
-
- *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
- memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
- rc = cifs_check_receive(midQ, server, 0);
-out:
- delete_mid(midQ);
- add_credits(server, &credits, 0);
-
- return rc;
-}
-
-/* We send a LOCKINGX_CANCEL_LOCK to cause the Windows
- blocking lock to return. */
-
-static int
-send_lock_cancel(const unsigned int xid, struct cifs_tcon *tcon,
- struct smb_hdr *in_buf,
- struct smb_hdr *out_buf)
-{
- int bytes_returned;
- struct cifs_ses *ses = tcon->ses;
- LOCK_REQ *pSMB = (LOCK_REQ *)in_buf;
-
- /* We just modify the current in_buf to change
- the type of lock from LOCKING_ANDX_SHARED_LOCK
- or LOCKING_ANDX_EXCLUSIVE_LOCK to
- LOCKING_ANDX_CANCEL_LOCK. */
-
- pSMB->LockType = LOCKING_ANDX_CANCEL_LOCK|LOCKING_ANDX_LARGE_FILES;
- pSMB->Timeout = 0;
- pSMB->hdr.Mid = get_next_mid(ses->server);
-
- return SendReceive(xid, ses, in_buf, out_buf,
- &bytes_returned, 0);
-}
-
-int
-SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
- struct smb_hdr *in_buf, struct smb_hdr *out_buf,
- int *pbytes_returned)
-{
- int rc = 0;
- int rstart = 0;
- struct mid_q_entry *midQ;
- struct cifs_ses *ses;
- unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
- struct kvec iov = { .iov_base = in_buf, .iov_len = len };
- struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
- unsigned int instance;
- struct TCP_Server_Info *server;
-
- if (tcon == NULL || tcon->ses == NULL) {
- cifs_dbg(VFS, "Null smb session\n");
- return -EIO;
- }
- ses = tcon->ses;
- server = ses->server;
-
- if (server == NULL) {
- cifs_dbg(VFS, "Null tcp session\n");
- return -EIO;
- }
-
- spin_lock(&server->srv_lock);
- if (server->tcpStatus == CifsExiting) {
- spin_unlock(&server->srv_lock);
- return -ENOENT;
- }
- spin_unlock(&server->srv_lock);
-
- /* Ensure that we do not send more than 50 overlapping requests
- to the same server. We may make this configurable later or
- use ses->maxReq */
-
- if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
- cifs_tcon_dbg(VFS, "Invalid length, greater than maximum frame, %d\n",
- len);
- return -EIO;
- }
-
- rc = wait_for_free_request(server, CIFS_BLOCKING_OP, &instance);
- if (rc)
- return rc;
-
- /* make sure that we sign in the same order that we send on this socket
- and avoid races inside tcp sendmsg code that could cause corruption
- of smb data */
-
- cifs_server_lock(server);
-
- rc = allocate_mid(ses, in_buf, &midQ);
- if (rc) {
- cifs_server_unlock(server);
- return rc;
- }
-
- rc = cifs_sign_smb(in_buf, server, &midQ->sequence_number);
- if (rc) {
- delete_mid(midQ);
- cifs_server_unlock(server);
- return rc;
- }
-
- midQ->mid_state = MID_REQUEST_SUBMITTED;
- rc = smb_send(server, in_buf, len);
- cifs_save_when_sent(midQ);
-
- if (rc < 0)
- server->sequence_number -= 2;
-
- cifs_server_unlock(server);
-
- if (rc < 0) {
- delete_mid(midQ);
- return rc;
- }
-
- /* Wait for a reply - allow signals to interrupt. */
- rc = wait_event_interruptible(server->response_q,
- (!(midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED)) ||
- ((server->tcpStatus != CifsGood) &&
- (server->tcpStatus != CifsNew)));
-
- /* Were we interrupted by a signal ? */
- spin_lock(&server->srv_lock);
- if ((rc == -ERESTARTSYS) &&
- (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) &&
- ((server->tcpStatus == CifsGood) ||
- (server->tcpStatus == CifsNew))) {
- spin_unlock(&server->srv_lock);
-
- if (in_buf->Command == SMB_COM_TRANSACTION2) {
- /* POSIX lock. We send a NT_CANCEL SMB to cause the
- blocking lock to return. */
- rc = send_cancel(server, &rqst, midQ);
- if (rc) {
- delete_mid(midQ);
- return rc;
- }
- } else {
- /* Windows lock. We send a LOCKINGX_CANCEL_LOCK
- to cause the blocking lock to return. */
-
- rc = send_lock_cancel(xid, tcon, in_buf, out_buf);
-
- /* If we get -ENOLCK back the lock may have
- already been removed. Don't exit in this case. */
- if (rc && rc != -ENOLCK) {
- delete_mid(midQ);
- return rc;
- }
- }
-
- rc = wait_for_response(server, midQ);
- if (rc) {
- send_cancel(server, &rqst, midQ);
- spin_lock(&server->mid_lock);
- if (midQ->mid_state == MID_REQUEST_SUBMITTED ||
- midQ->mid_state == MID_RESPONSE_RECEIVED) {
- /* no longer considered to be "in-flight" */
- midQ->callback = release_mid;
- spin_unlock(&server->mid_lock);
- return rc;
- }
- spin_unlock(&server->mid_lock);
- }
-
- /* We got the response - restart system call. */
- rstart = 1;
- spin_lock(&server->srv_lock);
- }
- spin_unlock(&server->srv_lock);
-
- rc = cifs_sync_mid_result(midQ, server);
- if (rc != 0)
- return rc;
-
- /* rcvd frame is ok */
- if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) {
- rc = -EIO;
- cifs_tcon_dbg(VFS, "Bad MID state?\n");
- goto out;
- }
-
- *pbytes_returned = get_rfc1002_length(midQ->resp_buf);
- memcpy(out_buf, midQ->resp_buf, *pbytes_returned + 4);
- rc = cifs_check_receive(midQ, server, 0);
-out:
- delete_mid(midQ);
- if (rstart && rc == -EACCES)
- return -ERESTARTSYS;
- return rc;
-}
/*
* Discard any remaining data in the current SMB. To do this, we borrow the
diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c
index 58a584f0b27e..b88fa04f5792 100644
--- a/fs/smb/client/xattr.c
+++ b/fs/smb/client/xattr.c
@@ -31,6 +31,8 @@
* secure, replaced by SMB2 (then even more highly secure SMB3) many years ago
*/
#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */
+#define SMB3_XATTR_CIFS_NTSD_SACL "system.smb3_ntsd_sacl" /* SACL only */
+#define SMB3_XATTR_CIFS_NTSD_OWNER "system.smb3_ntsd_owner" /* owner only */
#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */
#define SMB3_XATTR_CIFS_NTSD_FULL "system.smb3_ntsd_full" /* owner/DACL/SACL */
#define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */
@@ -38,6 +40,7 @@
/* BB need to add server (Samba e.g) support for security and trusted prefix */
enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT,
+ XATTR_CIFS_NTSD_SACL, XATTR_CIFS_NTSD_OWNER,
XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL };
static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon,
@@ -160,6 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
struct smb_ntsd *pacl;
@@ -187,6 +192,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
CIFS_ACL_GROUP |
CIFS_ACL_DACL);
break;
+ case XATTR_CIFS_NTSD_OWNER:
+ aclflags = (CIFS_ACL_OWNER |
+ CIFS_ACL_GROUP);
+ break;
+ case XATTR_CIFS_NTSD_SACL:
+ aclflags = CIFS_ACL_SACL;
+ break;
case XATTR_CIFS_ACL:
default:
aclflags = CIFS_ACL_DACL;
@@ -308,6 +320,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
break;
case XATTR_CIFS_ACL:
+ case XATTR_CIFS_NTSD_SACL:
+ case XATTR_CIFS_NTSD_OWNER:
case XATTR_CIFS_NTSD:
case XATTR_CIFS_NTSD_FULL: {
/*
@@ -320,10 +334,23 @@ static int cifs_xattr_get(const struct xattr_handler *handler,
if (pTcon->ses->server->ops->get_acl == NULL)
goto out; /* rc already EOPNOTSUPP */
- if (handler->flags == XATTR_CIFS_NTSD_FULL) {
+ switch (handler->flags) {
+ case XATTR_CIFS_NTSD_FULL:
+ extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | SACL_SECINFO;
+ break;
+ case XATTR_CIFS_NTSD:
+ extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO;
+ break;
+ case XATTR_CIFS_NTSD_OWNER:
+ extra_info = OWNER_SECINFO | GROUP_SECINFO;
+ break;
+ case XATTR_CIFS_NTSD_SACL:
extra_info = SACL_SECINFO;
- } else {
- extra_info = 0;
+ break;
+ case XATTR_CIFS_ACL:
+ default:
+ extra_info = DACL_SECINFO;
+ break;
}
pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
inode, full_path, &acllen, extra_info);
@@ -441,6 +468,20 @@ static const struct xattr_handler smb3_acl_xattr_handler = {
.set = cifs_xattr_set,
};
+static const struct xattr_handler smb3_ntsd_sacl_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_SACL,
+ .flags = XATTR_CIFS_NTSD_SACL,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
+static const struct xattr_handler smb3_ntsd_owner_xattr_handler = {
+ .name = SMB3_XATTR_CIFS_NTSD_OWNER,
+ .flags = XATTR_CIFS_NTSD_OWNER,
+ .get = cifs_xattr_get,
+ .set = cifs_xattr_set,
+};
+
static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = {
.name = CIFS_XATTR_CIFS_NTSD,
.flags = XATTR_CIFS_NTSD,
@@ -486,6 +527,8 @@ const struct xattr_handler * const cifs_xattr_handlers[] = {
&cifs_os2_xattr_handler,
&cifs_cifs_acl_xattr_handler,
&smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */
+ &smb3_ntsd_sacl_xattr_handler,
+ &smb3_ntsd_owner_xattr_handler,
&cifs_cifs_ntsd_xattr_handler,
&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
&cifs_cifs_ntsd_full_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 3c7c706c797d..f79a5165a7cc 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -95,6 +95,9 @@
*/
#define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
+/* According to MS-SMB2 specification The minimum recommended value is 65536.*/
+#define CIFS_MIN_DEFAULT_IOSIZE (65536)
+
/*
* SMB2 Header Definition
*
@@ -1550,15 +1553,27 @@ struct reparse_symlink_data_buffer {
__u8 PathBuffer[]; /* Variable Length */
} __packed;
-/* See MS-FSCC 2.1.2.6 and cifspdu.h for struct reparse_posix_data */
+/* For IO_REPARSE_TAG_NFS - see MS-FSCC 2.1.2.6 */
+#define NFS_SPECFILE_LNK 0x00000000014B4E4C
+#define NFS_SPECFILE_CHR 0x0000000000524843
+#define NFS_SPECFILE_BLK 0x00000000004B4C42
+#define NFS_SPECFILE_FIFO 0x000000004F464946
+#define NFS_SPECFILE_SOCK 0x000000004B434F53
+struct reparse_nfs_data_buffer {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le64 InodeType; /* NFS_SPECFILE_* */
+ __u8 DataBuffer[];
+} __packed;
-/* For IO_REPARSE_TAG_LX_SYMLINK */
+/* For IO_REPARSE_TAG_LX_SYMLINK - see MS-FSCC 2.1.2.7 */
struct reparse_wsl_symlink_data_buffer {
__le32 ReparseTag;
__le16 ReparseDataLength;
__u16 Reserved;
- __le32 Flags;
- __u8 PathBuffer[]; /* Variable Length UTF-8 string without nul-term */
+ __le32 Version; /* Always 2 */
+ __u8 Target[]; /* Variable Length UTF-8 string without nul-term */
} __packed;
struct validate_negotiate_info_req {
@@ -1695,23 +1710,33 @@ struct smb2_file_internal_info {
} __packed; /* level 6 Query */
struct smb2_file_rename_info { /* encoding of request for level 10 */
- __u8 ReplaceIfExists; /* 1 = replace existing target with new */
- /* 0 = fail if target already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
+ /* New members MUST be added within the struct_group() macro below. */
+ __struct_group(smb2_file_rename_info_hdr, __hdr, __packed,
+ __u8 ReplaceIfExists; /* 1 = replace existing target with new */
+ /* 0 = fail if target already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ );
char FileName[]; /* New name to be assigned */
/* padding - overall struct size must be >= 24 so filename + pad >= 6 */
} __packed; /* level 10 Set */
+static_assert(offsetof(struct smb2_file_rename_info, FileName) == sizeof(struct smb2_file_rename_info_hdr),
+ "struct member likely outside of __struct_group()");
struct smb2_file_link_info { /* encoding of request for level 11 */
- __u8 ReplaceIfExists; /* 1 = replace existing link with new */
- /* 0 = fail if link already exists */
- __u8 Reserved[7];
- __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
- __le32 FileNameLength;
+ /* New members MUST be added within the struct_group() macro below. */
+ __struct_group(smb2_file_link_info_hdr, __hdr, __packed,
+ __u8 ReplaceIfExists; /* 1 = replace existing link with new */
+ /* 0 = fail if link already exists */
+ __u8 Reserved[7];
+ __u64 RootDirectory; /* MBZ for network operations (why says spec?) */
+ __le32 FileNameLength;
+ );
char FileName[]; /* Name to be assigned to new link */
} __packed; /* level 11 Set */
+static_assert(offsetof(struct smb2_file_link_info, FileName) == sizeof(struct smb2_file_link_info_hdr),
+ "struct member likely outside of __struct_group()");
/*
* This level 18, although with struct with same name is different from cifs
diff --git a/fs/smb/common/smbacl.h b/fs/smb/common/smbacl.h
index 6a60698fc6f0..a624ec9e4a14 100644
--- a/fs/smb/common/smbacl.h
+++ b/fs/smb/common/smbacl.h
@@ -107,7 +107,8 @@ struct smb_sid {
struct smb_acl {
__le16 revision; /* revision level */
__le16 size;
- __le32 num_aces;
+ __le16 num_aces;
+ __le16 reserved;
} __attribute__((packed));
struct smb_ace {
diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h
new file mode 100644
index 000000000000..b9a385344ff3
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
+
+/* SMB-DIRECT buffer descriptor V1 structure [MS-SMBD] 2.2.3.1 */
+struct smbdirect_buffer_descriptor_v1 {
+ __le64 offset;
+ __le32 token;
+ __le32 length;
+} __packed;
+
+/*
+ * Connection parameters mostly from [MS-SMBD] 3.1.1.1
+ *
+ * These are setup and negotiated at the beginning of a
+ * connection and remain constant unless explicitly changed.
+ *
+ * Some values are important for the upper layer.
+ */
+struct smbdirect_socket_parameters {
+ __u16 recv_credit_max;
+ __u16 send_credit_target;
+ __u32 max_send_size;
+ __u32 max_fragmented_send_size;
+ __u32 max_recv_size;
+ __u32 max_fragmented_recv_size;
+ __u32 max_read_write_size;
+ __u32 keepalive_interval_msec;
+ __u32 keepalive_timeout_msec;
+} __packed;
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */
diff --git a/fs/smb/common/smbdirect/smbdirect_pdu.h b/fs/smb/common/smbdirect/smbdirect_pdu.h
new file mode 100644
index 000000000000..ae9fdb05ce23
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_pdu.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2017 Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__
+
+#define SMBDIRECT_V1 0x0100
+
+/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
+struct smbdirect_negotiate_req {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+/* SMBD negotiation response packet [MS-SMBD] 2.2.2 */
+struct smbdirect_negotiate_resp {
+ __le16 min_version;
+ __le16 max_version;
+ __le16 negotiated_version;
+ __le16 reserved;
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le32 status;
+ __le32 max_readwrite_size;
+ __le32 preferred_send_size;
+ __le32 max_receive_size;
+ __le32 max_fragmented_size;
+} __packed;
+
+#define SMBDIRECT_DATA_MIN_HDR_SIZE 0x14
+#define SMBDIRECT_DATA_OFFSET 0x18
+
+#define SMBDIRECT_FLAG_RESPONSE_REQUESTED 0x0001
+
+/* SMBD data transfer packet with payload [MS-SMBD] 2.2.3 */
+struct smbdirect_data_transfer {
+ __le16 credits_requested;
+ __le16 credits_granted;
+ __le16 flags;
+ __le16 reserved;
+ __le32 remaining_data_length;
+ __le32 data_offset;
+ __le32 data_length;
+ __le32 padding;
+ __u8 buffer[];
+} __packed;
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PDU_H__ */
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h
new file mode 100644
index 000000000000..3c4a8d627aa3
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_socket.h
@@ -0,0 +1,161 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025 Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
+
+enum smbdirect_socket_status {
+ SMBDIRECT_SOCKET_CREATED,
+ SMBDIRECT_SOCKET_CONNECTING,
+ SMBDIRECT_SOCKET_CONNECTED,
+ SMBDIRECT_SOCKET_NEGOTIATE_FAILED,
+ SMBDIRECT_SOCKET_DISCONNECTING,
+ SMBDIRECT_SOCKET_DISCONNECTED,
+ SMBDIRECT_SOCKET_DESTROYED
+};
+
+struct smbdirect_socket {
+ enum smbdirect_socket_status status;
+
+ /* RDMA related */
+ struct {
+ struct rdma_cm_id *cm_id;
+ } rdma;
+
+ /* IB verbs related */
+ struct {
+ struct ib_pd *pd;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+
+ /*
+ * shortcuts for rdma.cm_id->{qp,device};
+ */
+ struct ib_qp *qp;
+ struct ib_device *dev;
+ } ib;
+
+ struct smbdirect_socket_parameters parameters;
+
+ /*
+ * The state for posted send buffers
+ */
+ struct {
+ /*
+ * Memory pools for preallocating
+ * smbdirect_send_io buffers
+ */
+ struct {
+ struct kmem_cache *cache;
+ mempool_t *pool;
+ } mem;
+ } send_io;
+
+ /*
+ * The state for posted receive buffers
+ */
+ struct {
+ /*
+ * The type of PDU we are expecting
+ */
+ enum {
+ SMBDIRECT_EXPECT_NEGOTIATE_REQ = 1,
+ SMBDIRECT_EXPECT_NEGOTIATE_REP = 2,
+ SMBDIRECT_EXPECT_DATA_TRANSFER = 3,
+ } expected;
+
+ /*
+ * Memory pools for preallocating
+ * smbdirect_recv_io buffers
+ */
+ struct {
+ struct kmem_cache *cache;
+ mempool_t *pool;
+ } mem;
+
+ /*
+ * The list of free smbdirect_recv_io
+ * structures
+ */
+ struct {
+ struct list_head list;
+ spinlock_t lock;
+ } free;
+
+ /*
+ * The list of arrived non-empty smbdirect_recv_io
+ * structures
+ *
+ * This represents the reassembly queue.
+ */
+ struct {
+ struct list_head list;
+ spinlock_t lock;
+ wait_queue_head_t wait_queue;
+ /* total data length of reassembly queue */
+ int data_length;
+ int queue_length;
+ /* the offset to first buffer in reassembly queue */
+ int first_entry_offset;
+ /*
+ * Indicate if we have received a full packet on the
+ * connection This is used to identify the first SMBD
+ * packet of a assembled payload (SMB packet) in
+ * reassembly queue so we can return a RFC1002 length to
+ * upper layer to indicate the length of the SMB packet
+ * received
+ */
+ bool full_packet_received;
+ } reassembly;
+ } recv_io;
+};
+
+struct smbdirect_send_io {
+ struct smbdirect_socket *socket;
+ struct ib_cqe cqe;
+
+ /*
+ * The SGE entries for this work request
+ *
+ * The first points to the packet header
+ */
+#define SMBDIRECT_SEND_IO_MAX_SGE 6
+ size_t num_sge;
+ struct ib_sge sge[SMBDIRECT_SEND_IO_MAX_SGE];
+
+ /*
+ * Link to the list of sibling smbdirect_send_io
+ * messages.
+ */
+ struct list_head sibling_list;
+ struct ib_send_wr wr;
+
+ /* SMBD packet header follows this structure */
+ u8 packet[];
+};
+
+struct smbdirect_recv_io {
+ struct smbdirect_socket *socket;
+ struct ib_cqe cqe;
+
+ /*
+ * For now we only use a single SGE
+ * as we have just one large buffer
+ * per posted recv.
+ */
+#define SMBDIRECT_RECV_IO_MAX_SGE 1
+ struct ib_sge sge;
+
+ /* Link to free or reassembly list */
+ struct list_head list;
+
+ /* Indicate if this is the 1st packet of a payload */
+ bool first_segment;
+
+ /* SMBD packet header and payload follows this structure */
+ u8 packet[];
+};
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */
diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h
index 4b379e84c46b..3253a18ecb5c 100644
--- a/fs/smb/common/smbfsctl.h
+++ b/fs/smb/common/smbfsctl.h
@@ -159,6 +159,9 @@
#define IO_REPARSE_TAG_LX_CHR 0x80000025
#define IO_REPARSE_TAG_LX_BLK 0x80000026
+/* If Name Surrogate Bit is set, the file or directory represents another named entity in the system. */
+#define IS_REPARSE_TAG_NAME_SURROGATE(tag) (!!((tag) & 0x20000000))
+
/* fsctl flags */
/* If Flags is set to this value, the request is an FSCTL not ioctl request */
#define SMB2_0_IOCTL_IS_FSCTL 0x00000001
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index cabe6a843c6a..4a23a5e7e8fe 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -11,6 +11,7 @@ config SMB_SERVER
select CRYPTO_HMAC
select CRYPTO_ECB
select CRYPTO_LIB_DES
+ select CRYPTO_LIB_SHA256
select CRYPTO_SHA256
select CRYPTO_CMAC
select CRYPTO_SHA512
@@ -70,4 +71,4 @@ config SMB_SERVER_CHECK_CAP_NET_ADMIN
config SMB_SERVER_KERBEROS5
bool "Support for Kerberos 5"
depends on SMB_SERVER
- default n
+ default y
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 2a5b4a96bf99..d99871c21451 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -550,7 +550,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
retval = -ENOMEM;
goto out;
}
- sess->user = user;
+
+ if (!sess->user) {
+ /* First successful authentication */
+ sess->user = user;
+ } else {
+ if (!ksmbd_compare_user(sess->user, user)) {
+ ksmbd_debug(AUTH, "different user tried to reuse session\n");
+ retval = -EPERM;
+ ksmbd_free_user(user);
+ goto out;
+ }
+ ksmbd_free_user(user);
+ }
memcpy(sess->sess_key, resp->payload, resp->session_key_len);
memcpy(out_blob, resp->payload + resp->session_key_len,
@@ -967,40 +979,6 @@ out:
return rc;
}
-int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
- __u8 *pi_hash)
-{
- int rc;
- struct ksmbd_crypto_ctx *ctx = NULL;
-
- ctx = ksmbd_crypto_ctx_find_sha256();
- if (!ctx) {
- ksmbd_debug(AUTH, "could not alloc sha256\n");
- return -ENOMEM;
- }
-
- rc = crypto_shash_init(CRYPTO_SHA256(ctx));
- if (rc) {
- ksmbd_debug(AUTH, "could not init shashn");
- goto out;
- }
-
- rc = crypto_shash_update(CRYPTO_SHA256(ctx), sd_buf, len);
- if (rc) {
- ksmbd_debug(AUTH, "could not update with n\n");
- goto out;
- }
-
- rc = crypto_shash_final(CRYPTO_SHA256(ctx), pi_hash);
- if (rc) {
- ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc);
- goto out;
- }
-out:
- ksmbd_release_crypto_ctx(ctx);
- return rc;
-}
-
static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
int enc, u8 *key)
{
@@ -1016,9 +994,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
ses_enc_key = enc ? sess->smb3encryptionkey :
sess->smb3decryptionkey;
- if (enc)
- ksmbd_user_session_get(sess);
memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+ if (!enc)
+ ksmbd_user_session_put(sess);
return 0;
}
@@ -1218,7 +1196,7 @@ free_iv:
free_sg:
kfree(sg);
free_req:
- kfree(req);
+ aead_request_free(req);
free_ctx:
ksmbd_release_crypto_ctx(ctx);
return rc;
diff --git a/fs/smb/server/auth.h b/fs/smb/server/auth.h
index 362b6159a6cf..6879a1bd1b91 100644
--- a/fs/smb/server/auth.h
+++ b/fs/smb/server/auth.h
@@ -66,6 +66,4 @@ int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn,
struct ksmbd_session *sess);
int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf,
__u8 *pi_hash);
-int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len,
- __u8 *pi_hash);
#endif
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index f8a40f65db6a..67c4f73398df 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -39,8 +39,10 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
xa_destroy(&conn->sessions);
kvfree(conn->request_buf);
kfree(conn->preauth_info);
- if (atomic_dec_and_test(&conn->refcnt))
+ if (atomic_dec_and_test(&conn->refcnt)) {
+ conn->transport->ops->free_transport(conn->transport);
kfree(conn);
+ }
}
/**
@@ -433,6 +435,26 @@ void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops)
default_conn_ops.terminate_fn = ops->terminate_fn;
}
+void ksmbd_conn_r_count_inc(struct ksmbd_conn *conn)
+{
+ atomic_inc(&conn->r_count);
+}
+
+void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn)
+{
+ /*
+ * Checking waitqueue to dropping pending requests on
+ * disconnection. waitqueue_active is safe because it
+ * uses atomic operation for condition.
+ */
+ atomic_inc(&conn->refcnt);
+ if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+ wake_up(&conn->r_count_q);
+
+ if (atomic_dec_and_test(&conn->refcnt))
+ kfree(conn);
+}
+
int ksmbd_conn_transport_init(void)
{
int ret;
@@ -482,7 +504,8 @@ void ksmbd_conn_transport_destroy(void)
{
mutex_lock(&init_lock);
ksmbd_tcp_destroy();
- ksmbd_rdma_destroy();
+ ksmbd_rdma_stop_listening();
stop_sessions();
+ ksmbd_rdma_destroy();
mutex_unlock(&init_lock);
}
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index b379ae4fdcdf..2aa8084bb593 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -27,6 +27,7 @@ enum {
KSMBD_SESS_EXITING,
KSMBD_SESS_NEED_RECONNECT,
KSMBD_SESS_NEED_NEGOTIATE,
+ KSMBD_SESS_NEED_SETUP,
KSMBD_SESS_RELEASING
};
@@ -45,6 +46,12 @@ struct ksmbd_conn {
struct mutex srv_mutex;
int status;
unsigned int cli_cap;
+ union {
+ __be32 inet_addr;
+#if IS_ENABLED(CONFIG_IPV6)
+ u8 inet6_addr[16];
+#endif
+ };
char *request_buf;
struct ksmbd_transport *transport;
struct nls_table *local_nls;
@@ -107,6 +114,7 @@ struct ksmbd_conn {
__le16 signing_algorithm;
bool binding;
atomic_t refcnt;
+ bool is_aapl;
};
struct ksmbd_conn_ops {
@@ -131,6 +139,7 @@ struct ksmbd_transport_ops {
void *buf, unsigned int len,
struct smb2_buffer_desc_v1 *desc,
unsigned int desc_len);
+ void (*free_transport)(struct ksmbd_transport *kt);
};
struct ksmbd_transport {
@@ -168,6 +177,8 @@ int ksmbd_conn_transport_init(void);
void ksmbd_conn_transport_destroy(void);
void ksmbd_conn_lock(struct ksmbd_conn *conn);
void ksmbd_conn_unlock(struct ksmbd_conn *conn);
+void ksmbd_conn_r_count_inc(struct ksmbd_conn *conn);
+void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn);
/*
* WARNING
@@ -185,6 +196,11 @@ static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn)
return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE;
}
+static inline bool ksmbd_conn_need_setup(struct ksmbd_conn *conn)
+{
+ return READ_ONCE(conn->status) == KSMBD_SESS_NEED_SETUP;
+}
+
static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn)
{
return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT;
@@ -215,6 +231,11 @@ static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn)
WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE);
}
+static inline void ksmbd_conn_set_need_setup(struct ksmbd_conn *conn)
+{
+ WRITE_ONCE(conn->status, KSMBD_SESS_NEED_SETUP);
+}
+
static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn)
{
WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT);
diff --git a/fs/smb/server/crypto_ctx.c b/fs/smb/server/crypto_ctx.c
index ce733dc9a4a3..80bd68c8635e 100644
--- a/fs/smb/server/crypto_ctx.c
+++ b/fs/smb/server/crypto_ctx.c
@@ -75,9 +75,6 @@ static struct shash_desc *alloc_shash_desc(int id)
case CRYPTO_SHASH_CMACAES:
tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
break;
- case CRYPTO_SHASH_SHA256:
- tfm = crypto_alloc_shash("sha256", 0, 0);
- break;
case CRYPTO_SHASH_SHA512:
tfm = crypto_alloc_shash("sha512", 0, 0);
break;
@@ -198,11 +195,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void)
return ____crypto_shash_ctx_find(CRYPTO_SHASH_CMACAES);
}
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void)
-{
- return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA256);
-}
-
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void)
{
return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512);
diff --git a/fs/smb/server/crypto_ctx.h b/fs/smb/server/crypto_ctx.h
index 4a367c62f653..ac64801d52d3 100644
--- a/fs/smb/server/crypto_ctx.h
+++ b/fs/smb/server/crypto_ctx.h
@@ -13,7 +13,6 @@ enum {
CRYPTO_SHASH_HMACMD5 = 0,
CRYPTO_SHASH_HMACSHA256,
CRYPTO_SHASH_CMACAES,
- CRYPTO_SHASH_SHA256,
CRYPTO_SHASH_SHA512,
CRYPTO_SHASH_MAX,
};
@@ -39,14 +38,12 @@ struct ksmbd_crypto_ctx {
#define CRYPTO_HMACMD5(c) ((c)->desc[CRYPTO_SHASH_HMACMD5])
#define CRYPTO_HMACSHA256(c) ((c)->desc[CRYPTO_SHASH_HMACSHA256])
#define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES])
-#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256])
#define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512])
#define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm)
#define CRYPTO_HMACSHA256_TFM(c)\
((c)->desc[CRYPTO_SHASH_HMACSHA256]->tfm)
#define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm)
-#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm)
#define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm)
#define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM])
@@ -57,7 +54,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void);
-struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void);
struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void);
void ksmbd_crypto_destroy(void);
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 4af2e6007c29..72b00ca6e455 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -26,7 +26,6 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void)
INIT_LIST_HEAD(&work->request_entry);
INIT_LIST_HEAD(&work->async_request_entry);
INIT_LIST_HEAD(&work->fp_entry);
- INIT_LIST_HEAD(&work->interim_entry);
INIT_LIST_HEAD(&work->aux_read_list);
work->iov_alloc_cnt = 4;
work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec),
@@ -56,8 +55,6 @@ void ksmbd_free_work_struct(struct ksmbd_work *work)
kfree(work->tr_buf);
kvfree(work->request_buf);
kfree(work->iov);
- if (!list_empty(&work->interim_entry))
- list_del(&work->interim_entry);
if (work->async_id)
ksmbd_release_id(&work->conn->async_ida, work->async_id);
diff --git a/fs/smb/server/ksmbd_work.h b/fs/smb/server/ksmbd_work.h
index 8ca2c813246e..d36393ff8310 100644
--- a/fs/smb/server/ksmbd_work.h
+++ b/fs/smb/server/ksmbd_work.h
@@ -89,7 +89,6 @@ struct ksmbd_work {
/* List head at conn->async_requests */
struct list_head async_request_entry;
struct list_head fp_entry;
- struct list_head interim_entry;
};
/**
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 71c6939dfbf1..9dec4c2940bc 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -59,10 +59,12 @@ static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess)
struct ksmbd_session_rpc *entry;
long index;
+ down_write(&sess->rpc_lock);
xa_for_each(&sess->rpc_handle_list, index, entry) {
xa_erase(&sess->rpc_handle_list, index);
__session_rpc_close(sess, entry);
}
+ up_write(&sess->rpc_lock);
xa_destroy(&sess->rpc_handle_list);
}
@@ -92,7 +94,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
{
struct ksmbd_session_rpc *entry, *old;
struct ksmbd_rpc_command *resp;
- int method;
+ int method, id;
method = __rpc_method(rpc_name);
if (!method)
@@ -102,26 +104,29 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
if (!entry)
return -ENOMEM;
+ down_read(&sess->rpc_lock);
entry->method = method;
- entry->id = ksmbd_ipc_id_alloc();
- if (entry->id < 0)
+ entry->id = id = ksmbd_ipc_id_alloc();
+ if (id < 0)
goto free_entry;
- old = xa_store(&sess->rpc_handle_list, entry->id, entry, KSMBD_DEFAULT_GFP);
+ old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP);
if (xa_is_err(old))
goto free_id;
- resp = ksmbd_rpc_open(sess, entry->id);
+ resp = ksmbd_rpc_open(sess, id);
if (!resp)
goto erase_xa;
+ up_read(&sess->rpc_lock);
kvfree(resp);
- return entry->id;
+ return id;
erase_xa:
xa_erase(&sess->rpc_handle_list, entry->id);
free_id:
ksmbd_rpc_id_free(entry->id);
free_entry:
kfree(entry);
+ up_read(&sess->rpc_lock);
return -EINVAL;
}
@@ -129,9 +134,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id)
{
struct ksmbd_session_rpc *entry;
+ down_write(&sess->rpc_lock);
entry = xa_erase(&sess->rpc_handle_list, id);
if (entry)
__session_rpc_close(sess, entry);
+ up_write(&sess->rpc_lock);
}
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id)
@@ -181,7 +188,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
down_write(&sessions_table_lock);
down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
- if (atomic_read(&sess->refcnt) == 0 &&
+ if (atomic_read(&sess->refcnt) <= 1 &&
(sess->state != SMB2_SESSION_VALID ||
time_after(jiffies,
sess->last_active + SMB2_SESSION_TIMEOUT))) {
@@ -230,7 +237,11 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
if (!ksmbd_chann_del(conn, sess) &&
xa_empty(&sess->ksmbd_chann_list)) {
hash_del(&sess->hlist);
- ksmbd_session_destroy(sess);
+ down_write(&conn->session_lock);
+ xa_erase(&conn->sessions, sess->id);
+ up_write(&conn->session_lock);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
}
@@ -249,13 +260,30 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
if (xa_empty(&sess->ksmbd_chann_list)) {
xa_erase(&conn->sessions, sess->id);
hash_del(&sess->hlist);
- ksmbd_session_destroy(sess);
+ if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
}
up_write(&conn->session_lock);
up_write(&sessions_table_lock);
}
+bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn,
+ unsigned long long id)
+{
+ struct ksmbd_session *sess;
+
+ down_read(&conn->session_lock);
+ sess = xa_load(&conn->sessions, id);
+ if (sess) {
+ up_read(&conn->session_lock);
+ return true;
+ }
+ up_read(&conn->session_lock);
+
+ return false;
+}
+
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
unsigned long long id)
{
@@ -309,8 +337,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess)
if (atomic_read(&sess->refcnt) <= 0)
WARN_ON(1);
- else
- atomic_dec(&sess->refcnt);
+ else if (atomic_dec_and_test(&sess->refcnt))
+ ksmbd_session_destroy(sess);
}
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
@@ -353,13 +381,13 @@ void destroy_previous_session(struct ksmbd_conn *conn,
ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT);
err = ksmbd_conn_wait_idle_sess_id(conn, id);
if (err) {
- ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP);
goto out;
}
ksmbd_destroy_file_table(&prev_sess->file_table);
prev_sess->state = SMB2_SESSION_EXPIRED;
- ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP);
ksmbd_launch_ksmbd_durable_scavenger();
out:
up_write(&conn->session_lock);
@@ -417,7 +445,8 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
rwlock_init(&sess->tree_conns_lock);
- atomic_set(&sess->refcnt, 1);
+ atomic_set(&sess->refcnt, 2);
+ init_rwsem(&sess->rpc_lock);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index c1c4b20bd5c6..c5749d6ec715 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -63,6 +63,7 @@ struct ksmbd_session {
rwlock_t tree_conns_lock;
atomic_t refcnt;
+ struct rw_semaphore rpc_lock;
};
static inline int test_session_flag(struct ksmbd_session *sess, int bit)
@@ -87,6 +88,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess);
struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id);
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
unsigned long long id);
+bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn,
+ unsigned long long id);
int ksmbd_session_register(struct ksmbd_conn *conn,
struct ksmbd_session *sess);
void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 3a3fe4afbdf0..a04d5702820d 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -46,7 +46,6 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work,
opinfo->fid = id;
opinfo->Tid = Tid;
INIT_LIST_HEAD(&opinfo->op_entry);
- INIT_LIST_HEAD(&opinfo->interim_list);
init_waitqueue_head(&opinfo->oplock_q);
init_waitqueue_head(&opinfo->oplock_brk);
atomic_set(&opinfo->refcount, 1);
@@ -130,14 +129,6 @@ static void free_opinfo(struct oplock_info *opinfo)
kfree(opinfo);
}
-static inline void opinfo_free_rcu(struct rcu_head *rcu_head)
-{
- struct oplock_info *opinfo;
-
- opinfo = container_of(rcu_head, struct oplock_info, rcu_head);
- free_opinfo(opinfo);
-}
-
struct oplock_info *opinfo_get(struct ksmbd_file *fp)
{
struct oplock_info *opinfo;
@@ -155,12 +146,9 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
{
struct oplock_info *opinfo;
- if (list_empty(&ci->m_op_list))
- return NULL;
-
- rcu_read_lock();
- opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info,
- op_entry);
+ down_read(&ci->m_lock);
+ opinfo = list_first_entry_or_null(&ci->m_op_list, struct oplock_info,
+ op_entry);
if (opinfo) {
if (opinfo->conn == NULL ||
!atomic_inc_not_zero(&opinfo->refcount))
@@ -172,8 +160,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
}
}
}
-
- rcu_read_unlock();
+ up_read(&ci->m_lock);
return opinfo;
}
@@ -186,7 +173,7 @@ void opinfo_put(struct oplock_info *opinfo)
if (!atomic_dec_and_test(&opinfo->refcount))
return;
- call_rcu(&opinfo->rcu_head, opinfo_free_rcu);
+ free_opinfo(opinfo);
}
static void opinfo_add(struct oplock_info *opinfo)
@@ -194,7 +181,7 @@ static void opinfo_add(struct oplock_info *opinfo)
struct ksmbd_inode *ci = opinfo->o_fp->f_ci;
down_write(&ci->m_lock);
- list_add_rcu(&opinfo->op_entry, &ci->m_op_list);
+ list_add(&opinfo->op_entry, &ci->m_op_list);
up_write(&ci->m_lock);
}
@@ -208,7 +195,7 @@ static void opinfo_del(struct oplock_info *opinfo)
write_unlock(&lease_list_lock);
}
down_write(&ci->m_lock);
- list_del_rcu(&opinfo->op_entry);
+ list_del(&opinfo->op_entry);
up_write(&ci->m_lock);
}
@@ -635,6 +622,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
{
struct smb2_oplock_break *rsp = NULL;
struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
+ struct ksmbd_conn *conn = work->conn;
struct oplock_break_info *br_info = work->request_buf;
struct smb2_hdr *rsp_hdr;
struct ksmbd_file *fp;
@@ -690,6 +678,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
out:
ksmbd_free_work_struct(work);
+ ksmbd_conn_r_count_dec(conn);
}
/**
@@ -723,6 +712,7 @@ static int smb2_oplock_break_noti(struct oplock_info *opinfo)
work->conn = conn;
work->sess = opinfo->sess;
+ ksmbd_conn_r_count_inc(conn);
if (opinfo->op_state == OPLOCK_ACK_WAIT) {
INIT_WORK(&work->work, __smb2_oplock_break_noti);
ksmbd_queue_work(work);
@@ -745,6 +735,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
{
struct smb2_lease_break *rsp = NULL;
struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work);
+ struct ksmbd_conn *conn = work->conn;
struct lease_break_info *br_info = work->request_buf;
struct smb2_hdr *rsp_hdr;
@@ -791,6 +782,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk)
out:
ksmbd_free_work_struct(work);
+ ksmbd_conn_r_count_dec(conn);
}
/**
@@ -803,7 +795,6 @@ out:
static int smb2_lease_break_noti(struct oplock_info *opinfo)
{
struct ksmbd_conn *conn = opinfo->conn;
- struct list_head *tmp, *t;
struct ksmbd_work *work;
struct lease_break_info *br_info;
struct lease *lease = opinfo->o_lease;
@@ -830,17 +821,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo)
work->conn = conn;
work->sess = opinfo->sess;
+ ksmbd_conn_r_count_inc(conn);
if (opinfo->op_state == OPLOCK_ACK_WAIT) {
- list_for_each_safe(tmp, t, &opinfo->interim_list) {
- struct ksmbd_work *in_work;
-
- in_work = list_entry(tmp, struct ksmbd_work,
- interim_entry);
- setup_async_work(in_work, NULL, NULL);
- smb2_send_interim_resp(in_work, STATUS_PENDING);
- list_del_init(&in_work->interim_entry);
- release_async_work(in_work);
- }
INIT_WORK(&work->work, __smb2_lease_break_noti);
ksmbd_queue_work(work);
wait_for_break_ack(opinfo);
@@ -871,7 +853,8 @@ static void wait_lease_breaking(struct oplock_info *opinfo)
}
}
-static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level)
+static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level,
+ struct ksmbd_work *in_work)
{
int err = 0;
@@ -914,9 +897,15 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level)
}
if (lease->state & (SMB2_LEASE_WRITE_CACHING_LE |
- SMB2_LEASE_HANDLE_CACHING_LE))
+ SMB2_LEASE_HANDLE_CACHING_LE)) {
+ if (in_work) {
+ setup_async_work(in_work, NULL, NULL);
+ smb2_send_interim_resp(in_work, STATUS_PENDING);
+ release_async_work(in_work);
+ }
+
brk_opinfo->op_state = OPLOCK_ACK_WAIT;
- else
+ } else
atomic_dec(&brk_opinfo->breaking_cnt);
} else {
err = oplock_break_pending(brk_opinfo, req_op_level);
@@ -1113,10 +1102,12 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- if (ksmbd_conn_releasing(opinfo->conn))
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ opinfo_put(opinfo);
continue;
+ }
- oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
+ oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL);
opinfo_put(opinfo);
}
}
@@ -1150,9 +1141,12 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
if (!atomic_inc_not_zero(&opinfo->refcount))
continue;
- if (ksmbd_conn_releasing(opinfo->conn))
+ if (ksmbd_conn_releasing(opinfo->conn)) {
+ opinfo_put(opinfo);
continue;
- oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE);
+ }
+
+ oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL);
opinfo_put(opinfo);
}
}
@@ -1252,8 +1246,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
goto op_break_not_needed;
}
- list_add(&work->interim_entry, &prev_opinfo->interim_list);
- err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II);
+ err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II, work);
opinfo_put(prev_opinfo);
if (err == -ENOENT)
goto set_lev;
@@ -1322,8 +1315,7 @@ static void smb_break_all_write_oplock(struct ksmbd_work *work,
}
brk_opinfo->open_trunc = is_trunc;
- list_add(&work->interim_entry, &brk_opinfo->interim_list);
- oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II);
+ oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II, work);
opinfo_put(brk_opinfo);
}
@@ -1348,18 +1340,19 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
ci = fp->f_ci;
op = opinfo_get(fp);
- rcu_read_lock();
- list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) {
+ down_read(&ci->m_lock);
+ list_for_each_entry(brk_op, &ci->m_op_list, op_entry) {
if (brk_op->conn == NULL)
continue;
if (!atomic_inc_not_zero(&brk_op->refcount))
continue;
- if (ksmbd_conn_releasing(brk_op->conn))
+ if (ksmbd_conn_releasing(brk_op->conn)) {
+ opinfo_put(brk_op);
continue;
+ }
- rcu_read_unlock();
if (brk_op->is_lease && (brk_op->o_lease->state &
(~(SMB2_LEASE_READ_CACHING_LE |
SMB2_LEASE_HANDLE_CACHING_LE)))) {
@@ -1386,12 +1379,11 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
SMB2_LEASE_KEY_SIZE))
goto next;
brk_op->open_trunc = is_trunc;
- oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE);
+ oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE, NULL);
next:
opinfo_put(brk_op);
- rcu_read_lock();
}
- rcu_read_unlock();
+ up_read(&ci->m_lock);
if (op)
opinfo_put(op);
@@ -1506,6 +1498,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) {
struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
+ if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) <
+ sizeof(struct create_lease_v2) - 4)
+ goto err_out;
+
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
@@ -1518,6 +1514,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
} else {
struct create_lease *lc = (struct create_lease *)cc;
+ if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) <
+ sizeof(struct create_lease))
+ goto err_out;
+
memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
lreq->req_state = lc->lcontext.LeaseState;
lreq->flags = lc->lcontext.LeaseFlags;
@@ -1525,6 +1525,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req)
lreq->version = 1;
}
return lreq;
+err_out:
+ kfree(lreq);
+ return NULL;
}
/**
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 72bc88a63a40..9a56eaadd0dd 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -67,12 +67,10 @@ struct oplock_info {
bool is_lease;
bool open_trunc; /* truncate on open */
struct lease *o_lease;
- struct list_head interim_list;
struct list_head op_entry;
struct list_head lease_entry;
wait_queue_head_t oplock_q; /* Other server threads */
wait_queue_head_t oplock_brk; /* oplock breaking wait */
- struct rcu_head rcu_head;
};
struct lease_break_info {
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 601e7fcbcf1e..8c9c49c3a0a4 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -270,17 +270,7 @@ static void handle_ksmbd_work(struct work_struct *wk)
ksmbd_conn_try_dequeue_request(work);
ksmbd_free_work_struct(work);
- /*
- * Checking waitqueue to dropping pending requests on
- * disconnection. waitqueue_active is safe because it
- * uses atomic operation for condition.
- */
- atomic_inc(&conn->refcnt);
- if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
- wake_up(&conn->r_count_q);
-
- if (atomic_dec_and_test(&conn->refcnt))
- kfree(conn);
+ ksmbd_conn_r_count_dec(conn);
}
/**
@@ -310,7 +300,7 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn)
conn->request_buf = NULL;
ksmbd_conn_enqueue_request(work);
- atomic_inc(&conn->r_count);
+ ksmbd_conn_r_count_inc(conn);
/* update activity on connection */
conn->last_active = jiffies;
INIT_WORK(&work->work, handle_ksmbd_work);
@@ -641,6 +631,5 @@ MODULE_SOFTDEP("pre: sha512");
MODULE_SOFTDEP("pre: aead2");
MODULE_SOFTDEP("pre: ccm");
MODULE_SOFTDEP("pre: gcm");
-MODULE_SOFTDEP("pre: crc32");
module_init(ksmbd_server_init)
module_exit(ksmbd_server_exit)
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index f1efcd027475..0d92ce49aed7 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -633,6 +633,11 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls)
return name;
}
+ if (*name == '\0') {
+ kfree(name);
+ return ERR_PTR(-EINVAL);
+ }
+
if (*name == '\\') {
pr_err("not allow directory name included leading slash\n");
kfree(name);
@@ -1249,7 +1254,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work)
}
conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode);
- ksmbd_conn_set_need_negotiate(conn);
+ ksmbd_conn_set_need_setup(conn);
err_out:
ksmbd_conn_unlock(conn);
@@ -1271,6 +1276,9 @@ static int alloc_preauth_hash(struct ksmbd_session *sess,
if (sess->Preauth_HashValue)
return 0;
+ if (!conn->preauth_info)
+ return -ENOMEM;
+
sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue,
PREAUTH_HASHVALUE_SIZE, KSMBD_DEFAULT_GFP);
if (!sess->Preauth_HashValue)
@@ -1442,7 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work,
{
struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
- struct channel *chann = NULL;
+ struct channel *chann = NULL, *old;
struct ksmbd_user *user;
u64 prev_id;
int sz, rc;
@@ -1554,7 +1562,12 @@ binding_session:
return -ENOMEM;
chann->conn = conn;
- xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP);
+ old = xa_store(&sess->ksmbd_chann_list, (long)conn, chann,
+ KSMBD_DEFAULT_GFP);
+ if (xa_is_err(old)) {
+ kfree(chann);
+ return xa_err(old);
+ }
}
}
@@ -1581,7 +1594,7 @@ static int krb5_authenticate(struct ksmbd_work *work,
struct ksmbd_conn *conn = work->conn;
struct ksmbd_session *sess = work->sess;
char *in_blob, *out_blob;
- struct channel *chann = NULL;
+ struct channel *chann = NULL, *old;
u64 prev_sess_id;
int in_len, out_len;
int retval;
@@ -1594,27 +1607,38 @@ static int krb5_authenticate(struct ksmbd_work *work,
out_len = work->response_sz -
(le16_to_cpu(rsp->SecurityBufferOffset) + 4);
- /* Check previous session */
- prev_sess_id = le64_to_cpu(req->PreviousSessionId);
- if (prev_sess_id && prev_sess_id != sess->id)
- destroy_previous_session(conn, sess->user, prev_sess_id);
-
- if (sess->state == SMB2_SESSION_VALID)
- ksmbd_free_user(sess->user);
-
retval = ksmbd_krb5_authenticate(sess, in_blob, in_len,
out_blob, &out_len);
if (retval) {
ksmbd_debug(SMB, "krb5 authentication failed\n");
return -EINVAL;
}
+
+ /* Check previous session */
+ prev_sess_id = le64_to_cpu(req->PreviousSessionId);
+ if (prev_sess_id && prev_sess_id != sess->id)
+ destroy_previous_session(conn, sess->user, prev_sess_id);
+
rsp->SecurityBufferLength = cpu_to_le16(out_len);
- if ((conn->sign || server_conf.enforced_signing) ||
+ /*
+ * If session state is SMB2_SESSION_VALID, We can assume
+ * that it is reauthentication. And the user/password
+ * has been verified, so return it here.
+ */
+ if (sess->state == SMB2_SESSION_VALID) {
+ if (conn->binding)
+ goto binding_session;
+ return 0;
+ }
+
+ if ((rsp->SessionFlags != SMB2_SESSION_FLAG_IS_GUEST_LE &&
+ (conn->sign || server_conf.enforced_signing)) ||
(req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED))
sess->sign = true;
- if (smb3_encryption_negotiated(conn)) {
+ if (smb3_encryption_negotiated(conn) &&
+ !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
retval = conn->ops->generate_encryptionkey(conn, sess);
if (retval) {
ksmbd_debug(SMB,
@@ -1627,6 +1651,7 @@ static int krb5_authenticate(struct ksmbd_work *work,
sess->sign = false;
}
+binding_session:
if (conn->dialect >= SMB30_PROT_ID) {
chann = lookup_chann_list(sess, conn);
if (!chann) {
@@ -1635,7 +1660,12 @@ static int krb5_authenticate(struct ksmbd_work *work,
return -ENOMEM;
chann->conn = conn;
- xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP);
+ old = xa_store(&sess->ksmbd_chann_list, (long)conn,
+ chann, KSMBD_DEFAULT_GFP);
+ if (xa_is_err(old)) {
+ kfree(chann);
+ return xa_err(old);
+ }
}
}
@@ -1674,6 +1704,11 @@ int smb2_sess_setup(struct ksmbd_work *work)
ksmbd_debug(SMB, "Received smb2 session setup request\n");
+ if (!ksmbd_conn_need_setup(conn) && !ksmbd_conn_good(conn)) {
+ work->send_no_response = 1;
+ return rc;
+ }
+
WORK_BUFFERS(work, req, rsp);
rsp->StructureSize = cpu_to_le16(9);
@@ -1707,44 +1742,38 @@ int smb2_sess_setup(struct ksmbd_work *work)
if (conn->dialect != sess->dialect) {
rc = -EINVAL;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) {
rc = -EINVAL;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (strncmp(conn->ClientGUID, sess->ClientGUID,
SMB2_CLIENT_GUID_SIZE)) {
rc = -ENOENT;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (sess->state == SMB2_SESSION_IN_PROGRESS) {
rc = -EACCES;
- ksmbd_user_session_put(sess);
goto out_err;
}
if (sess->state == SMB2_SESSION_EXPIRED) {
rc = -EFAULT;
- ksmbd_user_session_put(sess);
goto out_err;
}
- ksmbd_user_session_put(sess);
if (ksmbd_conn_need_reconnect(conn)) {
rc = -EFAULT;
+ ksmbd_user_session_put(sess);
sess = NULL;
goto out_err;
}
- sess = ksmbd_session_lookup(conn, sess_id);
- if (!sess) {
+ if (is_ksmbd_session_in_connection(conn, sess_id)) {
rc = -EACCES;
goto out_err;
}
@@ -1823,8 +1852,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
ksmbd_conn_set_good(conn);
sess->state = SMB2_SESSION_VALID;
}
- kfree(sess->Preauth_HashValue);
- sess->Preauth_HashValue = NULL;
} else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) {
if (negblob->MessageType == NtLmNegotiate) {
rc = ntlm_negotiate(work, negblob, negblob_len, rsp);
@@ -1851,8 +1878,6 @@ int smb2_sess_setup(struct ksmbd_work *work)
kfree(preauth_sess);
}
}
- kfree(sess->Preauth_HashValue);
- sess->Preauth_HashValue = NULL;
} else {
pr_info_ratelimited("Unknown NTLMSSP message type : 0x%x\n",
le32_to_cpu(negblob->MessageType));
@@ -1910,10 +1935,12 @@ out_err:
sess->last_active = jiffies;
sess->state = SMB2_SESSION_EXPIRED;
+ ksmbd_user_session_put(sess);
+ work->sess = NULL;
if (try_delay) {
ksmbd_conn_set_need_reconnect(conn);
ssleep(5);
- ksmbd_conn_set_need_negotiate(conn);
+ ksmbd_conn_set_need_setup(conn);
}
}
smb2_set_err_rsp(work);
@@ -2239,14 +2266,11 @@ int smb2_session_logoff(struct ksmbd_work *work)
return -ENOENT;
}
- ksmbd_destroy_file_table(&sess->file_table);
down_write(&conn->session_lock);
sess->state = SMB2_SESSION_EXPIRED;
up_write(&conn->session_lock);
- ksmbd_free_user(sess->user);
- sess->user = NULL;
- ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE);
+ ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP);
rsp->StructureSize = cpu_to_le16(4);
err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp));
@@ -2572,7 +2596,7 @@ static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon,
}
}
-static int smb2_creat(struct ksmbd_work *work, struct path *parent_path,
+static int smb2_creat(struct ksmbd_work *work,
struct path *path, char *name, int open_flags,
umode_t posix_mode, bool is_dir)
{
@@ -2601,7 +2625,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *parent_path,
return rc;
}
- rc = ksmbd_vfs_kern_path_locked(work, name, 0, parent_path, path, 0);
+ rc = ksmbd_vfs_kern_path(work, name, 0, path, 0);
if (rc) {
pr_err("cannot get linux path (%s), err = %d\n",
name, rc);
@@ -2708,6 +2732,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work,
goto out;
}
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_durable_reconn_v2_req)) {
+ err = -EINVAL;
+ goto out;
+ }
+
recon_v2 = (struct create_durable_reconn_v2_req *)context;
persistent_id = recon_v2->Fid.PersistentFileId;
dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
@@ -2741,6 +2772,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work,
goto out;
}
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_durable_reconn_req)) {
+ err = -EINVAL;
+ goto out;
+ }
+
recon = (struct create_durable_reconn_req *)context;
persistent_id = recon->Data.Fid.PersistentFileId;
dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
@@ -2766,6 +2804,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work,
goto out;
}
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_durable_req_v2)) {
+ err = -EINVAL;
+ goto out;
+ }
+
durable_v2_blob =
(struct create_durable_req_v2 *)context;
ksmbd_debug(SMB, "Request for durable v2 open\n");
@@ -2830,7 +2875,7 @@ int smb2_open(struct ksmbd_work *work)
struct ksmbd_tree_connect *tcon = work->tcon;
struct smb2_create_req *req;
struct smb2_create_rsp *rsp;
- struct path path, parent_path;
+ struct path path;
struct ksmbd_share_config *share = tcon->share_conf;
struct ksmbd_file *fp = NULL;
struct file *filp = NULL;
@@ -2845,7 +2890,7 @@ int smb2_open(struct ksmbd_work *work)
int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
int rc = 0;
int contxt_cnt = 0, query_disk_id = 0;
- int maximal_access_ctxt = 0, posix_ctxt = 0;
+ bool maximal_access_ctxt = false, posix_ctxt = false;
int s_type = 0;
int next_off = 0;
char *name = NULL;
@@ -2874,6 +2919,27 @@ int smb2_open(struct ksmbd_work *work)
return create_smb2_pipe(work);
}
+ if (req->CreateContextsOffset && tcon->posix_extensions) {
+ context = smb2_find_context_vals(req, SMB2_CREATE_TAG_POSIX, 16);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out2;
+ } else if (context) {
+ struct create_posix *posix = (struct create_posix *)context;
+
+ if (le16_to_cpu(context->DataOffset) +
+ le32_to_cpu(context->DataLength) <
+ sizeof(struct create_posix) - 4) {
+ rc = -EINVAL;
+ goto err_out2;
+ }
+ ksmbd_debug(SMB, "get posix context\n");
+
+ posix_mode = le32_to_cpu(posix->Mode);
+ posix_ctxt = true;
+ }
+ }
+
if (req->NameLength) {
name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
le16_to_cpu(req->NameLength),
@@ -2896,9 +2962,11 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- rc = ksmbd_validate_filename(name);
- if (rc < 0)
- goto err_out2;
+ if (posix_ctxt == false) {
+ rc = ksmbd_validate_filename(name);
+ if (rc < 0)
+ goto err_out2;
+ }
if (ksmbd_share_veto_filename(share, name)) {
rc = -ENOENT;
@@ -3056,28 +3124,6 @@ int smb2_open(struct ksmbd_work *work)
rc = -EBADF;
goto err_out2;
}
-
- if (tcon->posix_extensions) {
- context = smb2_find_context_vals(req,
- SMB2_CREATE_TAG_POSIX, 16);
- if (IS_ERR(context)) {
- rc = PTR_ERR(context);
- goto err_out2;
- } else if (context) {
- struct create_posix *posix =
- (struct create_posix *)context;
- if (le16_to_cpu(context->DataOffset) +
- le32_to_cpu(context->DataLength) <
- sizeof(struct create_posix) - 4) {
- rc = -EINVAL;
- goto err_out2;
- }
- ksmbd_debug(SMB, "get posix context\n");
-
- posix_mode = le32_to_cpu(posix->Mode);
- posix_ctxt = 1;
- }
- }
}
if (ksmbd_override_fsids(work)) {
@@ -3085,8 +3131,8 @@ int smb2_open(struct ksmbd_work *work)
goto err_out2;
}
- rc = ksmbd_vfs_kern_path_locked(work, name, LOOKUP_NO_SYMLINKS,
- &parent_path, &path, 1);
+ rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS,
+ &path, 1);
if (!rc) {
file_present = true;
@@ -3207,7 +3253,7 @@ int smb2_open(struct ksmbd_work *work)
/*create file if not present */
if (!file_present) {
- rc = smb2_creat(work, &parent_path, &path, name, open_flags,
+ rc = smb2_creat(work, &path, name, open_flags,
posix_mode,
req->CreateOptions & FILE_DIRECTORY_FILE_LE);
if (rc) {
@@ -3412,7 +3458,7 @@ int smb2_open(struct ksmbd_work *work)
}
if (file_present || created)
- ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+ path_put(&path);
if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC &&
!fp->attrib_only && !stream_name) {
@@ -3510,6 +3556,15 @@ int smb2_open(struct ksmbd_work *work)
ksmbd_debug(SMB, "get query on disk id context\n");
query_disk_id = 1;
}
+
+ if (conn->is_aapl == false) {
+ context = smb2_find_context_vals(req, SMB2_CREATE_AAPL, 4);
+ if (IS_ERR(context)) {
+ rc = PTR_ERR(context);
+ goto err_out1;
+ } else if (context)
+ conn->is_aapl = true;
+ }
}
rc = ksmbd_vfs_getattr(&path, &stat);
@@ -3684,7 +3739,7 @@ reconnected_fp:
err_out:
if (rc && (file_present || created))
- ksmbd_vfs_kern_path_unlock(&parent_path, &path);
+ path_put(&path);
err_out1:
ksmbd_revert_fsids(work);
@@ -3949,7 +4004,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
if (dinfo->EaSize)
dinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
dinfo->Reserved = 0;
- dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ if (conn->is_aapl)
+ dinfo->UniqueId = 0;
+ else
+ dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
if (d_info->hide_dot_file && d_info->name[0] == '.')
dinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE;
memcpy(dinfo->FileName, conv_name, conv_len);
@@ -3966,7 +4024,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level,
smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode);
if (fibdinfo->EaSize)
fibdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE;
- fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
+ if (conn->is_aapl)
+ fibdinfo->UniqueId = 0;
+ else
+ fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino);
fibdinfo->ShortNameLength = 0;
fibdinfo->Reserved = 0;
fibdinfo->Reserved2 = cpu_to_le16(0);
@@ -4062,20 +4123,6 @@ struct smb2_query_dir_private {
int info_level;
};
-static void lock_dir(struct ksmbd_file *dir_fp)
-{
- struct dentry *dir = dir_fp->filp->f_path.dentry;
-
- inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
-}
-
-static void unlock_dir(struct ksmbd_file *dir_fp)
-{
- struct dentry *dir = dir_fp->filp->f_path.dentry;
-
- inode_unlock(d_inode(dir));
-}
-
static int process_query_dir_entries(struct smb2_query_dir_private *priv)
{
struct mnt_idmap *idmap = file_mnt_idmap(priv->dir_fp->filp);
@@ -4090,11 +4137,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
if (dentry_name(priv->d_info, priv->info_level))
return -EINVAL;
- lock_dir(priv->dir_fp);
- dent = lookup_one(idmap, priv->d_info->name,
- priv->dir_fp->filp->f_path.dentry,
- priv->d_info->name_len);
- unlock_dir(priv->dir_fp);
+ dent = lookup_one_unlocked(idmap,
+ &QSTR_LEN(priv->d_info->name,
+ priv->d_info->name_len),
+ priv->dir_fp->filp->f_path.dentry);
if (IS_ERR(dent)) {
ksmbd_debug(SMB, "Cannot lookup `%s' [%ld]\n",
@@ -4825,8 +4871,13 @@ static int get_file_standard_info(struct smb2_query_info_rsp *rsp,
sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
delete_pending = ksmbd_inode_pending_delete(fp);
- sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
- sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ if (ksmbd_stream_fd(fp) == false) {
+ sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ } else {
+ sinfo->AllocationSize = cpu_to_le64(fp->stream.size);
+ sinfo->EndOfFile = cpu_to_le64(fp->stream.size);
+ }
sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending);
sinfo->DeletePending = delete_pending;
sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
@@ -4889,9 +4940,14 @@ static int get_file_all_info(struct ksmbd_work *work,
file_info->ChangeTime = cpu_to_le64(time);
file_info->Attributes = fp->f_ci->m_fattr;
file_info->Pad1 = 0;
- file_info->AllocationSize =
- cpu_to_le64(stat.blocks << 9);
- file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ if (ksmbd_stream_fd(fp) == false) {
+ file_info->AllocationSize =
+ cpu_to_le64(stat.blocks << 9);
+ file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ } else {
+ file_info->AllocationSize = cpu_to_le64(fp->stream.size);
+ file_info->EndOfFile = cpu_to_le64(fp->stream.size);
+ }
file_info->NumberOfLinks =
cpu_to_le32(get_nlink(&stat) - delete_pending);
file_info->DeletePending = delete_pending;
@@ -4900,7 +4956,10 @@ static int get_file_all_info(struct ksmbd_work *work,
file_info->IndexNumber = cpu_to_le64(stat.ino);
file_info->EASize = 0;
file_info->AccessFlags = fp->daccess;
- file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ if (ksmbd_stream_fd(fp) == false)
+ file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ else
+ file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos);
file_info->Mode = fp->coption;
file_info->AlignmentRequirement = 0;
conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename,
@@ -5088,8 +5147,13 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
time = ksmbd_UnixTimeToNT(stat.ctime);
file_info->ChangeTime = cpu_to_le64(time);
file_info->Attributes = fp->f_ci->m_fattr;
- file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
- file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ if (ksmbd_stream_fd(fp) == false) {
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
+ } else {
+ file_info->AllocationSize = cpu_to_le64(fp->stream.size);
+ file_info->EndOfFile = cpu_to_le64(fp->stream.size);
+ }
file_info->Reserved = cpu_to_le32(0);
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_ntwrk_info));
@@ -5112,7 +5176,11 @@ static void get_file_position_info(struct smb2_query_info_rsp *rsp,
struct smb2_file_pos_info *file_info;
file_info = (struct smb2_file_pos_info *)rsp->Buffer;
- file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ if (ksmbd_stream_fd(fp) == false)
+ file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos);
+ else
+ file_info->CurrentByteOffset = cpu_to_le64(fp->stream.pos);
+
rsp->OutputBufferLength =
cpu_to_le32(sizeof(struct smb2_file_pos_info));
}
@@ -5201,8 +5269,13 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
file_info->ChangeTime = cpu_to_le64(time);
file_info->DosAttributes = fp->f_ci->m_fattr;
file_info->Inode = cpu_to_le64(stat.ino);
- file_info->EndOfFile = cpu_to_le64(stat.size);
- file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ if (ksmbd_stream_fd(fp) == false) {
+ file_info->EndOfFile = cpu_to_le64(stat.size);
+ file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+ } else {
+ file_info->EndOfFile = cpu_to_le64(fp->stream.size);
+ file_info->AllocationSize = cpu_to_le64(fp->stream.size);
+ }
file_info->HardLinks = cpu_to_le32(stat.nlink);
file_info->Mode = cpu_to_le32(stat.mode & 0777);
switch (stat.mode & S_IFMT) {
@@ -5978,8 +6051,7 @@ static int smb2_create_link(struct ksmbd_work *work,
struct nls_table *local_nls)
{
char *link_name = NULL, *target_name = NULL, *pathname = NULL;
- struct path path, parent_path;
- bool file_present = false;
+ struct path path;
int rc;
if (buf_len < (u64)sizeof(struct smb2_file_link_info) +
@@ -6008,15 +6080,12 @@ static int smb2_create_link(struct ksmbd_work *work,
ksmbd_debug(SMB, "target name is %s\n", target_name);
rc = ksmbd_vfs_kern_path_locked(work, link_name, LOOKUP_NO_SYMLINKS,
- &parent_path, &path, 0);
+ &path, 0);
if (rc) {
if (rc != -ENOENT)
goto out;
- } else
- file_present = true;
-
- if (file_info->ReplaceIfExists) {
- if (file_present) {
+ } else {
+ if (file_info->ReplaceIfExists) {
rc = ksmbd_vfs_remove_file(work, &path);
if (rc) {
rc = -EINVAL;
@@ -6024,21 +6093,17 @@ static int smb2_create_link(struct ksmbd_work *work,
link_name);
goto out;
}
- }
- } else {
- if (file_present) {
+ } else {
rc = -EEXIST;
ksmbd_debug(SMB, "link already exists\n");
goto out;
}
+ ksmbd_vfs_kern_path_unlock(&path);
}
-
rc = ksmbd_vfs_link(work, target_name, link_name);
if (rc)
rc = -EINVAL;
out:
- if (file_present)
- ksmbd_vfs_kern_path_unlock(&parent_path, &path);
if (!IS_ERR(link_name))
kfree(link_name);
@@ -6144,6 +6209,9 @@ static int set_file_allocation_info(struct ksmbd_work *work,
if (!(fp->daccess & FILE_WRITE_DATA_LE))
return -EACCES;
+ if (ksmbd_stream_fd(fp) == true)
+ return 0;
+
rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
AT_STATX_SYNC_AS_STAT);
if (rc)
@@ -6202,7 +6270,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp,
* truncate of some filesystem like FAT32 fill zero data in
* truncated range.
*/
- if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) {
+ if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC &&
+ ksmbd_stream_fd(fp) == false) {
ksmbd_debug(SMB, "truncated to newsize %lld\n", newsize);
rc = ksmbd_vfs_truncate(work, fp, newsize);
if (rc) {
@@ -6275,7 +6344,13 @@ static int set_file_position_info(struct ksmbd_file *fp,
return -EINVAL;
}
- fp->filp->f_pos = current_byte_offset;
+ if (ksmbd_stream_fd(fp) == false)
+ fp->filp->f_pos = current_byte_offset;
+ else {
+ if (current_byte_offset > XATTR_SIZE_MAX)
+ current_byte_offset = XATTR_SIZE_MAX;
+ fp->stream.pos = current_byte_offset;
+ }
return 0;
}
@@ -7458,17 +7533,17 @@ out_check_cl:
}
no_check_cl:
+ flock = smb_lock->fl;
+ list_del(&smb_lock->llist);
+
if (smb_lock->zero_len) {
err = 0;
goto skip;
}
-
- flock = smb_lock->fl;
- list_del(&smb_lock->llist);
retry:
rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL);
skip:
- if (flags & SMB2_LOCKFLAG_UNLOCK) {
+ if (smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) {
if (!rc) {
ksmbd_debug(SMB, "File unlocked\n");
} else if (rc == -ENOENT) {
@@ -7763,7 +7838,7 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn,
if (!ksmbd_find_netdev_name_iface_list(netdev->name))
continue;
- flags = dev_get_flags(netdev);
+ flags = netif_get_flags(netdev);
if (!(flags & IFF_RUNNING))
continue;
ipv6_retry:
@@ -8489,11 +8564,6 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
goto err_out;
}
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- opinfo_put(opinfo);
- ksmbd_fd_put(work, fp);
-
rsp->StructureSize = cpu_to_le16(24);
rsp->OplockLevel = rsp_oplevel;
rsp->Reserved = 0;
@@ -8501,16 +8571,15 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work)
rsp->VolatileFid = volatile_id;
rsp->PersistentFid = persistent_id;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_oplock_break));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
-
opinfo_put(opinfo);
ksmbd_fd_put(work, fp);
- smb2_set_err_rsp(work);
}
static int check_lease_state(struct lease *lease, __le32 req_state)
@@ -8640,11 +8709,6 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
}
lease_state = lease->state;
- opinfo->op_state = OPLOCK_STATE_NONE;
- wake_up_interruptible_all(&opinfo->oplock_q);
- atomic_dec(&opinfo->breaking_cnt);
- wake_up_interruptible_all(&opinfo->oplock_brk);
- opinfo_put(opinfo);
rsp->StructureSize = cpu_to_le16(36);
rsp->Reserved = 0;
@@ -8653,16 +8717,16 @@ static void smb21_lease_break_ack(struct ksmbd_work *work)
rsp->LeaseState = lease_state;
rsp->LeaseDuration = 0;
ret = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_lease_ack));
- if (!ret)
- return;
-
+ if (ret) {
err_out:
+ smb2_set_err_rsp(work);
+ }
+
+ opinfo->op_state = OPLOCK_STATE_NONE;
wake_up_interruptible_all(&opinfo->oplock_q);
atomic_dec(&opinfo->breaking_cnt);
wake_up_interruptible_all(&opinfo->oplock_brk);
-
opinfo_put(opinfo);
- smb2_set_err_rsp(work);
}
/**
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index 17a0b18a8406..16ae8a10490b 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -63,6 +63,9 @@ struct preauth_integrity_info {
#define SMB2_SESSION_TIMEOUT (10 * HZ)
+/* Apple Defined Contexts */
+#define SMB2_CREATE_AAPL "AAPL"
+
struct create_durable_req_v2 {
struct create_context_hdr ccontext;
__u8 Name[8];
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 425c756bcfb8..b23203a1c286 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -515,7 +515,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname,
p = strrchr(longname, '.');
if (p == longname) { /*name starts with a dot*/
- strscpy(extension, "___", strlen("___"));
+ strscpy(extension, "___", sizeof(extension));
} else {
if (p) {
p++;
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index a3d8a905b07e..d742ba754348 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -72,6 +72,8 @@
#define FILE_SUPPORTS_ENCRYPTION 0x00020000
#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
#define FILE_VOLUME_IS_COMPRESSED 0x00008000
+#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400
+#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200
#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index d39d3e553366..5aa7a66334d9 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -270,6 +270,11 @@ static int sid_to_id(struct mnt_idmap *idmap,
return -EIO;
}
+ if (psid->num_subauth == 0) {
+ pr_err("%s: zero subauthorities!\n", __func__);
+ return -EIO;
+ }
+
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
@@ -333,7 +338,7 @@ void posix_state_to_acl(struct posix_acl_state *state,
pace->e_perm = state->other.allow;
}
-int init_acl_state(struct posix_acl_state *state, int cnt)
+int init_acl_state(struct posix_acl_state *state, u16 cnt)
{
int alloc;
@@ -368,7 +373,7 @@ static void parse_dacl(struct mnt_idmap *idmap,
struct smb_fattr *fattr)
{
int i, ret;
- int num_aces = 0;
+ u16 num_aces = 0;
unsigned int acl_size;
char *acl_base;
struct smb_ace **ppace;
@@ -389,16 +394,18 @@ static void parse_dacl(struct mnt_idmap *idmap,
ksmbd_debug(SMB, "DACL revision %d size %d num aces %d\n",
le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size),
- le32_to_cpu(pdacl->num_aces));
+ le16_to_cpu(pdacl->num_aces));
acl_base = (char *)pdacl;
acl_size = sizeof(struct smb_acl);
- num_aces = le32_to_cpu(pdacl->num_aces);
+ num_aces = le16_to_cpu(pdacl->num_aces);
if (num_aces <= 0)
return;
- if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
+ if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) /
+ (offsetof(struct smb_ace, sid) +
+ offsetof(struct smb_sid, sub_auth) + sizeof(__le16)))
return;
ret = init_acl_state(&acl_state, num_aces);
@@ -432,6 +439,7 @@ static void parse_dacl(struct mnt_idmap *idmap,
offsetof(struct smb_sid, sub_auth);
if (end_of_acl - acl_base < acl_size ||
+ ppace[i]->sid.num_subauth == 0 ||
ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES ||
(end_of_acl - acl_base <
acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) ||
@@ -580,7 +588,7 @@ static void parse_dacl(struct mnt_idmap *idmap,
static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap,
struct smb_ace *pndace,
- struct smb_fattr *fattr, u32 *num_aces,
+ struct smb_fattr *fattr, u16 *num_aces,
u16 *size, u32 nt_aces_num)
{
struct posix_acl_entry *pace;
@@ -701,7 +709,7 @@ static void set_ntacl_dacl(struct mnt_idmap *idmap,
struct smb_fattr *fattr)
{
struct smb_ace *ntace, *pndace;
- int nt_num_aces = le32_to_cpu(nt_dacl->num_aces), num_aces = 0;
+ u16 nt_num_aces = le16_to_cpu(nt_dacl->num_aces), num_aces = 0;
unsigned short size = 0;
int i;
@@ -728,7 +736,7 @@ static void set_ntacl_dacl(struct mnt_idmap *idmap,
set_posix_acl_entries_dacl(idmap, pndace, fattr,
&num_aces, &size, nt_num_aces);
- pndacl->num_aces = cpu_to_le32(num_aces);
+ pndacl->num_aces = cpu_to_le16(num_aces);
pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size);
}
@@ -736,7 +744,7 @@ static void set_mode_dacl(struct mnt_idmap *idmap,
struct smb_acl *pndacl, struct smb_fattr *fattr)
{
struct smb_ace *pace, *pndace;
- u32 num_aces = 0;
+ u16 num_aces = 0;
u16 size = 0, ace_size = 0;
uid_t uid;
const struct smb_sid *sid;
@@ -792,7 +800,7 @@ static void set_mode_dacl(struct mnt_idmap *idmap,
fattr->cf_mode, 0007);
out:
- pndacl->num_aces = cpu_to_le32(num_aces);
+ pndacl->num_aces = cpu_to_le16(num_aces);
pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size);
}
@@ -807,6 +815,13 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl)
return -EINVAL;
}
+ if (!psid->num_subauth)
+ return 0;
+
+ if (psid->num_subauth > SID_MAX_SUB_AUTHORITIES ||
+ end_of_acl < (char *)psid + 8 + sizeof(__le32) * psid->num_subauth)
+ return -EINVAL;
+
return 0;
}
@@ -848,6 +863,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd,
pntsd->type = cpu_to_le16(DACL_PRESENT);
if (pntsd->osidoffset) {
+ if (le32_to_cpu(pntsd->osidoffset) < sizeof(struct smb_ntsd))
+ return -EINVAL;
+
rc = parse_sid(owner_sid_ptr, end_of_acl);
if (rc) {
pr_err("%s: Error %d parsing Owner SID\n", __func__, rc);
@@ -863,6 +881,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd,
}
if (pntsd->gsidoffset) {
+ if (le32_to_cpu(pntsd->gsidoffset) < sizeof(struct smb_ntsd))
+ return -EINVAL;
+
rc = parse_sid(group_sid_ptr, end_of_acl);
if (rc) {
pr_err("%s: Error %d mapping Owner SID to gid\n",
@@ -884,6 +905,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd,
pntsd->type |= cpu_to_le16(DACL_PROTECTED);
if (dacloffset) {
+ if (dacloffset < sizeof(struct smb_ntsd))
+ return -EINVAL;
+
parse_dacl(idmap, dacl_ptr, end_of_acl,
owner_sid_ptr, group_sid_ptr, fattr);
}
@@ -1006,8 +1030,11 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
struct smb_sid owner_sid, group_sid;
struct dentry *parent = path->dentry->d_parent;
struct mnt_idmap *idmap = mnt_idmap(path->mnt);
- int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size;
- int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size;
+ int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size;
+ int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size;
+ unsigned int dacloffset;
+ size_t dacl_struct_end;
+ u16 num_aces, ace_cnt = 0;
char *aces_base;
bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode);
@@ -1015,15 +1042,18 @@ int smb_inherit_dacl(struct ksmbd_conn *conn,
parent, &parent_pntsd);
if (pntsd_size <= 0)
return -ENOENT;
+
dacloffset = le32_to_cpu(parent_pntsd->dacloffset);
- if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) {
+ if (!dacloffset ||
+ check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) ||
+ dacl_struct_end > (size_t)pntsd_size) {
rc = -EINVAL;
goto free_parent_pntsd;
}
parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset);
acl_len = pntsd_size - dacloffset;
- num_aces = le32_to_cpu(parent_pdacl->num_aces);
+ num_aces = le16_to_cpu(parent_pdacl->num_aces);
pntsd_type = le16_to_cpu(parent_pntsd->type);
pdacl_size = le16_to_cpu(parent_pdacl->size);
@@ -1183,7 +1213,7 @@ pass:
pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
pdacl->revision = cpu_to_le16(2);
pdacl->size = cpu_to_le16(sizeof(struct smb_acl) + nt_size);
- pdacl->num_aces = cpu_to_le32(ace_cnt);
+ pdacl->num_aces = cpu_to_le16(ace_cnt);
pace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
memcpy(pace, aces_base, nt_size);
pntsd_size += sizeof(struct smb_acl) + nt_size;
@@ -1220,7 +1250,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
struct smb_ntsd *pntsd = NULL;
struct smb_acl *pdacl;
struct posix_acl *posix_acls;
- int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset;
+ int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size;
+ unsigned int dacl_offset;
+ size_t dacl_struct_end;
struct smb_sid sid;
int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE);
struct smb_ace *ace;
@@ -1239,7 +1271,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
dacl_offset = le32_to_cpu(pntsd->dacloffset);
if (!dacl_offset ||
- (dacl_offset + sizeof(struct smb_acl) > pntsd_size))
+ check_add_overflow(dacl_offset, sizeof(struct smb_acl), &dacl_struct_end) ||
+ dacl_struct_end > (size_t)pntsd_size)
goto err_out;
pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset));
@@ -1264,7 +1297,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
aces_size = acl_size - sizeof(struct smb_acl);
- for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) {
if (offsetof(struct smb_ace, access_req) > aces_size)
break;
ace_size = le16_to_cpu(ace->size);
@@ -1285,7 +1318,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path,
ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl));
aces_size = acl_size - sizeof(struct smb_acl);
- for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) {
+ for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) {
if (offsetof(struct smb_ace, access_req) > aces_size)
break;
ace_size = le16_to_cpu(ace->size);
diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h
index 24ce576fc292..355adaee39b8 100644
--- a/fs/smb/server/smbacl.h
+++ b/fs/smb/server/smbacl.h
@@ -86,7 +86,7 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd,
int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd,
struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info,
__u32 *secdesclen, struct smb_fattr *fattr);
-int init_acl_state(struct posix_acl_state *state, int cnt);
+int init_acl_state(struct posix_acl_state *state, u16 cnt);
void free_acl_state(struct posix_acl_state *state);
void posix_state_to_acl(struct posix_acl_state *state,
struct posix_acl_entry *pace);
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 0460ebea6ff0..2a3e2b0ce557 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -281,6 +281,7 @@ static int handle_response(int type, void *payload, size_t sz)
if (entry->type + 1 != type) {
pr_err("Waiting for IPC type %d, got %d. Ignore.\n",
entry->type + 1, type);
+ continue;
}
entry->response = kvzalloc(sz, KSMBD_DEFAULT_GFP);
@@ -309,7 +310,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
server_conf.signing = req->signing;
server_conf.tcp_port = req->tcp_port;
server_conf.ipc_timeout = req->ipc_timeout * HZ;
- server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL;
+ if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL,
+ &server_conf.deadtime)) {
+ ret = -EINVAL;
+ goto out;
+ }
server_conf.share_fake_fscaps = req->share_fake_fscaps;
ksmbd_init_domain(req->sub_auth);
@@ -336,6 +341,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req)
server_conf.bind_interfaces_only = req->bind_interfaces_only;
ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req),
req->ifc_list_sz);
+out:
if (ret) {
pr_err("Server configuration error: %s %s %s\n",
req->netbios_name, req->server_string,
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index c3785a5434f9..5466aa8c39b1 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -14,6 +14,7 @@
#include <linux/mempool.h>
#include <linux/highmem.h>
#include <linux/scatterlist.h>
+#include <linux/string_choices.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>
@@ -128,9 +129,6 @@ struct smb_direct_transport {
spinlock_t recvmsg_queue_lock;
struct list_head recvmsg_queue;
- spinlock_t empty_recvmsg_queue_lock;
- struct list_head empty_recvmsg_queue;
-
int send_credit_target;
atomic_t send_credits;
spinlock_t lock_new_recv_credits;
@@ -158,7 +156,8 @@ struct smb_direct_transport {
};
#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport))
-
+#define SMBD_TRANS(t) ((struct smb_direct_transport *)container_of(t, \
+ struct smb_direct_transport, transport))
enum {
SMB_DIRECT_MSG_NEGOTIATE_REQ = 0,
SMB_DIRECT_MSG_DATA_TRANSFER
@@ -266,40 +265,19 @@ smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t)
static void put_recvmsg(struct smb_direct_transport *t,
struct smb_direct_recvmsg *recvmsg)
{
- ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr,
- recvmsg->sge.length, DMA_FROM_DEVICE);
+ if (likely(recvmsg->sge.length != 0)) {
+ ib_dma_unmap_single(t->cm_id->device,
+ recvmsg->sge.addr,
+ recvmsg->sge.length,
+ DMA_FROM_DEVICE);
+ recvmsg->sge.length = 0;
+ }
spin_lock(&t->recvmsg_queue_lock);
list_add(&recvmsg->list, &t->recvmsg_queue);
spin_unlock(&t->recvmsg_queue_lock);
}
-static struct
-smb_direct_recvmsg *get_empty_recvmsg(struct smb_direct_transport *t)
-{
- struct smb_direct_recvmsg *recvmsg = NULL;
-
- spin_lock(&t->empty_recvmsg_queue_lock);
- if (!list_empty(&t->empty_recvmsg_queue)) {
- recvmsg = list_first_entry(&t->empty_recvmsg_queue,
- struct smb_direct_recvmsg, list);
- list_del(&recvmsg->list);
- }
- spin_unlock(&t->empty_recvmsg_queue_lock);
- return recvmsg;
-}
-
-static void put_empty_recvmsg(struct smb_direct_transport *t,
- struct smb_direct_recvmsg *recvmsg)
-{
- ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr,
- recvmsg->sge.length, DMA_FROM_DEVICE);
-
- spin_lock(&t->empty_recvmsg_queue_lock);
- list_add_tail(&recvmsg->list, &t->empty_recvmsg_queue);
- spin_unlock(&t->empty_recvmsg_queue_lock);
-}
-
static void enqueue_reassembly(struct smb_direct_transport *t,
struct smb_direct_recvmsg *recvmsg,
int data_length)
@@ -384,9 +362,6 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id)
spin_lock_init(&t->recvmsg_queue_lock);
INIT_LIST_HEAD(&t->recvmsg_queue);
- spin_lock_init(&t->empty_recvmsg_queue_lock);
- INIT_LIST_HEAD(&t->empty_recvmsg_queue);
-
init_waitqueue_head(&t->wait_send_pending);
atomic_set(&t->send_pending, 0);
@@ -409,6 +384,11 @@ err:
return NULL;
}
+static void smb_direct_free_transport(struct ksmbd_transport *kt)
+{
+ kfree(SMBD_TRANS(kt));
+}
+
static void free_transport(struct smb_direct_transport *t)
{
struct smb_direct_recvmsg *recvmsg;
@@ -426,7 +406,8 @@ static void free_transport(struct smb_direct_transport *t)
if (t->qp) {
ib_drain_qp(t->qp);
ib_mr_pool_destroy(t->qp, &t->qp->rdma_mrs);
- ib_destroy_qp(t->qp);
+ t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
ksmbd_debug(RDMA, "drain the reassembly queue\n");
@@ -454,7 +435,6 @@ static void free_transport(struct smb_direct_transport *t)
smb_direct_destroy_pools(t);
ksmbd_conn_free(KSMBD_TRANS(t)->conn);
- kfree(t);
}
static struct smb_direct_sendmsg
@@ -541,13 +521,13 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
t = recvmsg->transport;
if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) {
+ put_recvmsg(t, recvmsg);
if (wc->status != IB_WC_WR_FLUSH_ERR) {
pr_err("Recv error. status='%s (%d)' opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status,
wc->opcode);
smb_direct_disconnect_rdma_connection(t);
}
- put_empty_recvmsg(t, recvmsg);
return;
}
@@ -561,7 +541,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
switch (recvmsg->type) {
case SMB_DIRECT_MSG_NEGOTIATE_REQ:
if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) {
- put_empty_recvmsg(t, recvmsg);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
return;
}
t->negotiation_requested = true;
@@ -569,7 +550,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
t->status = SMB_DIRECT_CS_CONNECTED;
enqueue_reassembly(t, recvmsg, 0);
wake_up_interruptible(&t->wait_status);
- break;
+ return;
case SMB_DIRECT_MSG_DATA_TRANSFER: {
struct smb_direct_data_transfer *data_transfer =
(struct smb_direct_data_transfer *)recvmsg->packet;
@@ -578,7 +559,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (wc->byte_len <
offsetof(struct smb_direct_data_transfer, padding)) {
- put_empty_recvmsg(t, recvmsg);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
return;
}
@@ -586,7 +568,8 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (data_length) {
if (wc->byte_len < sizeof(struct smb_direct_data_transfer) +
(u64)data_length) {
- put_empty_recvmsg(t, recvmsg);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
return;
}
@@ -598,16 +581,11 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
else
t->full_packet_received = true;
- enqueue_reassembly(t, recvmsg, (int)data_length);
- wake_up_interruptible(&t->wait_reassembly_queue);
-
spin_lock(&t->receive_credit_lock);
receive_credits = --(t->recv_credits);
avail_recvmsg_count = t->count_avail_recvmsg;
spin_unlock(&t->receive_credit_lock);
} else {
- put_empty_recvmsg(t, recvmsg);
-
spin_lock(&t->receive_credit_lock);
receive_credits = --(t->recv_credits);
avail_recvmsg_count = ++(t->count_avail_recvmsg);
@@ -629,11 +607,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count))
mod_delayed_work(smb_direct_wq,
&t->post_recv_credits_work, 0);
- break;
+
+ if (data_length) {
+ enqueue_reassembly(t, recvmsg, (int)data_length);
+ wake_up_interruptible(&t->wait_reassembly_queue);
+ } else
+ put_recvmsg(t, recvmsg);
+
+ return;
}
- default:
- break;
}
+
+ /*
+ * This is an internal error!
+ */
+ WARN_ON_ONCE(recvmsg->type != SMB_DIRECT_MSG_DATA_TRANSFER);
+ put_recvmsg(t, recvmsg);
+ smb_direct_disconnect_rdma_connection(t);
}
static int smb_direct_post_recv(struct smb_direct_transport *t,
@@ -663,6 +653,7 @@ static int smb_direct_post_recv(struct smb_direct_transport *t,
ib_dma_unmap_single(t->cm_id->device,
recvmsg->sge.addr, recvmsg->sge.length,
DMA_FROM_DEVICE);
+ recvmsg->sge.length = 0;
smb_direct_disconnect_rdma_connection(t);
return ret;
}
@@ -804,7 +795,6 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
struct smb_direct_recvmsg *recvmsg;
int receive_credits, credits = 0;
int ret;
- int use_free = 1;
spin_lock(&t->receive_credit_lock);
receive_credits = t->recv_credits;
@@ -812,18 +802,9 @@ static void smb_direct_post_recv_credits(struct work_struct *work)
if (receive_credits < t->recv_credit_target) {
while (true) {
- if (use_free)
- recvmsg = get_free_recvmsg(t);
- else
- recvmsg = get_empty_recvmsg(t);
- if (!recvmsg) {
- if (use_free) {
- use_free = 0;
- continue;
- } else {
- break;
- }
- }
+ recvmsg = get_free_recvmsg(t);
+ if (!recvmsg)
+ break;
recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER;
recvmsg->first_segment = false;
@@ -1396,7 +1377,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
}
ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
- is_read ? "read" : "write", buf_len, credits_needed);
+ str_read_write(is_read), buf_len, credits_needed);
ret = wait_for_rw_credits(t, credits_needed);
if (ret < 0)
@@ -1799,8 +1780,6 @@ static void smb_direct_destroy_pools(struct smb_direct_transport *t)
while ((recvmsg = get_free_recvmsg(t)))
mempool_free(recvmsg, t->recvmsg_mempool);
- while ((recvmsg = get_empty_recvmsg(t)))
- mempool_free(recvmsg, t->recvmsg_mempool);
mempool_destroy(t->recvmsg_mempool);
t->recvmsg_mempool = NULL;
@@ -1856,6 +1835,7 @@ static int smb_direct_create_pools(struct smb_direct_transport *t)
if (!recvmsg)
goto err;
recvmsg->transport = t;
+ recvmsg->sge.length = 0;
list_add(&recvmsg->list, &t->recvmsg_queue);
}
t->count_avail_recvmsg = t->recv_credit_max;
@@ -1934,8 +1914,8 @@ static int smb_direct_create_qpair(struct smb_direct_transport *t,
return 0;
err:
if (t->qp) {
- ib_destroy_qp(t->qp);
t->qp = NULL;
+ rdma_destroy_qp(t->cm_id);
}
if (t->recv_cq) {
ib_destroy_cq(t->recv_cq);
@@ -2214,7 +2194,7 @@ int ksmbd_rdma_init(void)
return 0;
}
-void ksmbd_rdma_destroy(void)
+void ksmbd_rdma_stop_listening(void)
{
if (!smb_direct_listener.cm_id)
return;
@@ -2223,7 +2203,10 @@ void ksmbd_rdma_destroy(void)
rdma_destroy_id(smb_direct_listener.cm_id);
smb_direct_listener.cm_id = NULL;
+}
+void ksmbd_rdma_destroy(void)
+{
if (smb_direct_wq) {
destroy_workqueue(smb_direct_wq);
smb_direct_wq = NULL;
@@ -2241,38 +2224,16 @@ bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) {
struct net_device *ndev;
- if (smb_dev->ib_dev->ops.get_netdev) {
- ndev = smb_dev->ib_dev->ops.get_netdev(
- smb_dev->ib_dev, i + 1);
- if (!ndev)
- continue;
+ ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1);
+ if (!ndev)
+ continue;
- if (ndev == netdev) {
- dev_put(ndev);
- rdma_capable = true;
- goto out;
- }
+ if (ndev == netdev) {
dev_put(ndev);
- /* if ib_dev does not implement ops.get_netdev
- * check for matching infiniband GUID in hw_addr
- */
- } else if (netdev->type == ARPHRD_INFINIBAND) {
- struct netdev_hw_addr *ha;
- union ib_gid gid;
- u32 port_num;
- int ret;
-
- netdev_hw_addr_list_for_each(
- ha, &netdev->dev_addrs) {
- memcpy(&gid, ha->addr + 4, sizeof(gid));
- ret = ib_find_gid(smb_dev->ib_dev, &gid,
- &port_num, NULL);
- if (!ret) {
- rdma_capable = true;
- goto out;
- }
- }
+ rdma_capable = true;
+ goto out;
}
+ dev_put(ndev);
}
}
out:
@@ -2289,7 +2250,7 @@ out:
}
ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n",
- netdev->name, rdma_capable ? "true" : "false");
+ netdev->name, str_true_false(rdma_capable));
return rdma_capable;
}
@@ -2302,4 +2263,5 @@ static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
.read = smb_direct_read,
.rdma_read = smb_direct_rdma_read,
.rdma_write = smb_direct_rdma_write,
+ .free_transport = smb_direct_free_transport,
};
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index 77aee4e5c9dc..a2291b77488a 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -54,13 +54,15 @@ struct smb_direct_data_transfer {
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
+void ksmbd_rdma_stop_listening(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
void init_smbd_max_io_size(unsigned int sz);
unsigned int get_smbd_max_read_write_size(void);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
-static inline int ksmbd_rdma_destroy(void) { return 0; }
+static inline void ksmbd_rdma_stop_listening(void) { }
+static inline void ksmbd_rdma_destroy(void) { }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
static inline void init_smbd_max_io_size(unsigned int sz) { }
static inline unsigned int get_smbd_max_read_write_size(void) { return 0; }
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index 7f38a3c3f5bd..4337df97987d 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -58,12 +58,10 @@ static inline void ksmbd_tcp_reuseaddr(struct socket *sock)
static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs)
{
- lock_sock(sock->sk);
if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
- sock->sk->sk_rcvtimeo = secs * HZ;
+ WRITE_ONCE(sock->sk->sk_rcvtimeo, secs * HZ);
else
- sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- release_sock(sock->sk);
+ WRITE_ONCE(sock->sk->sk_rcvtimeo, MAX_SCHEDULE_TIMEOUT);
}
static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs)
@@ -87,23 +85,35 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk)
return NULL;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6)
+ memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16);
+ else
+ conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+#else
+ conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr;
+#endif
conn->transport = KSMBD_TRANS(t);
KSMBD_TRANS(t)->conn = conn;
KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops;
return t;
}
-static void free_transport(struct tcp_transport *t)
+static void ksmbd_tcp_free_transport(struct ksmbd_transport *kt)
{
- kernel_sock_shutdown(t->sock, SHUT_RDWR);
- sock_release(t->sock);
- t->sock = NULL;
+ struct tcp_transport *t = TCP_TRANS(kt);
- ksmbd_conn_free(KSMBD_TRANS(t)->conn);
+ sock_release(t->sock);
kfree(t->iov);
kfree(t);
}
+static void free_transport(struct tcp_transport *t)
+{
+ kernel_sock_shutdown(t->sock, SHUT_RDWR);
+ ksmbd_conn_free(KSMBD_TRANS(t)->conn);
+}
+
/**
* kvec_array_init() - initialize a IO vector segment
* @new: IO vector to be initialized
@@ -226,6 +236,7 @@ static int ksmbd_kthread_fn(void *p)
{
struct socket *client_sk = NULL;
struct interface *iface = (struct interface *)p;
+ struct ksmbd_conn *conn;
int ret;
while (!kthread_should_stop()) {
@@ -244,6 +255,34 @@ static int ksmbd_kthread_fn(void *p)
continue;
}
+ /*
+ * Limits repeated connections from clients with the same IP.
+ */
+ down_read(&conn_list_lock);
+ list_for_each_entry(conn, &conn_list, conns_list)
+#if IS_ENABLED(CONFIG_IPV6)
+ if (client_sk->sk->sk_family == AF_INET6) {
+ if (memcmp(&client_sk->sk->sk_v6_daddr,
+ &conn->inet6_addr, 16) == 0) {
+ ret = -EAGAIN;
+ break;
+ }
+ } else if (inet_sk(client_sk->sk)->inet_daddr ==
+ conn->inet_addr) {
+ ret = -EAGAIN;
+ break;
+ }
+#else
+ if (inet_sk(client_sk->sk)->inet_daddr ==
+ conn->inet_addr) {
+ ret = -EAGAIN;
+ break;
+ }
+#endif
+ up_read(&conn_list_lock);
+ if (ret == -EAGAIN)
+ continue;
+
if (server_conf.max_connections &&
atomic_inc_return(&active_num_conn) >= server_conf.max_connections) {
pr_info_ratelimited("Limit the maximum number of connections(%u)\n",
@@ -652,4 +691,5 @@ static const struct ksmbd_transport_ops ksmbd_tcp_transport_ops = {
.read = ksmbd_tcp_read,
.writev = ksmbd_tcp_writev,
.disconnect = ksmbd_tcp_disconnect,
+ .free_transport = ksmbd_tcp_free_transport,
};
diff --git a/fs/smb/server/transport_tcp.h b/fs/smb/server/transport_tcp.h
index 8c9aa624cfe3..1e51675ee1b2 100644
--- a/fs/smb/server/transport_tcp.h
+++ b/fs/smb/server/transport_tcp.h
@@ -8,6 +8,7 @@
int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz);
struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name);
+void ksmbd_free_transport(struct ksmbd_transport *kt);
int ksmbd_tcp_init(void);
void ksmbd_tcp_destroy(void);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 6890016e1923..04539037108c 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -4,6 +4,7 @@
* Copyright (C) 2018 Samsung Electronics Co., Ltd.
*/
+#include <crypto/sha2.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/filelock.h>
@@ -65,13 +66,12 @@ int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
return 0;
}
-static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
- char *pathname, unsigned int flags,
- struct path *parent_path,
- struct path *path)
+static int ksmbd_vfs_path_lookup(struct ksmbd_share_config *share_conf,
+ char *pathname, unsigned int flags,
+ struct path *path, bool do_lock)
{
struct qstr last;
- struct filename *filename;
+ struct filename *filename __free(putname) = NULL;
struct path *root_share_path = &share_conf->vfs_path;
int err, type;
struct dentry *d;
@@ -88,56 +88,57 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf,
return PTR_ERR(filename);
err = vfs_path_parent_lookup(filename, flags,
- parent_path, &last, &type,
+ path, &last, &type,
root_share_path);
- if (err) {
- putname(filename);
+ if (err)
return err;
- }
if (unlikely(type != LAST_NORM)) {
- path_put(parent_path);
- putname(filename);
+ path_put(path);
return -ENOENT;
}
- err = mnt_want_write(parent_path->mnt);
- if (err) {
- path_put(parent_path);
- putname(filename);
+ if (do_lock) {
+ err = mnt_want_write(path->mnt);
+ if (err) {
+ path_put(path);
+ return -ENOENT;
+ }
+
+ inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
+ d = lookup_one_qstr_excl(&last, path->dentry, 0);
+
+ if (!IS_ERR(d)) {
+ dput(path->dentry);
+ path->dentry = d;
+ return 0;
+ }
+ inode_unlock(path->dentry->d_inode);
+ mnt_drop_write(path->mnt);
+ path_put(path);
return -ENOENT;
}
- inode_lock_nested(parent_path->dentry->d_inode, I_MUTEX_PARENT);
- d = lookup_one_qstr_excl(&last, parent_path->dentry, 0);
- if (IS_ERR(d))
- goto err_out;
-
- if (d_is_negative(d)) {
+ d = lookup_noperm_unlocked(&last, path->dentry);
+ if (!IS_ERR(d) && d_is_negative(d)) {
dput(d);
- goto err_out;
+ d = ERR_PTR(-ENOENT);
}
-
+ if (IS_ERR(d)) {
+ path_put(path);
+ return -ENOENT;
+ }
+ dput(path->dentry);
path->dentry = d;
- path->mnt = mntget(parent_path->mnt);
if (test_share_config_flag(share_conf, KSMBD_SHARE_FLAG_CROSSMNT)) {
err = follow_down(path, 0);
if (err < 0) {
path_put(path);
- goto err_out;
+ return -ENOENT;
}
}
-
- putname(filename);
return 0;
-
-err_out:
- inode_unlock(d_inode(parent_path->dentry));
- mnt_drop_write(parent_path->mnt);
- path_put(parent_path);
- putname(filename);
- return -ENOENT;
}
void ksmbd_vfs_query_maximal_access(struct mnt_idmap *idmap,
@@ -211,8 +212,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
{
struct mnt_idmap *idmap;
struct path path;
- struct dentry *dentry;
- int err;
+ struct dentry *dentry, *d;
+ int err = 0;
dentry = ksmbd_vfs_kern_path_create(work, name,
LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY,
@@ -227,27 +228,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode)
idmap = mnt_idmap(path.mnt);
mode |= S_IFDIR;
- err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
- if (!err && d_unhashed(dentry)) {
- struct dentry *d;
-
- d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent,
- dentry->d_name.len);
- if (IS_ERR(d)) {
- err = PTR_ERR(d);
- goto out_err;
- }
- if (unlikely(d_is_negative(d))) {
- dput(d);
- err = -ENOENT;
- goto out_err;
- }
-
- ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d));
- dput(d);
- }
+ d = dentry;
+ dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode);
+ if (IS_ERR(dentry))
+ err = PTR_ERR(dentry);
+ else if (d_is_negative(dentry))
+ err = -ENOENT;
+ if (!err && dentry != d)
+ ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry));
-out_err:
done_path_create(&path, dentry);
if (err)
pr_err("mkdir(%s): creation failed (err:%d)\n", name, err);
@@ -309,6 +298,7 @@ static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos,
if (v_len - *pos < count)
count = v_len - *pos;
+ fp->stream.pos = v_len;
memcpy(buf, &stream_buf[*pos], count);
@@ -426,10 +416,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n",
*pos, count);
+ if (*pos >= XATTR_SIZE_MAX) {
+ pr_err("stream write position %lld is out of bounds\n", *pos);
+ return -EINVAL;
+ }
+
size = *pos + count;
if (size > XATTR_SIZE_MAX) {
size = XATTR_SIZE_MAX;
- count = (*pos + count) - XATTR_SIZE_MAX;
+ count = XATTR_SIZE_MAX - *pos;
}
v_len = ksmbd_vfs_getcasexattr(idmap,
@@ -467,8 +462,8 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos,
true);
if (err < 0)
goto out;
-
- fp->filp->f_pos = *pos;
+ else
+ fp->stream.pos = size;
err = 0;
out:
kvfree(stream_buf);
@@ -496,7 +491,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp,
int err = 0;
if (work->conn->connection_type) {
- if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) {
+ if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) ||
+ S_ISDIR(file_inode(fp->filp)->i_mode)) {
pr_err("no right to write(%pD)\n", fp->filp);
err = -EACCES;
goto out;
@@ -557,7 +553,8 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
{
int err;
- err = vfs_getattr(path, stat, STATX_BTIME, AT_STATX_SYNC_AS_STAT);
+ err = vfs_getattr(path, stat, STATX_BASIC_STATS | STATX_BTIME,
+ AT_STATX_SYNC_AS_STAT);
if (err)
pr_err("getattr failed, err %d\n", err);
return err;
@@ -693,6 +690,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
struct ksmbd_file *parent_fp;
int new_type;
int err, lookup_flags = LOOKUP_NO_SYMLINKS;
+ int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
if (ksmbd_override_fsids(work))
return -ENOMEM;
@@ -703,6 +701,14 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path,
goto revert_fsids;
}
+ /*
+ * explicitly handle file overwrite case, for compatibility with
+ * filesystems that may not support rename flags (e.g: fuse)
+ */
+ if (flags & RENAME_NOREPLACE)
+ target_lookup_flags |= LOOKUP_EXCL;
+ flags &= ~(RENAME_NOREPLACE);
+
retry:
err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH,
&new_path, &new_last, &new_type,
@@ -743,7 +749,7 @@ retry:
}
new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
- lookup_flags | LOOKUP_RENAME_TARGET);
+ lookup_flags | target_lookup_flags);
if (IS_ERR(new_dentry)) {
err = PTR_ERR(new_dentry);
goto out3;
@@ -754,16 +760,6 @@ retry:
goto out4;
}
- /*
- * explicitly handle file overwrite case, for compatibility with
- * filesystems that may not support rename flags (e.g: fuse)
- */
- if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) {
- err = -EEXIST;
- goto out4;
- }
- flags &= ~(RENAME_NOREPLACE);
-
if (old_child == trap) {
err = -EINVAL;
goto out4;
@@ -775,10 +771,10 @@ retry:
}
rd.old_mnt_idmap = mnt_idmap(old_path->mnt),
- rd.old_dir = d_inode(old_parent),
+ rd.old_parent = old_parent,
rd.old_dentry = old_child,
rd.new_mnt_idmap = mnt_idmap(new_path.mnt),
- rd.new_dir = new_path.dentry->d_inode,
+ rd.new_parent = new_path.dentry,
rd.new_dentry = new_dentry,
rd.flags = flags,
rd.delegated_inode = NULL,
@@ -1208,103 +1204,114 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
return ret;
}
-/**
- * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
- * @work: work
- * @name: file path that is relative to share
- * @flags: lookup flags
- * @parent_path: if lookup succeed, return parent_path info
- * @path: if lookup succeed, return path info
- * @caseless: caseless filename lookup
- *
- * Return: 0 on success, otherwise error
- */
-int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *parent_path,
- struct path *path, bool caseless)
+static
+int __ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless, bool do_lock)
{
struct ksmbd_share_config *share_conf = work->tcon->share_conf;
+ struct path parent_path;
+ size_t path_len, remain_len;
int err;
- err = ksmbd_vfs_path_lookup_locked(share_conf, name, flags, parent_path,
- path);
- if (!err)
- return 0;
-
- if (caseless) {
- char *filepath;
- size_t path_len, remain_len;
-
- filepath = name;
- path_len = strlen(filepath);
- remain_len = path_len;
-
- *parent_path = share_conf->vfs_path;
- path_get(parent_path);
-
- while (d_can_lookup(parent_path->dentry)) {
- char *filename = filepath + path_len - remain_len;
- char *next = strchrnul(filename, '/');
- size_t filename_len = next - filename;
- bool is_last = !next[0];
+retry:
+ err = ksmbd_vfs_path_lookup(share_conf, filepath, flags, path, do_lock);
+ if (!err || !caseless)
+ return err;
- if (filename_len == 0)
- break;
+ path_len = strlen(filepath);
+ remain_len = path_len;
- err = ksmbd_vfs_lookup_in_dir(parent_path, filename,
- filename_len,
- work->conn->um);
- if (err)
- goto out2;
+ parent_path = share_conf->vfs_path;
+ path_get(&parent_path);
- next[0] = '\0';
+ while (d_can_lookup(parent_path.dentry)) {
+ char *filename = filepath + path_len - remain_len;
+ char *next = strchrnul(filename, '/');
+ size_t filename_len = next - filename;
+ bool is_last = !next[0];
- err = vfs_path_lookup(share_conf->vfs_path.dentry,
- share_conf->vfs_path.mnt,
- filepath,
- flags,
- path);
- if (!is_last)
- next[0] = '/';
- if (err)
- goto out2;
- else if (is_last)
- goto out1;
- path_put(parent_path);
- *parent_path = *path;
+ if (filename_len == 0)
+ break;
- remain_len -= filename_len + 1;
+ err = ksmbd_vfs_lookup_in_dir(&parent_path, filename,
+ filename_len,
+ work->conn->um);
+ path_put(&parent_path);
+ if (err)
+ goto out;
+ if (is_last) {
+ caseless = false;
+ goto retry;
}
+ next[0] = '\0';
+
+ err = vfs_path_lookup(share_conf->vfs_path.dentry,
+ share_conf->vfs_path.mnt,
+ filepath,
+ flags,
+ &parent_path);
+ next[0] = '/';
+ if (err)
+ goto out;
- err = -EINVAL;
-out2:
- path_put(parent_path);
+ remain_len -= filename_len + 1;
}
-out1:
- if (!err) {
- err = mnt_want_write(parent_path->mnt);
- if (err) {
- path_put(path);
- path_put(parent_path);
- return err;
- }
-
- err = ksmbd_vfs_lock_parent(parent_path->dentry, path->dentry);
- if (err) {
- path_put(path);
- path_put(parent_path);
- }
- }
+ err = -EINVAL;
+ path_put(&parent_path);
+out:
return err;
}
-void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path)
+/**
+ * ksmbd_vfs_kern_path() - lookup a file and get path info
+ * @work: work
+ * @filepath: file path that is relative to share
+ * @flags: lookup flags
+ * @path: if lookup succeed, return path info
+ * @caseless: caseless filename lookup
+ *
+ * Perform the lookup, possibly crossing over any mount point.
+ * On return no locks will be held and write-access to filesystem
+ * won't have been checked.
+ * Return: 0 if file was found, otherwise error
+ */
+int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless)
+{
+ return __ksmbd_vfs_kern_path(work, filepath, flags, path,
+ caseless, false);
+}
+
+/**
+ * ksmbd_vfs_kern_path_locked() - lookup a file and get path info
+ * @work: work
+ * @filepath: file path that is relative to share
+ * @flags: lookup flags
+ * @path: if lookup succeed, return path info
+ * @caseless: caseless filename lookup
+ *
+ * Perform the lookup, but don't cross over any mount point.
+ * On return the parent of path->dentry will be locked and write-access to
+ * filesystem will have been gained.
+ * Return: 0 on if file was found, otherwise error
+ */
+int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath,
+ unsigned int flags,
+ struct path *path, bool caseless)
+{
+ return __ksmbd_vfs_kern_path(work, filepath, flags, path,
+ caseless, true);
+}
+
+void ksmbd_vfs_kern_path_unlock(struct path *path)
{
- inode_unlock(d_inode(parent_path->dentry));
- mnt_drop_write(parent_path->mnt);
+ /* While lock is still held, ->d_parent is safe */
+ inode_unlock(d_inode(path->dentry->d_parent));
+ mnt_drop_write(path->mnt);
path_put(path);
- path_put(parent_path);
}
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
@@ -1488,11 +1495,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
acl.sd_buf = (char *)pntsd;
acl.sd_size = len;
- rc = ksmbd_gen_sd_hash(conn, acl.sd_buf, acl.sd_size, acl.hash);
- if (rc) {
- pr_err("failed to generate hash for ndr acl\n");
- return rc;
- }
+ sha256(acl.sd_buf, acl.sd_size, acl.hash);
smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode,
ACL_TYPE_ACCESS);
@@ -1507,12 +1510,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn,
goto out;
}
- rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset,
- acl.posix_acl_hash);
- if (rc) {
- pr_err("failed to generate hash for ndr acl\n");
- goto out;
- }
+ sha256(acl_ndr.data, acl_ndr.offset, acl.posix_acl_hash);
rc = ndr_encode_v4_ntacl(&sd_ndr, &acl);
if (rc) {
@@ -1569,11 +1567,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn,
goto out_free;
}
- rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, cmp_hash);
- if (rc) {
- pr_err("failed to generate hash for ndr acl\n");
- goto out_free;
- }
+ sha256(acl_ndr.data, acl_ndr.offset, cmp_hash);
if (memcmp(cmp_hash, acl.posix_acl_hash, XATTR_SD_HASH_SIZE)) {
pr_err("hash value diff\n");
diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h
index 2893f59803a6..d47472f3e30b 100644
--- a/fs/smb/server/vfs.h
+++ b/fs/smb/server/vfs.h
@@ -117,10 +117,13 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name,
int ksmbd_vfs_remove_xattr(struct mnt_idmap *idmap,
const struct path *path, char *attr_name,
bool get_write);
+int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name,
+ unsigned int flags,
+ struct path *path, bool caseless);
int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name,
- unsigned int flags, struct path *parent_path,
+ unsigned int flags,
struct path *path, bool caseless);
-void ksmbd_vfs_kern_path_unlock(struct path *parent_path, struct path *path);
+void ksmbd_vfs_kern_path_unlock(struct path *path);
struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work,
const char *name,
unsigned int flags,
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 8d1f30dcba7e..dfed6fce8904 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -661,21 +661,40 @@ __close_file_table_ids(struct ksmbd_file_table *ft,
bool (*skip)(struct ksmbd_tree_connect *tcon,
struct ksmbd_file *fp))
{
- unsigned int id;
- struct ksmbd_file *fp;
- int num = 0;
+ struct ksmbd_file *fp;
+ unsigned int id = 0;
+ int num = 0;
+
+ while (1) {
+ write_lock(&ft->lock);
+ fp = idr_get_next(ft->idr, &id);
+ if (!fp) {
+ write_unlock(&ft->lock);
+ break;
+ }
- idr_for_each_entry(ft->idr, fp, id) {
- if (skip(tcon, fp))
+ if (skip(tcon, fp) ||
+ !atomic_dec_and_test(&fp->refcount)) {
+ id++;
+ write_unlock(&ft->lock);
continue;
+ }
set_close_state_blocked_works(fp);
+ idr_remove(ft->idr, fp->volatile_id);
+ fp->volatile_id = KSMBD_NO_FID;
+ write_unlock(&ft->lock);
+
+ down_write(&fp->f_ci->m_lock);
+ list_del_init(&fp->node);
+ up_write(&fp->f_ci->m_lock);
- if (!atomic_dec_and_test(&fp->refcount))
- continue;
__ksmbd_close_fd(ft, fp);
+
num++;
+ id++;
}
+
return num;
}
@@ -713,12 +732,8 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
static bool ksmbd_durable_scavenger_alive(void)
{
- mutex_lock(&durable_scavenger_lock);
- if (!durable_scavenger_running) {
- mutex_unlock(&durable_scavenger_lock);
+ if (!durable_scavenger_running)
return false;
- }
- mutex_unlock(&durable_scavenger_lock);
if (kthread_should_stop())
return false;
@@ -799,9 +814,7 @@ static int ksmbd_durable_scavenger(void *dummy)
break;
}
- mutex_lock(&durable_scavenger_lock);
durable_scavenger_running = false;
- mutex_unlock(&durable_scavenger_lock);
module_put(THIS_MODULE);
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index 5bbb179736c2..0708155b5caf 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -44,6 +44,7 @@ struct ksmbd_lock {
struct stream {
char *name;
ssize_t size;
+ loff_t pos;
};
struct ksmbd_inode {
diff --git a/fs/splice.c b/fs/splice.c
index 28cfa63aa236..f5094b6d00a0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -45,7 +45,7 @@
* here if set to avoid blocking other users of this pipe if splice is
* being done on it.
*/
-static noinline void noinline pipe_clear_nowait(struct file *file)
+static noinline void pipe_clear_nowait(struct file *file)
{
fmode_t fmode = READ_ONCE(file->f_mode);
@@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int spd_pages = spd->nr_pages;
unsigned int tail = pipe->tail;
unsigned int head = pipe->head;
- unsigned int mask = pipe->ring_size - 1;
ssize_t ret = 0;
int page_nr = 0;
@@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
}
while (!pipe_full(head, tail, pipe->max_usage)) {
- struct pipe_buffer *buf = &pipe->bufs[head & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, head);
buf->page = spd->pages[page_nr];
buf->offset = spd->partial[page_nr].offset;
@@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
int ret;
if (unlikely(!pipe->readers)) {
@@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
} else if (pipe_full(head, tail, pipe->max_usage)) {
ret = -EAGAIN;
} else {
- pipe->bufs[head & mask] = *buf;
+ *pipe_buf(pipe, head) = *buf;
pipe->head = head + 1;
return buf->len;
}
@@ -331,7 +329,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos,
int i;
/* Work out how much data we can actually add into the pipe */
- used = pipe_occupancy(pipe->head, pipe->tail);
+ used = pipe_buf_usage(pipe);
npages = max_t(ssize_t, pipe->max_usage - used, 0);
len = min_t(size_t, len, npages * PAGE_SIZE);
npages = DIV_ROUND_UP(len, PAGE_SIZE);
@@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
{
unsigned int head = pipe->head;
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
int ret;
while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
sd->len = buf->len;
if (sd->len > sd->total_len)
@@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des
static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
{
unsigned int tail = pipe->tail;
- unsigned int mask = pipe->ring_size - 1;
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
if (unlikely(!buf->len)) {
pipe_buf_release(pipe, buf);
@@ -527,7 +523,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des
return -ERESTARTSYS;
repeat:
- while (pipe_empty(pipe->head, pipe->tail)) {
+ while (pipe_is_empty(pipe)) {
if (!pipe->writers)
return 0;
@@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
while (sd.total_len) {
struct kiocb kiocb;
struct iov_iter from;
- unsigned int head, tail, mask;
+ unsigned int head, tail;
size_t left;
int n;
@@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
/* build the vector */
left = sd.total_len;
for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t this_len = buf->len;
/* zero-length bvecs are not supported, skip them */
@@ -744,6 +739,9 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
+ WARN_ONCE(ret > sd.total_len - left,
+ "Splice Exceeded! ret=%zd tot=%zu left=%zu\n",
+ ret, sd.total_len, left);
sd.num_spliced += ret;
sd.total_len -= ret;
@@ -752,7 +750,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
/* dismiss the fully eaten buffers, adjust the partial one */
tail = pipe->tail;
while (ret) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
if (ret >= buf->len) {
ret -= buf->len;
buf->len = 0;
@@ -809,7 +807,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
pipe_lock(pipe);
while (len > 0) {
- unsigned int head, tail, mask, bc = 0;
+ unsigned int head, tail, bc = 0;
size_t remain = len;
/*
@@ -820,7 +818,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
if (signal_pending(current))
break;
- while (pipe_empty(pipe->head, pipe->tail)) {
+ while (pipe_is_empty(pipe)) {
ret = 0;
if (!pipe->writers)
goto out;
@@ -846,10 +844,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
head = pipe->head;
tail = pipe->tail;
- mask = pipe->ring_size - 1;
while (!pipe_empty(head, tail)) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t seg;
if (!buf->len) {
@@ -894,7 +891,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out,
len -= ret;
tail = pipe->tail;
while (ret > 0) {
- struct pipe_buffer *buf = &pipe->bufs[tail & mask];
+ struct pipe_buffer *buf = pipe_buf(pipe, tail);
size_t seg = min_t(size_t, ret, buf->len);
buf->offset += seg;
@@ -968,7 +965,7 @@ static ssize_t do_splice_read(struct file *in, loff_t *ppos,
return 0;
/* Don't try to read more the pipe has space for. */
- p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
+ p_space = pipe->max_usage - pipe_buf_usage(pipe);
len = min_t(size_t, len, p_space << PAGE_SHIFT);
if (unlikely(len > MAX_RW_COUNT))
@@ -1080,7 +1077,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
more = sd->flags & SPLICE_F_MORE;
sd->flags |= SPLICE_F_MORE;
- WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
+ WARN_ON_ONCE(!pipe_is_empty(pipe));
while (len) {
size_t read_len;
@@ -1268,7 +1265,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
send_sig(SIGPIPE, current, 0);
return -EPIPE;
}
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (!pipe_is_full(pipe))
return 0;
if (flags & SPLICE_F_NONBLOCK)
return -EAGAIN;
@@ -1652,13 +1649,13 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check the pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (!pipe_empty(pipe->head, pipe->tail))
+ if (!pipe_is_empty(pipe))
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe_empty(pipe->head, pipe->tail)) {
+ while (pipe_is_empty(pipe)) {
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
@@ -1688,13 +1685,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check pipe occupancy without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+ if (!pipe_is_full(pipe))
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
+ while (pipe_is_full(pipe)) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1725,7 +1722,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
int ret = 0;
bool input_wakeup = false;
@@ -1747,9 +1743,7 @@ retry:
pipe_double_lock(ipipe, opipe);
i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
do {
size_t o_len;
@@ -1792,8 +1786,8 @@ retry:
goto retry;
}
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_buf(ipipe, i_tail);
+ obuf = pipe_buf(opipe, o_head);
if (len >= ibuf->len) {
/*
@@ -1862,7 +1856,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
- unsigned int i_mask, o_mask;
ssize_t ret = 0;
/*
@@ -1873,9 +1866,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
pipe_double_lock(ipipe, opipe);
i_tail = ipipe->tail;
- i_mask = ipipe->ring_size - 1;
o_head = opipe->head;
- o_mask = opipe->ring_size - 1;
do {
if (!opipe->readers) {
@@ -1896,8 +1887,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe,
pipe_full(o_head, o_tail, opipe->max_usage))
break;
- ibuf = &ipipe->bufs[i_tail & i_mask];
- obuf = &opipe->bufs[o_head & o_mask];
+ ibuf = pipe_buf(ipipe, i_tail);
+ obuf = pipe_buf(opipe, o_head);
/*
* Get a reference to this pipe buffer,
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index b1091e70434a..a9602aae21ef 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -149,6 +149,27 @@ config SQUASHFS_XATTR
If unsure, say N.
+config SQUASHFS_COMP_CACHE_FULL
+ bool "Enable full caching of compressed blocks"
+ depends on SQUASHFS
+ default n
+ help
+ This option enables caching of all compressed blocks, Without caching,
+ repeated reads of the same files trigger excessive disk I/O, significantly
+ reducinng performance in workloads like fio-based benchmarks.
+
+ For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show:
+ With caching: IOPS=2223, BW=278MiB/s (291MB/s)
+ Without caching: IOPS=815, BW=102MiB/s (107MB/s)
+
+ Enabling this option restores performance to pre-regression levels by
+ caching all compressed blocks in the page cache, reducing disk I/O for
+ repeated reads. However, this increases memory usage, which may be a
+ concern in memory-constrained environments.
+
+ Enable this option if your workload involves frequent repeated reads and
+ memory usage is not a limiting factor. If unsure, say N.
+
config SQUASHFS_ZLIB
bool "Include support for ZLIB compressed file systems"
depends on SQUASHFS
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 2dc730800f44..b69c294e3ef0 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,19 +80,22 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
struct address_space *cache_mapping, u64 index, int length,
u64 read_start, u64 read_end, int page_count)
{
- struct page *head_to_cache = NULL, *tail_to_cache = NULL;
+ struct folio *head_to_cache = NULL, *tail_to_cache = NULL;
struct block_device *bdev = fullbio->bi_bdev;
int start_idx = 0, end_idx = 0;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
struct bio *bio = NULL;
- struct bio_vec *bv;
int idx = 0;
int err = 0;
+#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
+ struct folio **cache_folios = kmalloc_array(page_count,
+ sizeof(*cache_folios), GFP_KERNEL | __GFP_ZERO);
+#endif
- bio_for_each_segment_all(bv, fullbio, iter_all) {
- struct page *page = bv->bv_page;
+ bio_for_each_folio_all(fi, fullbio) {
+ struct folio *folio = fi.folio;
- if (page->mapping == cache_mapping) {
+ if (folio->mapping == cache_mapping) {
idx++;
continue;
}
@@ -107,9 +110,14 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
* adjacent blocks.
*/
if (idx == 0 && index != read_start)
- head_to_cache = page;
+ head_to_cache = folio;
else if (idx == page_count - 1 && index + length != read_end)
- tail_to_cache = page;
+ tail_to_cache = folio;
+#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
+ /* Cache all pages in the BIO for repeated reads */
+ else if (cache_folios)
+ cache_folios[idx] = folio;
+#endif
if (!bio || idx != end_idx) {
struct bio *new = bio_alloc_clone(bdev, fullbio,
@@ -141,28 +149,47 @@ static int squashfs_bio_read_cached(struct bio *fullbio,
return err;
if (head_to_cache) {
- int ret = add_to_page_cache_lru(head_to_cache, cache_mapping,
+ int ret = filemap_add_folio(cache_mapping, head_to_cache,
read_start >> PAGE_SHIFT,
GFP_NOIO);
if (!ret) {
- SetPageUptodate(head_to_cache);
- unlock_page(head_to_cache);
+ folio_mark_uptodate(head_to_cache);
+ folio_unlock(head_to_cache);
}
}
if (tail_to_cache) {
- int ret = add_to_page_cache_lru(tail_to_cache, cache_mapping,
+ int ret = filemap_add_folio(cache_mapping, tail_to_cache,
(read_end >> PAGE_SHIFT) - 1,
GFP_NOIO);
if (!ret) {
- SetPageUptodate(tail_to_cache);
- unlock_page(tail_to_cache);
+ folio_mark_uptodate(tail_to_cache);
+ folio_unlock(tail_to_cache);
}
}
+#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL
+ if (!cache_folios)
+ goto out;
+
+ for (idx = 0; idx < page_count; idx++) {
+ if (!cache_folios[idx])
+ continue;
+ int ret = filemap_add_folio(cache_mapping, cache_folios[idx],
+ (read_start >> PAGE_SHIFT) + idx,
+ GFP_NOIO);
+
+ if (!ret) {
+ folio_mark_uptodate(cache_folios[idx]);
+ folio_unlock(cache_folios[idx]);
+ }
+ }
+ kfree(cache_folios);
+out:
+#endif
return 0;
}
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 4db0d2b0aab8..181260e72680 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -198,7 +198,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
{
int i, j;
- if (cache == NULL)
+ if (IS_ERR(cache) || cache == NULL)
return;
for (i = 0; i < cache->entries; i++) {
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 5ca2baa16dc2..ce7d661d5ad8 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -493,10 +493,9 @@ out:
return res;
}
-static int squashfs_readahead_fragment(struct page **page,
+static int squashfs_readahead_fragment(struct inode *inode, struct page **page,
unsigned int pages, unsigned int expected, loff_t start)
{
- struct inode *inode = page[0]->mapping->host;
struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
squashfs_i(inode)->fragment_block,
squashfs_i(inode)->fragment_size);
@@ -605,8 +604,8 @@ static void squashfs_readahead(struct readahead_control *ractl)
if (start >> msblk->block_log == file_end &&
squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) {
- res = squashfs_readahead_fragment(pages, nr_pages,
- expected, start);
+ res = squashfs_readahead_fragment(inode, pages,
+ nr_pages, expected, start);
if (res)
goto skip_pages;
continue;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 67c55fe32ce8..4465cf05603a 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -187,10 +187,15 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
unsigned short flags;
unsigned int fragments;
u64 lookup_table_start, xattr_id_table_start, next_table;
- int err;
+ int err, devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
TRACE("Entered squashfs_fill_superblock\n");
+ if (!devblksize) {
+ errorf(fc, "squashfs: unable to set blocksize\n");
+ return -EINVAL;
+ }
+
sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
ERROR("Failed to allocate squashfs_sb_info\n");
@@ -201,7 +206,7 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc)
msblk->panic_on_errors = (opts->errors == Opt_errors_panic);
- msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
+ msblk->devblksize = devblksize;
msblk->devblksize_log2 = ffz(~msblk->devblksize);
mutex_init(&msblk->meta_index_mutex);
diff --git a/fs/stack.c b/fs/stack.c
index f18920119944..d8c782e064e3 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -3,7 +3,7 @@
#include <linux/fs.h>
#include <linux/fs_stack.h>
-/* does _NOT_ require i_mutex to be held.
+/* does _NOT_ require i_rwsem to be held.
*
* This function cannot be inlined since i_size_{read,write} is rather
* heavy-weight on 32-bit systems
@@ -41,7 +41,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
* If CONFIG_SMP or CONFIG_PREEMPTION on 32-bit, it's vital for
* fsstack_copy_inode_size() to hold some lock around
* i_size_write(), otherwise i_size_read() may spin forever (see
- * include/linux/fs.h). We don't necessarily hold i_mutex when this
+ * include/linux/fs.h). We don't necessarily hold i_rwsem when this
* is called, so take i_lock for that case.
*
* And if on 32-bit, continue our effort to keep the two halves of
diff --git a/fs/stat.c b/fs/stat.c
index 2c0e111a098a..f95c1dc3eaa4 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr);
* @stat: Where to fill in the attribute flags
* @unit_min: Minimum supported atomic write length in bytes
* @unit_max: Maximum supported atomic write length in bytes
+ * @unit_max_opt: Optimised maximum supported atomic write length in bytes
*
* Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
* atomic write unit_min and unit_max values.
*/
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
- unsigned int unit_max)
+ unsigned int unit_max,
+ unsigned int unit_max_opt)
{
/* Confirm that the request type is known */
stat->result_mask |= STATX_WRITE_ATOMIC;
@@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat,
if (unit_min) {
stat->atomic_write_unit_min = unit_min;
stat->atomic_write_unit_max = unit_max;
+ stat->atomic_write_unit_max_opt = unit_max_opt;
/* Initially only allow 1x segment */
stat->atomic_write_segments_max = 1;
@@ -204,12 +207,25 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
STATX_ATTR_DAX);
idmap = mnt_idmap(path->mnt);
- if (inode->i_op->getattr)
- return inode->i_op->getattr(idmap, path, stat,
- request_mask,
- query_flags);
+ if (inode->i_op->getattr) {
+ int ret;
+
+ ret = inode->i_op->getattr(idmap, path, stat, request_mask,
+ query_flags);
+ if (ret)
+ return ret;
+ } else {
+ generic_fillattr(idmap, request_mask, inode, stat);
+ }
+
+ /*
+ * If this is a block device inode, override the filesystem attributes
+ * with the block device specific parameters that need to be obtained
+ * from the bdev backing inode.
+ */
+ if (S_ISBLK(stat->mode))
+ bdev_statx(path, stat, request_mask);
- generic_fillattr(idmap, request_mask, inode, stat);
return 0;
}
EXPORT_SYMBOL(vfs_getattr_nosec);
@@ -241,7 +257,7 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
int retval;
retval = security_inode_getattr(path);
- if (retval)
+ if (unlikely(retval))
return retval;
return vfs_getattr_nosec(path, stat, request_mask, query_flags);
}
@@ -281,6 +297,8 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
u32 request_mask)
{
int error = vfs_getattr(path, stat, request_mask, flags);
+ if (error)
+ return error;
if (request_mask & STATX_MNT_ID_UNIQUE) {
stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
@@ -293,16 +311,7 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
if (path_mounted(path))
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
-
- /*
- * If this is a block device inode, override the filesystem
- * attributes with the block device specific parameters that need to be
- * obtained from the bdev backing inode.
- */
- if (S_ISBLK(stat->mode))
- bdev_statx(path, stat, request_mask);
-
- return error;
+ return 0;
}
static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
@@ -419,7 +428,7 @@ SYSCALL_DEFINE2(stat, const char __user *, filename,
int error;
error = vfs_stat(filename, &stat);
- if (error)
+ if (unlikely(error))
return error;
return cp_old_stat(&stat, statbuf);
@@ -432,7 +441,7 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename,
int error;
error = vfs_lstat(filename, &stat);
- if (error)
+ if (unlikely(error))
return error;
return cp_old_stat(&stat, statbuf);
@@ -441,12 +450,13 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename,
SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
{
struct kstat stat;
- int error = vfs_fstat(fd, &stat);
+ int error;
- if (!error)
- error = cp_old_stat(&stat, statbuf);
+ error = vfs_fstat(fd, &stat);
+ if (unlikely(error))
+ return error;
- return error;
+ return cp_old_stat(&stat, statbuf);
}
#endif /* __ARCH_WANT_OLD_STAT */
@@ -500,10 +510,12 @@ SYSCALL_DEFINE2(newstat, const char __user *, filename,
struct stat __user *, statbuf)
{
struct kstat stat;
- int error = vfs_stat(filename, &stat);
+ int error;
- if (error)
+ error = vfs_stat(filename, &stat);
+ if (unlikely(error))
return error;
+
return cp_new_stat(&stat, statbuf);
}
@@ -514,7 +526,7 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename,
int error;
error = vfs_lstat(filename, &stat);
- if (error)
+ if (unlikely(error))
return error;
return cp_new_stat(&stat, statbuf);
@@ -528,8 +540,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
int error;
error = vfs_fstatat(dfd, filename, &stat, flag);
- if (error)
+ if (unlikely(error))
return error;
+
return cp_new_stat(&stat, statbuf);
}
#endif
@@ -537,12 +550,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
{
struct kstat stat;
- int error = vfs_fstat(fd, &stat);
+ int error;
- if (!error)
- error = cp_new_stat(&stat, statbuf);
+ error = vfs_fstat(fd, &stat);
+ if (unlikely(error))
+ return error;
- return error;
+ return cp_new_stat(&stat, statbuf);
}
#endif
@@ -730,6 +744,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
+ tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/fs/super.c b/fs/super.c
index 5a7db4a556e3..7f876f32343a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -39,7 +39,8 @@
#include <uapi/linux/mount.h>
#include "internal.h"
-static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner);
static LIST_HEAD(super_blocks);
static DEFINE_SPINLOCK(sb_lock);
@@ -201,7 +202,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
- total_objects = dentries + inodes + fs_objects + 1;
+ total_objects = dentries + inodes + fs_objects;
if (!total_objects)
total_objects = 1;
@@ -823,13 +824,6 @@ struct super_block *sget(struct file_system_type *type,
struct super_block *old;
int err;
- /* We don't yet pass the user namespace of the parent
- * mount through to here so always use &init_user_ns
- * until that changes.
- */
- if (flags & SB_SUBMOUNT)
- user_ns = &init_user_ns;
-
retry:
spin_lock(&sb_lock);
if (test) {
@@ -849,7 +843,7 @@ retry:
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
+ s = alloc_super(type, flags, user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
@@ -887,52 +881,48 @@ void drop_super_exclusive(struct super_block *sb)
}
EXPORT_SYMBOL(drop_super_exclusive);
-static void __iterate_supers(void (*f)(struct super_block *))
-{
- struct super_block *sb, *p = NULL;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (super_flags(sb, SB_DYING))
- continue;
- sb->s_count++;
- spin_unlock(&sb_lock);
+enum super_iter_flags_t {
+ SUPER_ITER_EXCL = (1U << 0),
+ SUPER_ITER_UNLOCKED = (1U << 1),
+ SUPER_ITER_REVERSE = (1U << 2),
+};
- f(sb);
+static inline struct super_block *first_super(enum super_iter_flags_t flags)
+{
+ if (flags & SUPER_ITER_REVERSE)
+ return list_last_entry(&super_blocks, struct super_block, s_list);
+ return list_first_entry(&super_blocks, struct super_block, s_list);
+}
- spin_lock(&sb_lock);
- if (p)
- __put_super(p);
- p = sb;
- }
- if (p)
- __put_super(p);
- spin_unlock(&sb_lock);
+static inline struct super_block *next_super(struct super_block *sb,
+ enum super_iter_flags_t flags)
+{
+ if (flags & SUPER_ITER_REVERSE)
+ return list_prev_entry(sb, s_list);
+ return list_next_entry(sb, s_list);
}
-/**
- * iterate_supers - call function for all active superblocks
- * @f: function to call
- * @arg: argument to pass to it
- *
- * Scans the superblock list and calls given function, passing it
- * locked superblock and given argument.
- */
-void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+
+static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
+ enum super_iter_flags_t flags)
{
struct super_block *sb, *p = NULL;
+ bool excl = flags & SUPER_ITER_EXCL;
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- bool locked;
+ guard(spinlock)(&sb_lock);
+ for (sb = first_super(flags);
+ !list_entry_is_head(sb, &super_blocks, s_list);
+ sb = next_super(sb, flags)) {
+ if (super_flags(sb, SB_DYING))
+ continue;
sb->s_count++;
spin_unlock(&sb_lock);
- locked = super_lock_shared(sb);
- if (locked) {
- if (sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
+ if (flags & SUPER_ITER_UNLOCKED) {
+ f(sb, arg);
+ } else if (super_lock(sb, excl)) {
+ f(sb, arg);
+ super_unlock(sb, excl);
}
spin_lock(&sb_lock);
@@ -942,7 +932,11 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
}
if (p)
__put_super(p);
- spin_unlock(&sb_lock);
+}
+
+void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
+{
+ __iterate_supers(f, arg, 0);
}
/**
@@ -963,13 +957,15 @@ void iterate_supers_type(struct file_system_type *type,
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
bool locked;
+ if (super_flags(sb, SB_DYING))
+ continue;
+
sb->s_count++;
spin_unlock(&sb_lock);
locked = super_lock_shared(sb);
if (locked) {
- if (sb->s_root)
- f(sb, arg);
+ f(sb, arg);
super_unlock_shared(sb);
}
@@ -991,23 +987,21 @@ struct super_block *user_get_super(dev_t dev, bool excl)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_dev == dev) {
- bool locked;
-
- sb->s_count++;
- spin_unlock(&sb_lock);
- /* still alive? */
- locked = super_lock(sb, excl);
- if (locked) {
- if (sb->s_root)
- return sb;
- super_unlock(sb, excl);
- }
- /* nope, got unmounted */
- spin_lock(&sb_lock);
- __put_super(sb);
- break;
- }
+ bool locked;
+
+ if (sb->s_dev != dev)
+ continue;
+
+ sb->s_count++;
+ spin_unlock(&sb_lock);
+
+ locked = super_lock(sb, excl);
+ if (locked)
+ return sb;
+
+ spin_lock(&sb_lock);
+ __put_super(sb);
+ break;
}
spin_unlock(&sb_lock);
return NULL;
@@ -1111,11 +1105,9 @@ cancel_readonly:
return retval;
}
-static void do_emergency_remount_callback(struct super_block *sb)
+static void do_emergency_remount_callback(struct super_block *sb, void *unused)
{
- bool locked = super_lock_excl(sb);
-
- if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
+ if (sb->s_bdev && !sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
@@ -1126,13 +1118,12 @@ static void do_emergency_remount_callback(struct super_block *sb)
put_fs_context(fc);
}
}
- if (locked)
- super_unlock_excl(sb);
}
static void do_emergency_remount(struct work_struct *work)
{
- __iterate_supers(do_emergency_remount_callback);
+ __iterate_supers(do_emergency_remount_callback, NULL,
+ SUPER_ITER_EXCL | SUPER_ITER_REVERSE);
kfree(work);
printk("Emergency Remount complete\n");
}
@@ -1148,24 +1139,18 @@ void emergency_remount(void)
}
}
-static void do_thaw_all_callback(struct super_block *sb)
+static void do_thaw_all_callback(struct super_block *sb, void *unused)
{
- bool locked = super_lock_excl(sb);
-
- if (locked && sb->s_root) {
- if (IS_ENABLED(CONFIG_BLOCK))
- while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
- pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
- thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
- return;
- }
- if (locked)
- super_unlock_excl(sb);
+ if (IS_ENABLED(CONFIG_BLOCK))
+ while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
+ pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
+ thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE, NULL);
+ return;
}
static void do_thaw_all(struct work_struct *work)
{
- __iterate_supers(do_thaw_all_callback);
+ __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL);
kfree(work);
printk(KERN_WARNING "Emergency Thaw complete\n");
}
@@ -1186,6 +1171,66 @@ void emergency_thaw_all(void)
}
}
+static inline bool get_active_super(struct super_block *sb)
+{
+ bool active = false;
+
+ if (super_lock_excl(sb)) {
+ active = atomic_inc_not_zero(&sb->s_active);
+ super_unlock_excl(sb);
+ }
+ return active;
+}
+
+static const char *filesystems_freeze_ptr = "filesystems_freeze";
+
+static void filesystems_freeze_callback(struct super_block *sb, void *unused)
+{
+ if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
+ return;
+
+ if (!get_active_super(sb))
+ return;
+
+ if (sb->s_op->freeze_super)
+ sb->s_op->freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+ else
+ freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+
+ deactivate_super(sb);
+}
+
+void filesystems_freeze(void)
+{
+ __iterate_supers(filesystems_freeze_callback, NULL,
+ SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE);
+}
+
+static void filesystems_thaw_callback(struct super_block *sb, void *unused)
+{
+ if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super)
+ return;
+
+ if (!get_active_super(sb))
+ return;
+
+ if (sb->s_op->thaw_super)
+ sb->s_op->thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+ else
+ thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL,
+ filesystems_freeze_ptr);
+
+ deactivate_super(sb);
+}
+
+void filesystems_thaw(void)
+{
+ __iterate_supers(filesystems_thaw_callback, NULL, SUPER_ITER_UNLOCKED);
+}
+
static DEFINE_IDA(unnamed_dev_ida);
/**
@@ -1414,10 +1459,21 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
if (!sb)
return;
+ if (sb->s_op->remove_bdev) {
+ int ret;
+
+ ret = sb->s_op->remove_bdev(sb, bdev);
+ if (!ret) {
+ super_unlock_shared(sb);
+ return;
+ }
+ /* Fallback to shutdown. */
+ }
+
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
- invalidate_inodes(sb);
+ evict_inodes(sb);
if (sb->s_op->shutdown)
sb->s_op->shutdown(sb);
@@ -1479,10 +1535,10 @@ static int fs_bdev_freeze(struct block_device *bdev)
if (sb->s_op->freeze_super)
error = sb->s_op->freeze_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
else
error = freeze_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
if (!error)
error = sync_blockdev(bdev);
deactivate_super(sb);
@@ -1528,10 +1584,10 @@ static int fs_bdev_thaw(struct block_device *bdev)
if (sb->s_op->thaw_super)
error = sb->s_op->thaw_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
else
error = thaw_super(sb,
- FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
deactivate_super(sb);
return error;
}
@@ -1737,61 +1793,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
}
EXPORT_SYMBOL(mount_nodev);
-int reconfigure_single(struct super_block *s,
- int flags, void *data)
-{
- struct fs_context *fc;
- int ret;
-
- /* The caller really need to be passing fc down into mount_single(),
- * then a chunk of this can be removed. [Bollocks -- AV]
- * Better yet, reconfiguration shouldn't happen, but rather the second
- * mount should be rejected if the parameters are not compatible.
- */
- fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK);
- if (IS_ERR(fc))
- return PTR_ERR(fc);
-
- ret = parse_monolithic_mount_data(fc, data);
- if (ret < 0)
- goto out;
-
- ret = reconfigure_super(fc);
-out:
- put_fs_context(fc);
- return ret;
-}
-
-static int compare_single(struct super_block *s, void *p)
-{
- return 1;
-}
-
-struct dentry *mount_single(struct file_system_type *fs_type,
- int flags, void *data,
- int (*fill_super)(struct super_block *, void *, int))
-{
- struct super_block *s;
- int error;
-
- s = sget(fs_type, compare_single, set_anon_super, flags, NULL);
- if (IS_ERR(s))
- return ERR_CAST(s);
- if (!s->s_root) {
- error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
- if (!error)
- s->s_flags |= SB_ACTIVE;
- } else {
- error = reconfigure_single(s, flags, data);
- }
- if (unlikely(error)) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
- return dget(s->s_root);
-}
-EXPORT_SYMBOL(mount_single);
-
/**
* vfs_get_tree - Get the mountable root
* @fc: The superblock configuration context.
@@ -1958,7 +1959,7 @@ static int wait_for_partially_frozen(struct super_block *sb)
}
#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
-#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)
+#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST | FREEZE_EXCL)
static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
{
@@ -1984,11 +1985,34 @@ static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
}
-static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
+static inline bool may_freeze(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
+ lockdep_assert_held(&sb->s_umount);
+
WARN_ON_ONCE((who & ~FREEZE_FLAGS));
WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+ if (who & FREEZE_EXCL) {
+ if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(!freeze_owner))
+ return false;
+ /* This freeze already has a specific owner. */
+ if (sb->s_writers.freeze_owner)
+ return false;
+ /*
+ * This is already frozen multiple times so we're just
+ * going to take a reference count and mark the freeze as
+ * being owned by the caller.
+ */
+ if (sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount)
+ sb->s_writers.freeze_owner = freeze_owner;
+ return true;
+ }
+
if (who & FREEZE_HOLDER_KERNEL)
return (who & FREEZE_MAY_NEST) ||
sb->s_writers.freeze_kcount == 0;
@@ -1998,10 +2022,61 @@ static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
return false;
}
+static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
+{
+ lockdep_assert_held(&sb->s_umount);
+
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if (who & FREEZE_EXCL) {
+ if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL)))
+ return false;
+ if (WARN_ON_ONCE(!freeze_owner))
+ return false;
+ if (WARN_ON_ONCE(sb->s_writers.freeze_kcount == 0))
+ return false;
+ /* This isn't exclusively frozen. */
+ if (!sb->s_writers.freeze_owner)
+ return false;
+ /* This isn't exclusively frozen by us. */
+ if (sb->s_writers.freeze_owner != freeze_owner)
+ return false;
+ /*
+ * This is still frozen multiple times so we're just
+ * going to drop our reference count and undo our
+ * exclusive freeze.
+ */
+ if ((sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) > 1)
+ sb->s_writers.freeze_owner = NULL;
+ return true;
+ }
+
+ if (who & FREEZE_HOLDER_KERNEL) {
+ /*
+ * Someone's trying to steal the reference belonging to
+ * @sb->s_writers.freeze_owner.
+ */
+ if (sb->s_writers.freeze_kcount == 1 &&
+ sb->s_writers.freeze_owner)
+ return false;
+ return sb->s_writers.freeze_kcount > 0;
+ }
+
+ if (who & FREEZE_HOLDER_USERSPACE)
+ return sb->s_writers.freeze_ucount > 0;
+
+ return false;
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
* @who: context that wants to freeze
+ * @freeze_owner: owner of the freeze
*
* Syncs the super to make sure the filesystem is consistent and calls the fs's
* freeze_fs. Subsequent calls to this without first thawing the fs may return
@@ -2053,7 +2128,7 @@ static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
* Return: If the freeze was successful zero is returned. If the freeze
* failed a negative error code is returned.
*/
-int freeze_super(struct super_block *sb, enum freeze_holder who)
+int freeze_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner)
{
int ret;
@@ -2065,7 +2140,7 @@ int freeze_super(struct super_block *sb, enum freeze_holder who)
retry:
if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
- if (may_freeze(sb, who))
+ if (may_freeze(sb, who, freeze_owner))
ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
else
ret = -EBUSY;
@@ -2087,6 +2162,7 @@ retry:
if (sb_rdonly(sb)) {
/* Nothing to do really... */
WARN_ON_ONCE(freeze_inc(sb, who) > 1);
+ sb->s_writers.freeze_owner = freeze_owner;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
super_unlock_excl(sb);
@@ -2134,6 +2210,7 @@ retry:
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
WARN_ON_ONCE(freeze_inc(sb, who) > 1);
+ sb->s_writers.freeze_owner = freeze_owner;
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
lockdep_sb_freeze_release(sb);
@@ -2148,13 +2225,17 @@ EXPORT_SYMBOL(freeze_super);
* removes that state without releasing the other state or unlocking the
* filesystem.
*/
-static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
+static int thaw_super_locked(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
int error = -EINVAL;
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
goto out_unlock;
+ if (!may_unfreeze(sb, who, freeze_owner))
+ goto out_unlock;
+
/*
* All freezers share a single active reference.
* So just unlock in case there are any left.
@@ -2164,6 +2245,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
if (sb_rdonly(sb)) {
sb->s_writers.frozen = SB_UNFROZEN;
+ sb->s_writers.freeze_owner = NULL;
wake_up_var(&sb->s_writers.frozen);
goto out_deactivate;
}
@@ -2181,6 +2263,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
}
sb->s_writers.frozen = SB_UNFROZEN;
+ sb->s_writers.freeze_owner = NULL;
wake_up_var(&sb->s_writers.frozen);
sb_freeze_unlock(sb, SB_FREEZE_FS);
out_deactivate:
@@ -2196,6 +2279,7 @@ out_unlock:
* thaw_super -- unlock filesystem
* @sb: the super to thaw
* @who: context that wants to freeze
+ * @freeze_owner: owner of the freeze
*
* Unlocks the filesystem and marks it writeable again after freeze_super()
* if there are no remaining freezes on the filesystem.
@@ -2209,13 +2293,14 @@ out_unlock:
* have been frozen through the block layer via multiple block devices.
* The filesystem remains frozen until all block devices are unfrozen.
*/
-int thaw_super(struct super_block *sb, enum freeze_holder who)
+int thaw_super(struct super_block *sb, enum freeze_holder who,
+ const void *freeze_owner)
{
if (!super_lock_excl(sb)) {
WARN_ON_ONCE("Dying superblock while thawing!");
return -EINVAL;
}
- return thaw_super_locked(sb, who);
+ return thaw_super_locked(sb, who, freeze_owner);
}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4df2afa551dc..94e12efd92f2 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -123,7 +123,7 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
new_parent = new_parent_kobj && new_parent_kobj->sd ?
new_parent_kobj->sd : sysfs_root_kn;
- return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
+ return kernfs_rename_ns(kn, new_parent, NULL, new_ns);
}
/**
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6931308876c4..1ca143d2f22a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -19,13 +19,19 @@
#include "sysfs.h"
+static struct kobject *sysfs_file_kobj(struct kernfs_node *kn)
+{
+ guard(rcu)();
+ return rcu_dereference(kn->__parent)->priv;
+}
+
/*
* Determine ktype->sysfs_ops for the given kernfs_node. This function
* must be called while holding an active reference.
*/
static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
{
- struct kobject *kobj = kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(kn);
if (kn->flags & KERNFS_LOCKDEP)
lockdep_assert_held(kn);
@@ -40,7 +46,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
{
struct kernfs_open_file *of = sf->private;
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
ssize_t count;
char *buf;
@@ -77,8 +83,8 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
- struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ const struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
loff_t size = file_inode(of->file)->i_size;
if (!count)
@@ -105,7 +111,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
ssize_t len;
/*
@@ -131,7 +137,7 @@ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
- struct kobject *kobj = of->kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
if (!count)
return 0;
@@ -143,8 +149,8 @@ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
size_t count, loff_t pos)
{
- struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ const struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
loff_t size = file_inode(of->file)->i_size;
if (size) {
@@ -167,8 +173,8 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
struct vm_area_struct *vma)
{
- struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ const struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
return battr->mmap(of->file, kobj, battr, vma);
}
@@ -176,8 +182,8 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
int whence)
{
- struct bin_attribute *battr = of->kn->priv;
- struct kobject *kobj = of->kn->parent->priv;
+ const struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = sysfs_file_kobj(of->kn);
if (battr->llseek)
return battr->llseek(of->file, kobj, battr, offset, whence);
@@ -187,7 +193,7 @@ static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset,
static int sysfs_kf_bin_open(struct kernfs_open_file *of)
{
- struct bin_attribute *battr = of->kn->priv;
+ const struct bin_attribute *battr = of->kn->priv;
if (battr->f_mapping)
of->file->f_mapping = battr->f_mapping();
@@ -494,7 +500,7 @@ EXPORT_SYMBOL_GPL(sysfs_break_active_protection);
*/
void sysfs_unbreak_active_protection(struct kernfs_node *kn)
{
- struct kobject *kobj = kn->parent->priv;
+ struct kobject *kobj = sysfs_file_kobj(kn);
kernfs_unbreak_active_protection(kn);
kernfs_put(kn);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 8b01a7eda5fb..2d78e94072a0 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -21,7 +21,7 @@ static void remove_files(struct kernfs_node *parent,
const struct attribute_group *grp)
{
struct attribute *const *attr;
- struct bin_attribute *const *bin_attr;
+ const struct bin_attribute *const *bin_attr;
if (grp->attrs)
for (attr = grp->attrs; *attr; attr++)
@@ -47,7 +47,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
const struct attribute_group *grp, int update)
{
struct attribute *const *attr;
- struct bin_attribute *const *bin_attr;
+ const struct bin_attribute *const *bin_attr;
int error = 0, i;
if (grp->attrs) {
@@ -521,7 +521,7 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
}
if (grp->bin_attrs) {
- struct bin_attribute *const *bin_attr;
+ const struct bin_attribute *const *bin_attr;
for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);
diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig
deleted file mode 100644
index 67b3f90afbfd..000000000000
--- a/fs/sysv/Kconfig
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config SYSV_FS
- tristate "System V/Xenix/V7/Coherent file system support"
- depends on BLOCK
- select BUFFER_HEAD
- help
- SCO, Xenix and Coherent are commercial Unix systems for Intel
- machines, and Version 7 was used on the DEC PDP-11. Saying Y
- here would allow you to read from their floppies and hard disk
- partitions.
-
- If you have floppies or hard disk partitions like that, it is likely
- that they contain binaries from those other Unix systems; in order
- to run these binaries, you will want to install linux-abi which is
- a set of kernel modules that lets you run SCO, Xenix, Wyse,
- UnixWare, Dell Unix and System V programs under Linux. It is
- available via FTP (user: ftp) from
- <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>).
- NOTE: that will work only for binaries from Intel-based systems;
- PDP ones will have to wait until somebody ports Linux to -11 ;-)
-
- If you only intend to mount files from some other Unix over the
- network using NFS, you don't need the System V file system support
- (but you need NFS file system support obviously).
-
- Note that this option is generally not needed for floppies, since a
- good portable way to transport files and directories between unixes
- (and even other operating systems) is given by the tar program ("man
- tar" or preferably "info tar"). Note also that this option has
- nothing whatsoever to do with the option "System V IPC". Read about
- the System V file system in
- <file:Documentation/filesystems/sysv-fs.rst>.
- Saying Y here will enlarge your kernel by about 27 KB.
-
- To compile this as a module, choose M here: the module will be called
- sysv.
-
- If you haven't heard about all of this before, it's safe to say N.
diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile
deleted file mode 100644
index 17d12ba04b18..000000000000
--- a/fs/sysv/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Makefile for the Linux SystemV/Coherent filesystem routines.
-#
-
-obj-$(CONFIG_SYSV_FS) += sysv.o
-
-sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \
- namei.o super.o
diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c
deleted file mode 100644
index 0e69dbdf7277..000000000000
--- a/fs/sysv/balloc.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/balloc.c
- *
- * minix/bitmap.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext/freelists.c
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- *
- * xenix/alloc.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/alloc.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/balloc.c
- * Copyright (C) 1993 Bruno Haible
- *
- * This file contains code for allocating/freeing blocks.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/string.h>
-#include "sysv.h"
-
-/* We don't trust the value of
- sb->sv_sbd2->s_tfree = *sb->sv_free_blocks
- but we nevertheless keep it up to date. */
-
-static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh)
-{
- char *bh_data = bh->b_data;
-
- if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4)
- return (sysv_zone_t*)(bh_data+4);
- else
- return (sysv_zone_t*)(bh_data+2);
-}
-
-/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */
-
-void sysv_free_block(struct super_block * sb, sysv_zone_t nr)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- sysv_zone_t *blocks = sbi->s_bcache;
- unsigned count;
- unsigned block = fs32_to_cpu(sbi, nr);
-
- /*
- * This code does not work at all for AFS (it has a bitmap
- * free list). As AFS is supposed to be read-only no one
- * should call this for an AFS filesystem anyway...
- */
- if (sbi->s_type == FSTYPE_AFS)
- return;
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
- printk("sysv_free_block: trying to free block not in datazone\n");
- return;
- }
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
-
- if (count > sbi->s_flc_size) {
- printk("sysv_free_block: flc_count > flc_size\n");
- mutex_unlock(&sbi->s_lock);
- return;
- }
- /* If the free list head in super-block is full, it is copied
- * into this block being freed, ditto if it's completely empty
- * (applies only on Coherent).
- */
- if (count == sbi->s_flc_size || count == 0) {
- block += sbi->s_block_base;
- bh = sb_getblk(sb, block);
- if (!bh) {
- printk("sysv_free_block: getblk() failed\n");
- mutex_unlock(&sbi->s_lock);
- return;
- }
- memset(bh->b_data, 0, sb->s_blocksize);
- *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count);
- memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t));
- mark_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- brelse(bh);
- count = 0;
- }
- sbi->s_bcache[count++] = nr;
-
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
- fs32_add(sbi, sbi->s_free_blocks, 1);
- dirty_sb(sb);
- mutex_unlock(&sbi->s_lock);
-}
-
-sysv_zone_t sysv_new_block(struct super_block * sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned int block;
- sysv_zone_t nr;
- struct buffer_head * bh;
- unsigned count;
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_bcache_count);
-
- if (count == 0) /* Applies only to Coherent FS */
- goto Enospc;
- nr = sbi->s_bcache[--count];
- if (nr == 0) /* Applies only to Xenix FS, SystemV FS */
- goto Enospc;
-
- block = fs32_to_cpu(sbi, nr);
-
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) {
- printk("sysv_new_block: new block %d is not in data zone\n",
- block);
- goto Enospc;
- }
-
- if (count == 0) { /* the last block continues the free list */
- unsigned count;
-
- block += sbi->s_block_base;
- if (!(bh = sb_bread(sb, block))) {
- printk("sysv_new_block: cannot read free-list block\n");
- /* retry this same block next time */
- *sbi->s_bcache_count = cpu_to_fs16(sbi, 1);
- goto Enospc;
- }
- count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
- if (count > sbi->s_flc_size) {
- printk("sysv_new_block: free-list block with >flc_size entries\n");
- brelse(bh);
- goto Enospc;
- }
- *sbi->s_bcache_count = cpu_to_fs16(sbi, count);
- memcpy(sbi->s_bcache, get_chunk(sb, bh),
- count * sizeof(sysv_zone_t));
- brelse(bh);
- }
- /* Now the free list head in the superblock is valid again. */
- fs32_add(sbi, sbi->s_free_blocks, -1);
- dirty_sb(sb);
- mutex_unlock(&sbi->s_lock);
- return nr;
-
-Enospc:
- mutex_unlock(&sbi->s_lock);
- return 0;
-}
-
-unsigned long sysv_count_free_blocks(struct super_block * sb)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- int sb_count;
- int count;
- struct buffer_head * bh = NULL;
- sysv_zone_t *blocks;
- unsigned block;
- int n;
-
- /*
- * This code does not work at all for AFS (it has a bitmap
- * free list). As AFS is supposed to be read-only we just
- * lie and say it has no free block at all.
- */
- if (sbi->s_type == FSTYPE_AFS)
- return 0;
-
- mutex_lock(&sbi->s_lock);
- sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks);
-
- if (0)
- goto trust_sb;
-
- /* this causes a lot of disk traffic ... */
- count = 0;
- n = fs16_to_cpu(sbi, *sbi->s_bcache_count);
- blocks = sbi->s_bcache;
- while (1) {
- sysv_zone_t zone;
- if (n > sbi->s_flc_size)
- goto E2big;
- zone = 0;
- while (n && (zone = blocks[--n]) != 0)
- count++;
- if (zone == 0)
- break;
-
- block = fs32_to_cpu(sbi, zone);
- if (bh)
- brelse(bh);
-
- if (block < sbi->s_firstdatazone || block >= sbi->s_nzones)
- goto Einval;
- block += sbi->s_block_base;
- bh = sb_bread(sb, block);
- if (!bh)
- goto Eio;
- n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data);
- blocks = get_chunk(sb, bh);
- }
- if (bh)
- brelse(bh);
- if (count != sb_count)
- goto Ecount;
-done:
- mutex_unlock(&sbi->s_lock);
- return count;
-
-Einval:
- printk("sysv_count_free_blocks: new block %d is not in data zone\n",
- block);
- goto trust_sb;
-Eio:
- printk("sysv_count_free_blocks: cannot read free-list block\n");
- goto trust_sb;
-E2big:
- printk("sysv_count_free_blocks: >flc_size entries in free-list block\n");
- if (bh)
- brelse(bh);
-trust_sb:
- count = sb_count;
- goto done;
-Ecount:
- printk("sysv_count_free_blocks: free block count was %d, "
- "correcting to %d\n", sb_count, count);
- if (!sb_rdonly(sb)) {
- *sbi->s_free_blocks = cpu_to_fs32(sbi, count);
- dirty_sb(sb);
- }
- goto done;
-}
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
deleted file mode 100644
index 639307e2ff8c..000000000000
--- a/fs/sysv/dir.c
+++ /dev/null
@@ -1,378 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/dir.c
- *
- * minix/dir.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/dir.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/dir.c
- * Copyright (C) 1993 Bruno Haible
- *
- * SystemV/Coherent directory handling functions
- */
-
-#include <linux/pagemap.h>
-#include <linux/highmem.h>
-#include <linux/swap.h>
-#include "sysv.h"
-
-static int sysv_readdir(struct file *, struct dir_context *);
-
-const struct file_operations sysv_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate_shared = sysv_readdir,
- .fsync = generic_file_fsync,
-};
-
-static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *dir = mapping->host;
-
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
- if (pos+len > dir->i_size) {
- i_size_write(dir, pos+len);
- mark_inode_dirty(dir);
- }
- folio_unlock(folio);
-}
-
-static int sysv_handle_dirsync(struct inode *dir)
-{
- int err;
-
- err = filemap_write_and_wait(dir->i_mapping);
- if (!err)
- err = sync_inode_metadata(dir, 1);
- return err;
-}
-
-/*
- * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
- * rules documented in mm/highmem.rst.
- *
- * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio()
- * and must be treated accordingly for nesting purposes.
- */
-static void *dir_get_folio(struct inode *dir, unsigned long n,
- struct folio **foliop)
-{
- struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL);
-
- if (IS_ERR(folio))
- return ERR_CAST(folio);
- *foliop = folio;
- return kmap_local_folio(folio, 0);
-}
-
-static int sysv_readdir(struct file *file, struct dir_context *ctx)
-{
- unsigned long pos = ctx->pos;
- struct inode *inode = file_inode(file);
- struct super_block *sb = inode->i_sb;
- unsigned long npages = dir_pages(inode);
- unsigned offset;
- unsigned long n;
-
- ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1);
- if (pos >= inode->i_size)
- return 0;
-
- offset = pos & ~PAGE_MASK;
- n = pos >> PAGE_SHIFT;
-
- for ( ; n < npages; n++, offset = 0) {
- char *kaddr, *limit;
- struct sysv_dir_entry *de;
- struct folio *folio;
-
- kaddr = dir_get_folio(inode, n, &folio);
- if (IS_ERR(kaddr))
- continue;
- de = (struct sysv_dir_entry *)(kaddr+offset);
- limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE;
- for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) {
- char *name = de->name;
-
- if (!de->inode)
- continue;
-
- if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN),
- fs16_to_cpu(SYSV_SB(sb), de->inode),
- DT_UNKNOWN)) {
- folio_release_kmap(folio, kaddr);
- return 0;
- }
- }
- folio_release_kmap(folio, kaddr);
- }
- return 0;
-}
-
-/* compare strings: name[0..len-1] (not zero-terminated) and
- * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1])
- */
-static inline int namecompare(int len, int maxlen,
- const char * name, const char * buffer)
-{
- if (len < maxlen && buffer[len])
- return 0;
- return !memcmp(name, buffer, len);
-}
-
-/*
- * sysv_find_entry()
- *
- * finds an entry in the specified directory with the wanted name.
- * It does NOT read the inode of the
- * entry - you'll have to do that yourself if you want to.
- *
- * On Success folio_release_kmap() should be called on *foliop.
- *
- * sysv_find_entry() acts as a call to dir_get_folio() and must be treated
- * accordingly for nesting purposes.
- */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop)
-{
- const char * name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct inode * dir = d_inode(dentry->d_parent);
- unsigned long start, n;
- unsigned long npages = dir_pages(dir);
- struct sysv_dir_entry *de;
-
- start = SYSV_I(dir)->i_dir_start_lookup;
- if (start >= npages)
- start = 0;
- n = start;
-
- do {
- char *kaddr = dir_get_folio(dir, n, foliop);
-
- if (!IS_ERR(kaddr)) {
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += folio_size(*foliop) - SYSV_DIRSIZE;
- for ( ; (char *) de <= kaddr ; de++) {
- if (!de->inode)
- continue;
- if (namecompare(namelen, SYSV_NAMELEN,
- name, de->name))
- goto found;
- }
- folio_release_kmap(*foliop, kaddr);
- }
-
- if (++n >= npages)
- n = 0;
- } while (n != start);
-
- return NULL;
-
-found:
- SYSV_I(dir)->i_dir_start_lookup = n;
- return de;
-}
-
-int sysv_add_link(struct dentry *dentry, struct inode *inode)
-{
- struct inode *dir = d_inode(dentry->d_parent);
- const char * name = dentry->d_name.name;
- int namelen = dentry->d_name.len;
- struct folio *folio = NULL;
- struct sysv_dir_entry * de;
- unsigned long npages = dir_pages(dir);
- unsigned long n;
- char *kaddr;
- loff_t pos;
- int err;
-
- /* We take care of directory expansion in the same loop */
- for (n = 0; n <= npages; n++) {
- kaddr = dir_get_folio(dir, n, &folio);
- if (IS_ERR(kaddr))
- return PTR_ERR(kaddr);
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - SYSV_DIRSIZE;
- while ((char *)de <= kaddr) {
- if (!de->inode)
- goto got_it;
- err = -EEXIST;
- if (namecompare(namelen, SYSV_NAMELEN, name, de->name))
- goto out_folio;
- de++;
- }
- folio_release_kmap(folio, kaddr);
- }
- BUG();
- return -EINVAL;
-
-got_it:
- pos = folio_pos(folio) + offset_in_folio(folio, de);
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err)
- goto out_unlock;
- memcpy (de->name, name, namelen);
- memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- mark_inode_dirty(dir);
- err = sysv_handle_dirsync(dir);
-out_folio:
- folio_release_kmap(folio, kaddr);
- return err;
-out_unlock:
- folio_unlock(folio);
- goto out_folio;
-}
-
-int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio)
-{
- struct inode *inode = folio->mapping->host;
- loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
- int err;
-
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- return err;
- }
- de->inode = 0;
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- mark_inode_dirty(inode);
- return sysv_handle_dirsync(inode);
-}
-
-int sysv_make_empty(struct inode *inode, struct inode *dir)
-{
- struct folio *folio = filemap_grab_folio(inode->i_mapping, 0);
- struct sysv_dir_entry * de;
- char *kaddr;
- int err;
-
- if (IS_ERR(folio))
- return PTR_ERR(folio);
- err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- goto fail;
- }
- kaddr = kmap_local_folio(folio, 0);
- memset(kaddr, 0, folio_size(folio));
-
- de = (struct sysv_dir_entry *)kaddr;
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- strcpy(de->name,".");
- de++;
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
- strcpy(de->name,"..");
-
- kunmap_local(kaddr);
- dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE);
- err = sysv_handle_dirsync(inode);
-fail:
- folio_put(folio);
- return err;
-}
-
-/*
- * routine to check that the specified directory is empty (for rmdir)
- */
-int sysv_empty_dir(struct inode * inode)
-{
- struct super_block *sb = inode->i_sb;
- struct folio *folio = NULL;
- unsigned long i, npages = dir_pages(inode);
- char *kaddr;
-
- for (i = 0; i < npages; i++) {
- struct sysv_dir_entry *de;
-
- kaddr = dir_get_folio(inode, i, &folio);
- if (IS_ERR(kaddr))
- continue;
-
- de = (struct sysv_dir_entry *)kaddr;
- kaddr += folio_size(folio) - SYSV_DIRSIZE;
-
- for ( ;(char *)de <= kaddr; de++) {
- if (!de->inode)
- continue;
- /* check for . and .. */
- if (de->name[0] != '.')
- goto not_empty;
- if (!de->name[1]) {
- if (de->inode == cpu_to_fs16(SYSV_SB(sb),
- inode->i_ino))
- continue;
- goto not_empty;
- }
- if (de->name[1] != '.' || de->name[2])
- goto not_empty;
- }
- folio_release_kmap(folio, kaddr);
- }
- return 1;
-
-not_empty:
- folio_release_kmap(folio, kaddr);
- return 0;
-}
-
-/* Releases the page */
-int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio,
- struct inode *inode)
-{
- struct inode *dir = folio->mapping->host;
- loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
- int err;
-
- folio_lock(folio);
- err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE);
- if (err) {
- folio_unlock(folio);
- return err;
- }
- de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
- dir_commit_chunk(folio, pos, SYSV_DIRSIZE);
- inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
- mark_inode_dirty(dir);
- return sysv_handle_dirsync(inode);
-}
-
-/*
- * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the
- * rules documented in mm/highmem.rst.
- *
- * sysv_dotdot() acts as a call to dir_get_folio() and must be treated
- * accordingly for nesting purposes.
- */
-struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop)
-{
- struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop);
-
- if (IS_ERR(de))
- return NULL;
- /* ".." is the second directory entry */
- return de + 1;
-}
-
-ino_t sysv_inode_by_name(struct dentry *dentry)
-{
- struct folio *folio;
- struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio);
- ino_t res = 0;
-
- if (de) {
- res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode);
- folio_release_kmap(folio, de);
- }
- return res;
-}
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
deleted file mode 100644
index c645f60bdb7f..000000000000
--- a/fs/sysv/file.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/file.c
- *
- * minix/file.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/file.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/file.c
- * Copyright (C) 1993 Bruno Haible
- *
- * SystemV/Coherent regular file handling primitives
- */
-
-#include "sysv.h"
-
-/*
- * We have mostly NULLs here: the current defaults are OK for
- * the coh filesystem.
- */
-const struct file_operations sysv_file_operations = {
- .llseek = generic_file_llseek,
- .read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
- .fsync = generic_file_fsync,
- .splice_read = filemap_splice_read,
-};
-
-static int sysv_setattr(struct mnt_idmap *idmap,
- struct dentry *dentry, struct iattr *attr)
-{
- struct inode *inode = d_inode(dentry);
- int error;
-
- error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
- if (error)
- return error;
-
- if ((attr->ia_valid & ATTR_SIZE) &&
- attr->ia_size != i_size_read(inode)) {
- error = inode_newsize_ok(inode, attr->ia_size);
- if (error)
- return error;
- truncate_setsize(inode, attr->ia_size);
- sysv_truncate(inode);
- }
-
- setattr_copy(&nop_mnt_idmap, inode, attr);
- mark_inode_dirty(inode);
- return 0;
-}
-
-const struct inode_operations sysv_file_inode_operations = {
- .setattr = sysv_setattr,
- .getattr = sysv_getattr,
-};
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
deleted file mode 100644
index 269df6d49815..000000000000
--- a/fs/sysv/ialloc.c
+++ /dev/null
@@ -1,235 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/ialloc.c
- *
- * minix/bitmap.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * ext/freelists.c
- * Copyright (C) 1992 Remy Card (card@masi.ibp.fr)
- *
- * xenix/alloc.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/alloc.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/ialloc.c
- * Copyright (C) 1993 Bruno Haible
- *
- * This file contains code for allocating/freeing inodes.
- */
-
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/sched.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include "sysv.h"
-
-/* We don't trust the value of
- sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes
- but we nevertheless keep it up to date. */
-
-/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. */
-
-/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */
-static inline sysv_ino_t *
-sv_sb_fic_inode(struct super_block * sb, unsigned int i)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- if (sbi->s_bh1 == sbi->s_bh2)
- return &sbi->s_sb_fic_inodes[i];
- else {
- /* 512 byte Xenix FS */
- unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]);
- if (offset < 512)
- return (sysv_ino_t*)(sbi->s_sbd1 + offset);
- else
- return (sysv_ino_t*)(sbi->s_sbd2 + offset);
- }
-}
-
-struct sysv_inode *
-sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct sysv_inode *res;
- int block = sbi->s_firstinodezone + sbi->s_block_base;
-
- block += (ino-1) >> sbi->s_inodes_per_block_bits;
- *bh = sb_bread(sb, block);
- if (!*bh)
- return NULL;
- res = (struct sysv_inode *)(*bh)->b_data;
- return res + ((ino-1) & sbi->s_inodes_per_block_1);
-}
-
-static int refill_free_cache(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- int i = 0, ino;
-
- ino = SYSV_ROOT_INO+1;
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto out;
- while (ino <= sbi->s_ninodes) {
- if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) {
- *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino);
- if (i == sbi->s_fic_size)
- break;
- }
- if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
- brelse(bh);
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto out;
- } else
- raw_inode++;
- }
- brelse(bh);
-out:
- return i;
-}
-
-void sysv_free_inode(struct inode * inode)
-{
- struct super_block *sb = inode->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- unsigned int ino;
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- unsigned count;
-
- sb = inode->i_sb;
- ino = inode->i_ino;
- if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) {
- printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n");
- return;
- }
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("sysv_free_inode: unable to read inode block on device "
- "%s\n", inode->i_sb->s_id);
- return;
- }
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
- if (count < sbi->s_fic_size) {
- *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino);
- *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
- }
- fs16_add(sbi, sbi->s_sb_total_free_inodes, 1);
- dirty_sb(sb);
- memset(raw_inode, 0, sizeof(struct sysv_inode));
- mark_buffer_dirty(bh);
- mutex_unlock(&sbi->s_lock);
- brelse(bh);
-}
-
-struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
-{
- struct super_block *sb = dir->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct inode *inode;
- sysv_ino_t ino;
- unsigned count;
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE
- };
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- mutex_lock(&sbi->s_lock);
- count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count);
- if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) {
- count = refill_free_cache(sb);
- if (count == 0) {
- iput(inode);
- mutex_unlock(&sbi->s_lock);
- return ERR_PTR(-ENOSPC);
- }
- }
- /* Now count > 0. */
- ino = *sv_sb_fic_inode(sb,--count);
- *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
- fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
- dirty_sb(sb);
- inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
- inode->i_ino = fs16_to_cpu(sbi, ino);
- simple_inode_init_ts(inode);
- inode->i_blocks = 0;
- memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data));
- SYSV_I(inode)->i_dir_start_lookup = 0;
- insert_inode_hash(inode);
- mark_inode_dirty(inode);
-
- sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */
- mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
- /* That's it. */
- mutex_unlock(&sbi->s_lock);
- return inode;
-}
-
-unsigned long sysv_count_free_inodes(struct super_block * sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- int ino, count, sb_count;
-
- mutex_lock(&sbi->s_lock);
-
- sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes);
-
- if (0)
- goto trust_sb;
-
- /* this causes a lot of disk traffic ... */
- count = 0;
- ino = SYSV_ROOT_INO+1;
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto Eio;
- while (ino <= sbi->s_ninodes) {
- if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0)
- count++;
- if ((ino++ & sbi->s_inodes_per_block_1) == 0) {
- brelse(bh);
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode)
- goto Eio;
- } else
- raw_inode++;
- }
- brelse(bh);
- if (count != sb_count)
- goto Einval;
-out:
- mutex_unlock(&sbi->s_lock);
- return count;
-
-Einval:
- printk("sysv_count_free_inodes: "
- "free inode count was %d, correcting to %d\n",
- sb_count, count);
- if (!sb_rdonly(sb)) {
- *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count);
- dirty_sb(sb);
- }
- goto out;
-
-Eio:
- printk("sysv_count_free_inodes: unable to read inode table\n");
-trust_sb:
- count = sb_count;
- goto out;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
deleted file mode 100644
index 76bc2d5e75a9..000000000000
--- a/fs/sysv/inode.c
+++ /dev/null
@@ -1,354 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/inode.c
- *
- * minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * xenix/inode.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/inode.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/inode.c
- * Copyright (C) 1993 Paul B. Monday
- *
- * sysv/inode.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- *
- * This file contains code for allocating/freeing inodes and for read/writing
- * the superblock.
- */
-
-#include <linux/highuid.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/writeback.h>
-#include <linux/namei.h>
-#include <asm/byteorder.h>
-#include "sysv.h"
-
-static int sysv_sync_fs(struct super_block *sb, int wait)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- u32 time = (u32)ktime_get_real_seconds(), old_time;
-
- mutex_lock(&sbi->s_lock);
-
- /*
- * If we are going to write out the super block,
- * then attach current time stamp.
- * But if the filesystem was marked clean, keep it clean.
- */
- old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
- if (sbi->s_type == FSTYPE_SYSV4) {
- if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time))
- *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time);
- *sbi->s_sb_time = cpu_to_fs32(sbi, time);
- mark_buffer_dirty(sbi->s_bh2);
- }
-
- mutex_unlock(&sbi->s_lock);
-
- return 0;
-}
-
-static int sysv_remount(struct super_block *sb, int *flags, char *data)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- sync_filesystem(sb);
- if (sbi->s_forced_ro)
- *flags |= SB_RDONLY;
- return 0;
-}
-
-static void sysv_put_super(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- if (!sb_rdonly(sb)) {
- /* XXX ext2 also updates the state here */
- mark_buffer_dirty(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- mark_buffer_dirty(sbi->s_bh2);
- }
-
- brelse(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- brelse(sbi->s_bh2);
-
- kfree(sbi);
-}
-
-static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
- struct super_block *sb = dentry->d_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
-
- buf->f_type = sb->s_magic;
- buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = sbi->s_ndatazones;
- buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb);
- buf->f_files = sbi->s_ninodes;
- buf->f_ffree = sysv_count_free_inodes(sb);
- buf->f_namelen = SYSV_NAMELEN;
- buf->f_fsid = u64_to_fsid(id);
- return 0;
-}
-
-/*
- * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32
- */
-static inline void read3byte(struct sysv_sb_info *sbi,
- unsigned char * from, unsigned char * to)
-{
- if (sbi->s_bytesex == BYTESEX_PDP) {
- to[0] = from[0];
- to[1] = 0;
- to[2] = from[1];
- to[3] = from[2];
- } else if (sbi->s_bytesex == BYTESEX_LE) {
- to[0] = from[0];
- to[1] = from[1];
- to[2] = from[2];
- to[3] = 0;
- } else {
- to[0] = 0;
- to[1] = from[0];
- to[2] = from[1];
- to[3] = from[2];
- }
-}
-
-static inline void write3byte(struct sysv_sb_info *sbi,
- unsigned char * from, unsigned char * to)
-{
- if (sbi->s_bytesex == BYTESEX_PDP) {
- to[0] = from[0];
- to[1] = from[2];
- to[2] = from[3];
- } else if (sbi->s_bytesex == BYTESEX_LE) {
- to[0] = from[0];
- to[1] = from[1];
- to[2] = from[2];
- } else {
- to[0] = from[1];
- to[1] = from[2];
- to[2] = from[3];
- }
-}
-
-static const struct inode_operations sysv_symlink_inode_operations = {
- .get_link = page_get_link,
- .getattr = sysv_getattr,
-};
-
-void sysv_set_inode(struct inode *inode, dev_t rdev)
-{
- if (S_ISREG(inode->i_mode)) {
- inode->i_op = &sysv_file_inode_operations;
- inode->i_fop = &sysv_file_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else if (S_ISDIR(inode->i_mode)) {
- inode->i_op = &sysv_dir_inode_operations;
- inode->i_fop = &sysv_dir_operations;
- inode->i_mapping->a_ops = &sysv_aops;
- } else if (S_ISLNK(inode->i_mode)) {
- inode->i_op = &sysv_symlink_inode_operations;
- inode_nohighmem(inode);
- inode->i_mapping->a_ops = &sysv_aops;
- } else
- init_special_inode(inode, inode->i_mode, rdev);
-}
-
-struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
-{
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- struct sysv_inode_info * si;
- struct inode *inode;
- unsigned int block;
-
- if (!ino || ino > sbi->s_ninodes) {
- printk("Bad inode number on dev %s: %d is out of range\n",
- sb->s_id, ino);
- return ERR_PTR(-EIO);
- }
-
- inode = iget_locked(sb, ino);
- if (!inode)
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
- return inode;
-
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("Major problem: unable to read inode from dev %s\n",
- inode->i_sb->s_id);
- goto bad_inode;
- }
- /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */
- inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode);
- i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid));
- i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid));
- set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink));
- inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size);
- inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0);
- inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0);
- inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0);
- inode->i_blocks = 0;
-
- si = SYSV_I(inode);
- for (block = 0; block < 10+1+1+1; block++)
- read3byte(sbi, &raw_inode->i_data[3*block],
- (u8 *)&si->i_data[block]);
- brelse(bh);
- si->i_dir_start_lookup = 0;
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- sysv_set_inode(inode,
- old_decode_dev(fs32_to_cpu(sbi, si->i_data[0])));
- else
- sysv_set_inode(inode, 0);
- unlock_new_inode(inode);
- return inode;
-
-bad_inode:
- iget_failed(inode);
- return ERR_PTR(-EIO);
-}
-
-static int __sysv_write_inode(struct inode *inode, int wait)
-{
- struct super_block * sb = inode->i_sb;
- struct sysv_sb_info * sbi = SYSV_SB(sb);
- struct buffer_head * bh;
- struct sysv_inode * raw_inode;
- struct sysv_inode_info * si;
- unsigned int ino, block;
- int err = 0;
-
- ino = inode->i_ino;
- if (!ino || ino > sbi->s_ninodes) {
- printk("Bad inode number on dev %s: %d is out of range\n",
- inode->i_sb->s_id, ino);
- return -EIO;
- }
- raw_inode = sysv_raw_inode(sb, ino, &bh);
- if (!raw_inode) {
- printk("unable to read i-node block\n");
- return -EIO;
- }
-
- raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
- raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode)));
- raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode)));
- raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink);
- raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size);
- raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode));
- raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode));
- raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode));
-
- si = SYSV_I(inode);
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
- si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev));
- for (block = 0; block < 10+1+1+1; block++)
- write3byte(sbi, (u8 *)&si->i_data[block],
- &raw_inode->i_data[3*block]);
- mark_buffer_dirty(bh);
- if (wait) {
- sync_dirty_buffer(bh);
- if (buffer_req(bh) && !buffer_uptodate(bh)) {
- printk ("IO error syncing sysv inode [%s:%08x]\n",
- sb->s_id, ino);
- err = -EIO;
- }
- }
- brelse(bh);
- return err;
-}
-
-int sysv_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
-}
-
-int sysv_sync_inode(struct inode *inode)
-{
- return __sysv_write_inode(inode, 1);
-}
-
-static void sysv_evict_inode(struct inode *inode)
-{
- truncate_inode_pages_final(&inode->i_data);
- if (!inode->i_nlink) {
- inode->i_size = 0;
- sysv_truncate(inode);
- }
- invalidate_inode_buffers(inode);
- clear_inode(inode);
- if (!inode->i_nlink)
- sysv_free_inode(inode);
-}
-
-static struct kmem_cache *sysv_inode_cachep;
-
-static struct inode *sysv_alloc_inode(struct super_block *sb)
-{
- struct sysv_inode_info *si;
-
- si = alloc_inode_sb(sb, sysv_inode_cachep, GFP_KERNEL);
- if (!si)
- return NULL;
- return &si->vfs_inode;
-}
-
-static void sysv_free_in_core_inode(struct inode *inode)
-{
- kmem_cache_free(sysv_inode_cachep, SYSV_I(inode));
-}
-
-static void init_once(void *p)
-{
- struct sysv_inode_info *si = (struct sysv_inode_info *)p;
-
- inode_init_once(&si->vfs_inode);
-}
-
-const struct super_operations sysv_sops = {
- .alloc_inode = sysv_alloc_inode,
- .free_inode = sysv_free_in_core_inode,
- .write_inode = sysv_write_inode,
- .evict_inode = sysv_evict_inode,
- .put_super = sysv_put_super,
- .sync_fs = sysv_sync_fs,
- .remount_fs = sysv_remount,
- .statfs = sysv_statfs,
-};
-
-int __init sysv_init_icache(void)
-{
- sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
- sizeof(struct sysv_inode_info), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
- init_once);
- if (!sysv_inode_cachep)
- return -ENOMEM;
- return 0;
-}
-
-void sysv_destroy_icache(void)
-{
- /*
- * Make sure all delayed rcu free inodes are flushed before we
- * destroy cache.
- */
- rcu_barrier();
- kmem_cache_destroy(sysv_inode_cachep);
-}
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
deleted file mode 100644
index 451e95f474fa..000000000000
--- a/fs/sysv/itree.c
+++ /dev/null
@@ -1,511 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/itree.c
- *
- * Handling of indirect blocks' trees.
- * AV, Sep--Dec 2000
- */
-
-#include <linux/buffer_head.h>
-#include <linux/mount.h>
-#include <linux/mpage.h>
-#include <linux/string.h>
-#include "sysv.h"
-
-enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */
-
-static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode)
-{
- mark_buffer_dirty_inode(bh, inode);
- if (IS_SYNC(inode))
- sync_dirty_buffer(bh);
-}
-
-static int block_to_path(struct inode *inode, long block, int offsets[DEPTH])
-{
- struct super_block *sb = inode->i_sb;
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- int ptrs_bits = sbi->s_ind_per_block_bits;
- unsigned long indirect_blocks = sbi->s_ind_per_block,
- double_blocks = sbi->s_ind_per_block_2;
- int n = 0;
-
- if (block < 0) {
- printk("sysv_block_map: block < 0\n");
- } else if (block < DIRECT) {
- offsets[n++] = block;
- } else if ( (block -= DIRECT) < indirect_blocks) {
- offsets[n++] = DIRECT;
- offsets[n++] = block;
- } else if ((block -= indirect_blocks) < double_blocks) {
- offsets[n++] = DIRECT+1;
- offsets[n++] = block >> ptrs_bits;
- offsets[n++] = block & (indirect_blocks - 1);
- } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) {
- offsets[n++] = DIRECT+2;
- offsets[n++] = block >> (ptrs_bits * 2);
- offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1);
- offsets[n++] = block & (indirect_blocks - 1);
- } else {
- /* nothing */;
- }
- return n;
-}
-
-static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr)
-{
- return sbi->s_block_base + fs32_to_cpu(sbi, nr);
-}
-
-typedef struct {
- sysv_zone_t *p;
- sysv_zone_t key;
- struct buffer_head *bh;
-} Indirect;
-
-static DEFINE_RWLOCK(pointers_lock);
-
-static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v)
-{
- p->key = *(p->p = v);
- p->bh = bh;
-}
-
-static inline int verify_chain(Indirect *from, Indirect *to)
-{
- while (from <= to && from->key == *from->p)
- from++;
- return (from > to);
-}
-
-static inline sysv_zone_t *block_end(struct buffer_head *bh)
-{
- return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
-}
-
-static Indirect *get_branch(struct inode *inode,
- int depth,
- int offsets[],
- Indirect chain[],
- int *err)
-{
- struct super_block *sb = inode->i_sb;
- Indirect *p = chain;
- struct buffer_head *bh;
-
- *err = 0;
- add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets);
- if (!p->key)
- goto no_block;
- while (--depth) {
- int block = block_to_cpu(SYSV_SB(sb), p->key);
- bh = sb_bread(sb, block);
- if (!bh)
- goto failure;
- read_lock(&pointers_lock);
- if (!verify_chain(chain, p))
- goto changed;
- add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
- read_unlock(&pointers_lock);
- if (!p->key)
- goto no_block;
- }
- return NULL;
-
-changed:
- read_unlock(&pointers_lock);
- brelse(bh);
- *err = -EAGAIN;
- goto no_block;
-failure:
- *err = -EIO;
-no_block:
- return p;
-}
-
-static int alloc_branch(struct inode *inode,
- int num,
- int *offsets,
- Indirect *branch)
-{
- int blocksize = inode->i_sb->s_blocksize;
- int n = 0;
- int i;
-
- branch[0].key = sysv_new_block(inode->i_sb);
- if (branch[0].key) for (n = 1; n < num; n++) {
- struct buffer_head *bh;
- int parent;
- /* Allocate the next block */
- branch[n].key = sysv_new_block(inode->i_sb);
- if (!branch[n].key)
- break;
- /*
- * Get buffer_head for parent block, zero it out and set
- * the pointer to new one, then send parent to disk.
- */
- parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key);
- bh = sb_getblk(inode->i_sb, parent);
- if (!bh) {
- sysv_free_block(inode->i_sb, branch[n].key);
- break;
- }
- lock_buffer(bh);
- memset(bh->b_data, 0, blocksize);
- branch[n].bh = bh;
- branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n];
- *branch[n].p = branch[n].key;
- set_buffer_uptodate(bh);
- unlock_buffer(bh);
- dirty_indirect(bh, inode);
- }
- if (n == num)
- return 0;
-
- /* Allocation failed, free what we already allocated */
- for (i = 1; i < n; i++)
- bforget(branch[i].bh);
- for (i = 0; i < n; i++)
- sysv_free_block(inode->i_sb, branch[i].key);
- return -ENOSPC;
-}
-
-static inline int splice_branch(struct inode *inode,
- Indirect chain[],
- Indirect *where,
- int num)
-{
- int i;
-
- /* Verify that place we are splicing to is still there and vacant */
- write_lock(&pointers_lock);
- if (!verify_chain(chain, where-1) || *where->p)
- goto changed;
- *where->p = where->key;
- write_unlock(&pointers_lock);
-
- inode_set_ctime_current(inode);
-
- /* had we spliced it onto indirect block? */
- if (where->bh)
- dirty_indirect(where->bh, inode);
-
- if (IS_SYNC(inode))
- sysv_sync_inode(inode);
- else
- mark_inode_dirty(inode);
- return 0;
-
-changed:
- write_unlock(&pointers_lock);
- for (i = 1; i < num; i++)
- bforget(where[i].bh);
- for (i = 0; i < num; i++)
- sysv_free_block(inode->i_sb, where[i].key);
- return -EAGAIN;
-}
-
-static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
-{
- int err = -EIO;
- int offsets[DEPTH];
- Indirect chain[DEPTH];
- struct super_block *sb = inode->i_sb;
- Indirect *partial;
- int left;
- int depth = block_to_path(inode, iblock, offsets);
-
- if (depth == 0)
- goto out;
-
-reread:
- partial = get_branch(inode, depth, offsets, chain, &err);
-
- /* Simplest case - block found, no allocation needed */
- if (!partial) {
-got_it:
- map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb),
- chain[depth-1].key));
- /* Clean up and exit */
- partial = chain+depth-1; /* the whole chain */
- goto cleanup;
- }
-
- /* Next simple case - plain lookup or failed read of indirect block */
- if (!create || err == -EIO) {
-cleanup:
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
-out:
- return err;
- }
-
- /*
- * Indirect block might be removed by truncate while we were
- * reading it. Handling of that case (forget what we've got and
- * reread) is taken out of the main path.
- */
- if (err == -EAGAIN)
- goto changed;
-
- left = (chain + depth) - partial;
- err = alloc_branch(inode, left, offsets+(partial-chain), partial);
- if (err)
- goto cleanup;
-
- if (splice_branch(inode, chain, partial, left) < 0)
- goto changed;
-
- set_buffer_new(bh_result);
- goto got_it;
-
-changed:
- while (partial > chain) {
- brelse(partial->bh);
- partial--;
- }
- goto reread;
-}
-
-static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q)
-{
- while (p < q)
- if (*p++)
- return 0;
- return 1;
-}
-
-static Indirect *find_shared(struct inode *inode,
- int depth,
- int offsets[],
- Indirect chain[],
- sysv_zone_t *top)
-{
- Indirect *partial, *p;
- int k, err;
-
- *top = 0;
- for (k = depth; k > 1 && !offsets[k-1]; k--)
- ;
- partial = get_branch(inode, k, offsets, chain, &err);
-
- write_lock(&pointers_lock);
- if (!partial)
- partial = chain + k-1;
- /*
- * If the branch acquired continuation since we've looked at it -
- * fine, it should all survive and (new) top doesn't belong to us.
- */
- if (!partial->key && *partial->p) {
- write_unlock(&pointers_lock);
- goto no_top;
- }
- for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--)
- ;
- /*
- * OK, we've found the last block that must survive. The rest of our
- * branch should be detached before unlocking. However, if that rest
- * of branch is all ours and does not grow immediately from the inode
- * it's easier to cheat and just decrement partial->p.
- */
- if (p == chain + k - 1 && p > chain) {
- p->p--;
- } else {
- *top = *p->p;
- *p->p = 0;
- }
- write_unlock(&pointers_lock);
-
- while (partial > p) {
- brelse(partial->bh);
- partial--;
- }
-no_top:
- return partial;
-}
-
-static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q)
-{
- for ( ; p < q ; p++) {
- sysv_zone_t nr = *p;
- if (nr) {
- *p = 0;
- sysv_free_block(inode->i_sb, nr);
- mark_inode_dirty(inode);
- }
- }
-}
-
-static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth)
-{
- struct buffer_head * bh;
- struct super_block *sb = inode->i_sb;
-
- if (depth--) {
- for ( ; p < q ; p++) {
- int block;
- sysv_zone_t nr = *p;
- if (!nr)
- continue;
- *p = 0;
- block = block_to_cpu(SYSV_SB(sb), nr);
- bh = sb_bread(sb, block);
- if (!bh)
- continue;
- free_branches(inode, (sysv_zone_t*)bh->b_data,
- block_end(bh), depth);
- bforget(bh);
- sysv_free_block(sb, nr);
- mark_inode_dirty(inode);
- }
- } else
- free_data(inode, p, q);
-}
-
-void sysv_truncate (struct inode * inode)
-{
- sysv_zone_t *i_data = SYSV_I(inode)->i_data;
- int offsets[DEPTH];
- Indirect chain[DEPTH];
- Indirect *partial;
- sysv_zone_t nr = 0;
- int n;
- long iblock;
- unsigned blocksize;
-
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
-
- blocksize = inode->i_sb->s_blocksize;
- iblock = (inode->i_size + blocksize-1)
- >> inode->i_sb->s_blocksize_bits;
-
- block_truncate_page(inode->i_mapping, inode->i_size, get_block);
-
- n = block_to_path(inode, iblock, offsets);
- if (n == 0)
- return;
-
- if (n == 1) {
- free_data(inode, i_data+offsets[0], i_data + DIRECT);
- goto do_indirects;
- }
-
- partial = find_shared(inode, n, offsets, chain, &nr);
- /* Kill the top of shared branch (already detached) */
- if (nr) {
- if (partial == chain)
- mark_inode_dirty(inode);
- else
- dirty_indirect(partial->bh, inode);
- free_branches(inode, &nr, &nr+1, (chain+n-1) - partial);
- }
- /* Clear the ends of indirect blocks on the shared branch */
- while (partial > chain) {
- free_branches(inode, partial->p + 1, block_end(partial->bh),
- (chain+n-1) - partial);
- dirty_indirect(partial->bh, inode);
- brelse (partial->bh);
- partial--;
- }
-do_indirects:
- /* Kill the remaining (whole) subtrees (== subtrees deeper than...) */
- while (n < DEPTH) {
- nr = i_data[DIRECT + n - 1];
- if (nr) {
- i_data[DIRECT + n - 1] = 0;
- mark_inode_dirty(inode);
- free_branches(inode, &nr, &nr+1, n);
- }
- n++;
- }
- inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- if (IS_SYNC(inode))
- sysv_sync_inode (inode);
- else
- mark_inode_dirty(inode);
-}
-
-static unsigned sysv_nblocks(struct super_block *s, loff_t size)
-{
- struct sysv_sb_info *sbi = SYSV_SB(s);
- int ptrs_bits = sbi->s_ind_per_block_bits;
- unsigned blocks, res, direct = DIRECT, i = DEPTH;
- blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits;
- res = blocks;
- while (--i && blocks > direct) {
- blocks = ((blocks - direct - 1) >> ptrs_bits) + 1;
- res += blocks;
- direct = 1;
- }
- return res;
-}
-
-int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
- struct kstat *stat, u32 request_mask, unsigned int flags)
-{
- struct super_block *s = path->dentry->d_sb;
- generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry),
- stat);
- stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size);
- stat->blksize = s->s_blocksize;
- return 0;
-}
-
-static int sysv_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return mpage_writepages(mapping, wbc, get_block);
-}
-
-static int sysv_read_folio(struct file *file, struct folio *folio)
-{
- return block_read_full_folio(folio, get_block);
-}
-
-int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len)
-{
- return __block_write_begin(folio, pos, len, get_block);
-}
-
-static void sysv_write_failed(struct address_space *mapping, loff_t to)
-{
- struct inode *inode = mapping->host;
-
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- sysv_truncate(inode);
- }
-}
-
-static int sysv_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
-{
- int ret;
-
- ret = block_write_begin(mapping, pos, len, foliop, get_block);
- if (unlikely(ret))
- sysv_write_failed(mapping, pos + len);
-
- return ret;
-}
-
-static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
-{
- return generic_block_bmap(mapping,block,get_block);
-}
-
-const struct address_space_operations sysv_aops = {
- .dirty_folio = block_dirty_folio,
- .invalidate_folio = block_invalidate_folio,
- .read_folio = sysv_read_folio,
- .writepages = sysv_writepages,
- .write_begin = sysv_write_begin,
- .write_end = generic_write_end,
- .migrate_folio = buffer_migrate_folio,
- .bmap = sysv_bmap
-};
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
deleted file mode 100644
index fb8bd8437872..000000000000
--- a/fs/sysv/namei.c
+++ /dev/null
@@ -1,280 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * linux/fs/sysv/namei.c
- *
- * minix/namei.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * coh/namei.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/namei.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- */
-
-#include <linux/pagemap.h>
-#include "sysv.h"
-
-static int add_nondir(struct dentry *dentry, struct inode *inode)
-{
- int err = sysv_add_link(dentry, inode);
- if (!err) {
- d_instantiate(dentry, inode);
- return 0;
- }
- inode_dec_link_count(inode);
- iput(inode);
- return err;
-}
-
-static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
-{
- struct inode * inode = NULL;
- ino_t ino;
-
- if (dentry->d_name.len > SYSV_NAMELEN)
- return ERR_PTR(-ENAMETOOLONG);
- ino = sysv_inode_by_name(dentry);
- if (ino)
- inode = sysv_iget(dir->i_sb, ino);
- return d_splice_alias(inode, dentry);
-}
-
-static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t rdev)
-{
- struct inode * inode;
- int err;
-
- if (!old_valid_dev(rdev))
- return -EINVAL;
-
- inode = sysv_new_inode(dir, mode);
- err = PTR_ERR(inode);
-
- if (!IS_ERR(inode)) {
- sysv_set_inode(inode, rdev);
- mark_inode_dirty(inode);
- err = add_nondir(dentry, inode);
- }
- return err;
-}
-
-static int sysv_create(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode, bool excl)
-{
- return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0);
-}
-
-static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, const char *symname)
-{
- int err = -ENAMETOOLONG;
- int l = strlen(symname)+1;
- struct inode * inode;
-
- if (l > dir->i_sb->s_blocksize)
- goto out;
-
- inode = sysv_new_inode(dir, S_IFLNK|0777);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out;
-
- sysv_set_inode(inode, 0);
- err = page_symlink(inode, symname, l);
- if (err)
- goto out_fail;
-
- mark_inode_dirty(inode);
- err = add_nondir(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- iput(inode);
- goto out;
-}
-
-static int sysv_link(struct dentry * old_dentry, struct inode * dir,
- struct dentry * dentry)
-{
- struct inode *inode = d_inode(old_dentry);
-
- inode_set_ctime_current(inode);
- inode_inc_link_count(inode);
- ihold(inode);
-
- return add_nondir(dentry, inode);
-}
-
-static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
-{
- struct inode * inode;
- int err;
-
- inode_inc_link_count(dir);
-
- inode = sysv_new_inode(dir, S_IFDIR|mode);
- err = PTR_ERR(inode);
- if (IS_ERR(inode))
- goto out_dir;
-
- sysv_set_inode(inode, 0);
-
- inode_inc_link_count(inode);
-
- err = sysv_make_empty(inode, dir);
- if (err)
- goto out_fail;
-
- err = sysv_add_link(dentry, inode);
- if (err)
- goto out_fail;
-
- d_instantiate(dentry, inode);
-out:
- return err;
-
-out_fail:
- inode_dec_link_count(inode);
- inode_dec_link_count(inode);
- iput(inode);
-out_dir:
- inode_dec_link_count(dir);
- goto out;
-}
-
-static int sysv_unlink(struct inode * dir, struct dentry * dentry)
-{
- struct inode * inode = d_inode(dentry);
- struct folio *folio;
- struct sysv_dir_entry * de;
- int err;
-
- de = sysv_find_entry(dentry, &folio);
- if (!de)
- return -ENOENT;
-
- err = sysv_delete_entry(de, folio);
- if (!err) {
- inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
- }
- folio_release_kmap(folio, de);
- return err;
-}
-
-static int sysv_rmdir(struct inode * dir, struct dentry * dentry)
-{
- struct inode *inode = d_inode(dentry);
- int err = -ENOTEMPTY;
-
- if (sysv_empty_dir(inode)) {
- err = sysv_unlink(dir, dentry);
- if (!err) {
- inode->i_size = 0;
- inode_dec_link_count(inode);
- inode_dec_link_count(dir);
- }
- }
- return err;
-}
-
-/*
- * Anybody can rename anything with this: the permission checks are left to the
- * higher-level routines.
- */
-static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir,
- struct dentry *old_dentry, struct inode *new_dir,
- struct dentry *new_dentry, unsigned int flags)
-{
- struct inode * old_inode = d_inode(old_dentry);
- struct inode * new_inode = d_inode(new_dentry);
- struct folio *dir_folio;
- struct sysv_dir_entry * dir_de = NULL;
- struct folio *old_folio;
- struct sysv_dir_entry * old_de;
- int err = -ENOENT;
-
- if (flags & ~RENAME_NOREPLACE)
- return -EINVAL;
-
- old_de = sysv_find_entry(old_dentry, &old_folio);
- if (!old_de)
- goto out;
-
- if (S_ISDIR(old_inode->i_mode)) {
- err = -EIO;
- dir_de = sysv_dotdot(old_inode, &dir_folio);
- if (!dir_de)
- goto out_old;
- }
-
- if (new_inode) {
- struct folio *new_folio;
- struct sysv_dir_entry * new_de;
-
- err = -ENOTEMPTY;
- if (dir_de && !sysv_empty_dir(new_inode))
- goto out_dir;
-
- err = -ENOENT;
- new_de = sysv_find_entry(new_dentry, &new_folio);
- if (!new_de)
- goto out_dir;
- err = sysv_set_link(new_de, new_folio, old_inode);
- folio_release_kmap(new_folio, new_de);
- if (err)
- goto out_dir;
- inode_set_ctime_current(new_inode);
- if (dir_de)
- drop_nlink(new_inode);
- inode_dec_link_count(new_inode);
- } else {
- err = sysv_add_link(new_dentry, old_inode);
- if (err)
- goto out_dir;
- if (dir_de)
- inode_inc_link_count(new_dir);
- }
-
- err = sysv_delete_entry(old_de, old_folio);
- if (err)
- goto out_dir;
-
- mark_inode_dirty(old_inode);
-
- if (dir_de) {
- err = sysv_set_link(dir_de, dir_folio, new_dir);
- if (!err)
- inode_dec_link_count(old_dir);
- }
-
-out_dir:
- if (dir_de)
- folio_release_kmap(dir_folio, dir_de);
-out_old:
- folio_release_kmap(old_folio, old_de);
-out:
- return err;
-}
-
-/*
- * directories can handle most operations...
- */
-const struct inode_operations sysv_dir_inode_operations = {
- .create = sysv_create,
- .lookup = sysv_lookup,
- .link = sysv_link,
- .unlink = sysv_unlink,
- .symlink = sysv_symlink,
- .mkdir = sysv_mkdir,
- .rmdir = sysv_rmdir,
- .mknod = sysv_mknod,
- .rename = sysv_rename,
- .getattr = sysv_getattr,
-};
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
deleted file mode 100644
index 5c0d07ddbda2..000000000000
--- a/fs/sysv/super.c
+++ /dev/null
@@ -1,595 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/fs/sysv/inode.c
- *
- * minix/inode.c
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * xenix/inode.c
- * Copyright (C) 1992 Doug Evans
- *
- * coh/inode.c
- * Copyright (C) 1993 Pascal Haible, Bruno Haible
- *
- * sysv/inode.c
- * Copyright (C) 1993 Paul B. Monday
- *
- * sysv/inode.c
- * Copyright (C) 1993 Bruno Haible
- * Copyright (C) 1997, 1998 Krzysztof G. Baranowski
- *
- * This file contains code for read/parsing the superblock.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/buffer_head.h>
-#include "sysv.h"
-
-/*
- * The following functions try to recognize specific filesystems.
- *
- * We recognize:
- * - Xenix FS by its magic number.
- * - SystemV FS by its magic number.
- * - Coherent FS by its funny fname/fpack field.
- * - SCO AFS by s_nfree == 0xffff
- * - V7 FS has no distinguishing features.
- *
- * We discriminate among SystemV4 and SystemV2 FS by the assumption that
- * the time stamp is not < 01-01-1980.
- */
-
-enum {
- JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60
-};
-
-static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
- struct xenix_super_block * sbd1;
- struct xenix_super_block * sbd2;
-
- if (bh1 != bh2)
- sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data;
- else {
- /* block size = 512, so bh1 != bh2 */
- sbd1 = (struct xenix_super_block *) bh1->b_data;
- sbd2 = (struct xenix_super_block *) (bh2->b_data - 512);
- }
-
- *max_links = XENIX_LINK_MAX;
- sbi->s_fic_size = XENIX_NICINOD;
- sbi->s_flc_size = XENIX_NICFREE;
- sbi->s_sbd1 = (char *)sbd1;
- sbi->s_sbd2 = (char *)sbd2;
- sbi->s_sb_fic_count = &sbd1->s_ninode;
- sbi->s_sb_fic_inodes = &sbd1->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd2->s_tinode;
- sbi->s_bcache_count = &sbd1->s_nfree;
- sbi->s_bcache = &sbd1->s_free[0];
- sbi->s_free_blocks = &sbd2->s_tfree;
- sbi->s_sb_time = &sbd2->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize);
-}
-
-static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct sysv4_super_block * sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
-
- if (bh1 == bh2)
- sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2);
- else
- sbd = (struct sysv4_super_block *) bh2->b_data;
-
- *max_links = SYSV_LINK_MAX;
- sbi->s_fic_size = SYSV_NICINOD;
- sbi->s_flc_size = SYSV_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_sb_state = &sbd->s_state;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct sysv2_super_block *sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
- struct buffer_head *bh2 = sbi->s_bh2;
-
- if (bh1 == bh2)
- sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2);
- else
- sbd = (struct sysv2_super_block *) bh2->b_data;
-
- *max_links = SYSV_LINK_MAX;
- sbi->s_fic_size = SYSV_NICINOD;
- sbi->s_flc_size = SYSV_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_sb_state = &sbd->s_state;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct coh_super_block * sbd;
- struct buffer_head *bh1 = sbi->s_bh1;
-
- sbd = (struct coh_super_block *) bh1->b_data;
-
- *max_links = COH_LINK_MAX;
- sbi->s_fic_size = COH_NICINOD;
- sbi->s_flc_size = COH_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links)
-{
- struct buffer_head *bh2 = sbi->s_bh2;
- struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data;
-
- *max_links = V7_LINK_MAX;
- sbi->s_fic_size = V7_NICINOD;
- sbi->s_flc_size = V7_NICFREE;
- sbi->s_sbd1 = (char *)sbd;
- sbi->s_sbd2 = (char *)sbd;
- sbi->s_sb_fic_count = &sbd->s_ninode;
- sbi->s_sb_fic_inodes = &sbd->s_inode[0];
- sbi->s_sb_total_free_inodes = &sbd->s_tinode;
- sbi->s_bcache_count = &sbd->s_nfree;
- sbi->s_bcache = &sbd->s_free[0];
- sbi->s_free_blocks = &sbd->s_tfree;
- sbi->s_sb_time = &sbd->s_time;
- sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize);
- sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize);
-}
-
-static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data;
- if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544))
- sbi->s_bytesex = BYTESEX_LE;
- else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544))
- sbi->s_bytesex = BYTESEX_BE;
- else
- return 0;
- switch (fs32_to_cpu(sbi, sbd->s_type)) {
- case 1:
- sbi->s_type = FSTYPE_XENIX;
- return 1;
- case 2:
- sbi->s_type = FSTYPE_XENIX;
- return 2;
- default:
- return 0;
- }
-}
-
-static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct super_block *sb = sbi->s_sb;
- /* All relevant fields are at the same offsets in R2 and R4 */
- struct sysv4_super_block * sbd;
- u32 type;
-
- sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2);
- if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20))
- sbi->s_bytesex = BYTESEX_LE;
- else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20))
- sbi->s_bytesex = BYTESEX_BE;
- else
- return 0;
-
- type = fs32_to_cpu(sbi, sbd->s_type);
-
- if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) {
- sbi->s_type = FSTYPE_AFS;
- sbi->s_forced_ro = 1;
- if (!sb_rdonly(sb)) {
- printk("SysV FS: SCO EAFS on %s detected, "
- "forcing read-only mode.\n",
- sb->s_id);
- }
- return type;
- }
-
- if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) {
- /* this is likely to happen on SystemV2 FS */
- if (type > 3 || type < 1)
- return 0;
- sbi->s_type = FSTYPE_SYSV2;
- return type;
- }
- if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10))
- return 0;
-
- /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10,
- 0x20 or 0x30 indicates that symbolic links and the 14-character
- filename limit is gone. Due to lack of information about this
- feature read-only mode seems to be a reasonable approach... -KGB */
-
- if (type >= 0x10) {
- printk("SysV FS: can't handle long file names on %s, "
- "forcing read-only mode.\n", sb->s_id);
- sbi->s_forced_ro = 1;
- }
-
- sbi->s_type = FSTYPE_SYSV4;
- return type >= 0x10 ? type >> 4 : type;
-}
-
-static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- struct coh_super_block * sbd;
-
- sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2);
- if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6))
- || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6)))
- return 0;
- sbi->s_bytesex = BYTESEX_PDP;
- sbi->s_type = FSTYPE_COH;
- return 1;
-}
-
-static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh)
-{
- int size = detect_sysv(sbi, bh);
-
- return size>2 ? 0 : size;
-}
-
-static struct {
- int block;
- int (*test)(struct sysv_sb_info *, struct buffer_head *);
-} flavours[] = {
- {1, detect_xenix},
- {0, detect_sysv},
- {0, detect_coherent},
- {9, detect_sysv_odd},
- {15,detect_sysv_odd},
- {18,detect_sysv},
-};
-
-static char *flavour_names[] = {
- [FSTYPE_XENIX] = "Xenix",
- [FSTYPE_SYSV4] = "SystemV",
- [FSTYPE_SYSV2] = "SystemV Release 2",
- [FSTYPE_COH] = "Coherent",
- [FSTYPE_V7] = "V7",
- [FSTYPE_AFS] = "AFS",
-};
-
-static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = {
- [FSTYPE_XENIX] = detected_xenix,
- [FSTYPE_SYSV4] = detected_sysv4,
- [FSTYPE_SYSV2] = detected_sysv2,
- [FSTYPE_COH] = detected_coherent,
- [FSTYPE_V7] = detected_v7,
- [FSTYPE_AFS] = detected_sysv4,
-};
-
-static int complete_read_super(struct super_block *sb, int silent, int size)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
- struct inode *root_inode;
- char *found = flavour_names[sbi->s_type];
- u_char n_bits = size+8;
- int bsize = 1 << n_bits;
- int bsize_4 = bsize >> 2;
-
- sbi->s_firstinodezone = 2;
-
- flavour_setup[sbi->s_type](sbi, &sb->s_max_links);
- if (sbi->s_firstdatazone < sbi->s_firstinodezone)
- return 0;
-
- sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone;
- sbi->s_inodes_per_block = bsize >> 6;
- sbi->s_inodes_per_block_1 = (bsize >> 6)-1;
- sbi->s_inodes_per_block_bits = n_bits-6;
- sbi->s_ind_per_block = bsize_4;
- sbi->s_ind_per_block_2 = bsize_4*bsize_4;
- sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4));
- sbi->s_ind_per_block_bits = n_bits-2;
-
- sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone)
- << sbi->s_inodes_per_block_bits;
-
- if (!silent)
- printk("VFS: Found a %s FS (block size = %ld) on device %s\n",
- found, sb->s_blocksize, sb->s_id);
-
- sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type;
- /* set up enough so that it can read an inode */
- sb->s_op = &sysv_sops;
- if (sbi->s_forced_ro)
- sb->s_flags |= SB_RDONLY;
- root_inode = sysv_iget(sb, SYSV_ROOT_INO);
- if (IS_ERR(root_inode)) {
- printk("SysV FS: get root inode failed\n");
- return 0;
- }
- sb->s_root = d_make_root(root_inode);
- if (!sb->s_root) {
- printk("SysV FS: get root dentry failed\n");
- return 0;
- }
- return 1;
-}
-
-static int sysv_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct buffer_head *bh1, *bh = NULL;
- struct sysv_sb_info *sbi;
- unsigned long blocknr;
- int size = 0, i;
-
- BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block));
- BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block));
- BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block));
- BUILD_BUG_ON(500 != sizeof (struct coh_super_block));
- BUILD_BUG_ON(64 != sizeof (struct sysv_inode));
-
- sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_sb = sb;
- sbi->s_block_base = 0;
- mutex_init(&sbi->s_lock);
- sb->s_fs_info = sbi;
- sb->s_time_min = 0;
- sb->s_time_max = U32_MAX;
- sb_set_blocksize(sb, BLOCK_SIZE);
-
- for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) {
- brelse(bh);
- bh = sb_bread(sb, flavours[i].block);
- if (!bh)
- continue;
- size = flavours[i].test(SYSV_SB(sb), bh);
- }
-
- if (!size)
- goto Eunknown;
-
- switch (size) {
- case 1:
- blocknr = bh->b_blocknr << 1;
- brelse(bh);
- sb_set_blocksize(sb, 512);
- bh1 = sb_bread(sb, blocknr);
- bh = sb_bread(sb, blocknr + 1);
- break;
- case 2:
- bh1 = bh;
- break;
- case 3:
- blocknr = bh->b_blocknr >> 1;
- brelse(bh);
- sb_set_blocksize(sb, 2048);
- bh1 = bh = sb_bread(sb, blocknr);
- break;
- default:
- goto Ebadsize;
- }
-
- if (bh && bh1) {
- sbi->s_bh1 = bh1;
- sbi->s_bh2 = bh;
- if (complete_read_super(sb, silent, size))
- return 0;
- }
-
- brelse(bh1);
- brelse(bh);
- sb_set_blocksize(sb, BLOCK_SIZE);
- printk("oldfs: cannot read superblock\n");
-failed:
- kfree(sbi);
- return -EINVAL;
-
-Eunknown:
- brelse(bh);
- if (!silent)
- printk("VFS: unable to find oldfs superblock on device %s\n",
- sb->s_id);
- goto failed;
-Ebadsize:
- brelse(bh);
- if (!silent)
- printk("VFS: oldfs: unsupported block size (%dKb)\n",
- 1<<(size-2));
- goto failed;
-}
-
-static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
-{
- struct v7_super_block *v7sb;
- struct sysv_inode *v7i;
- struct buffer_head *bh2;
- struct sysv_sb_info *sbi;
-
- sbi = sb->s_fs_info;
-
- /* plausibility check on superblock */
- v7sb = (struct v7_super_block *) bh->b_data;
- if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
- fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
- fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
- return 0;
-
- /* plausibility check on root inode: it is a directory,
- with a nonzero size that is a multiple of 16 */
- bh2 = sb_bread(sb, 2);
- if (bh2 == NULL)
- return 0;
-
- v7i = (struct sysv_inode *)(bh2->b_data + 64);
- if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
- (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
- (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
- (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
- sizeof(struct sysv_dir_entry))) {
- brelse(bh2);
- return 0;
- }
-
- brelse(bh2);
- return 1;
-}
-
-static int v7_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct sysv_sb_info *sbi;
- struct buffer_head *bh;
-
- BUILD_BUG_ON(sizeof(struct v7_super_block) != 440);
- BUILD_BUG_ON(sizeof(struct sysv_inode) != 64);
-
- sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
-
- sbi->s_sb = sb;
- sbi->s_block_base = 0;
- sbi->s_type = FSTYPE_V7;
- mutex_init(&sbi->s_lock);
- sb->s_fs_info = sbi;
- sb->s_time_min = 0;
- sb->s_time_max = U32_MAX;
-
- sb_set_blocksize(sb, 512);
-
- if ((bh = sb_bread(sb, 1)) == NULL) {
- if (!silent)
- printk("VFS: unable to read V7 FS superblock on "
- "device %s.\n", sb->s_id);
- goto failed;
- }
-
- /* Try PDP-11 UNIX */
- sbi->s_bytesex = BYTESEX_PDP;
- if (v7_sanity_check(sb, bh))
- goto detected;
-
- /* Try PC/IX, v7/x86 */
- sbi->s_bytesex = BYTESEX_LE;
- if (v7_sanity_check(sb, bh))
- goto detected;
-
- goto failed;
-
-detected:
- sbi->s_bh1 = bh;
- sbi->s_bh2 = bh;
- if (complete_read_super(sb, silent, 1))
- return 0;
-
-failed:
- printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
- sb->s_id);
- brelse(bh);
- kfree(sbi);
- return -EINVAL;
-}
-
-/* Every kernel module contains stuff like this. */
-
-static struct dentry *sysv_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super);
-}
-
-static struct dentry *v7_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
-{
- return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super);
-}
-
-static struct file_system_type sysv_fs_type = {
- .owner = THIS_MODULE,
- .name = "sysv",
- .mount = sysv_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("sysv");
-
-static struct file_system_type v7_fs_type = {
- .owner = THIS_MODULE,
- .name = "v7",
- .mount = v7_mount,
- .kill_sb = kill_block_super,
- .fs_flags = FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("v7");
-MODULE_ALIAS("v7");
-
-static int __init init_sysv_fs(void)
-{
- int error;
-
- error = sysv_init_icache();
- if (error)
- goto out;
- error = register_filesystem(&sysv_fs_type);
- if (error)
- goto destroy_icache;
- error = register_filesystem(&v7_fs_type);
- if (error)
- goto unregister;
- return 0;
-
-unregister:
- unregister_filesystem(&sysv_fs_type);
-destroy_icache:
- sysv_destroy_icache();
-out:
- return error;
-}
-
-static void __exit exit_sysv_fs(void)
-{
- unregister_filesystem(&sysv_fs_type);
- unregister_filesystem(&v7_fs_type);
- sysv_destroy_icache();
-}
-
-module_init(init_sysv_fs)
-module_exit(exit_sysv_fs)
-MODULE_DESCRIPTION("SystemV Filesystem");
-MODULE_LICENSE("GPL");
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
deleted file mode 100644
index 0a48b2e7edb1..000000000000
--- a/fs/sysv/sysv.h
+++ /dev/null
@@ -1,245 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _SYSV_H
-#define _SYSV_H
-
-#include <linux/buffer_head.h>
-
-typedef __u16 __bitwise __fs16;
-typedef __u32 __bitwise __fs32;
-
-#include <linux/sysv_fs.h>
-
-/*
- * SystemV/V7/Coherent super-block data in memory
- *
- * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified
- * while the system is running). This is in contrast to the Minix and Berkeley
- * filesystems (where the superblock is never modified). This affects the
- * sync() operation: we must keep the superblock in a disk buffer and use this
- * one as our "working copy".
- */
-
-struct sysv_sb_info {
- struct super_block *s_sb; /* VFS superblock */
- int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */
- char s_bytesex; /* bytesex (le/be/pdp) */
- unsigned int s_inodes_per_block; /* number of inodes per block */
- unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */
- unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */
- unsigned int s_ind_per_block; /* number of indirections per block */
- unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */
- unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */
- unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */
- unsigned int s_block_base; /* physical block number of block 0 */
- unsigned short s_fic_size; /* free inode cache size, NICINOD */
- unsigned short s_flc_size; /* free block list chunk size, NICFREE */
- /* The superblock is kept in one or two disk buffers: */
- struct buffer_head *s_bh1;
- struct buffer_head *s_bh2;
- /* These are pointers into the disk buffer, to compensate for
- different superblock layout. */
- char * s_sbd1; /* entire superblock data, for part 1 */
- char * s_sbd2; /* entire superblock data, for part 2 */
- __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */
- sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */
- __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */
- __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */
- sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */
- __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */
- __fs32 *s_sb_time; /* pointer to s_sbd->s_time */
- __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */
- /* We keep those superblock entities that don't change here;
- this saves us an indirection and perhaps a conversion. */
- u32 s_firstinodezone; /* index of first inode zone */
- u32 s_firstdatazone; /* same as s_sbd->s_isize */
- u32 s_ninodes; /* total number of inodes */
- u32 s_ndatazones; /* total number of data zones */
- u32 s_nzones; /* same as s_sbd->s_fsize */
- u16 s_namelen; /* max length of dir entry */
- int s_forced_ro;
- struct mutex s_lock;
-};
-
-/*
- * SystemV/V7/Coherent FS inode data in memory
- */
-struct sysv_inode_info {
- __fs32 i_data[13];
- u32 i_dir_start_lookup;
- struct inode vfs_inode;
-};
-
-
-static inline struct sysv_inode_info *SYSV_I(struct inode *inode)
-{
- return container_of(inode, struct sysv_inode_info, vfs_inode);
-}
-
-static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb)
-{
- return sb->s_fs_info;
-}
-
-
-/* identify the FS in memory */
-enum {
- FSTYPE_NONE = 0,
- FSTYPE_XENIX,
- FSTYPE_SYSV4,
- FSTYPE_SYSV2,
- FSTYPE_COH,
- FSTYPE_V7,
- FSTYPE_AFS,
- FSTYPE_END,
-};
-
-#define SYSV_MAGIC_BASE 0x012FF7B3
-
-#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX)
-#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4)
-#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2)
-#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH)
-
-
-/* Admissible values for i_nlink: 0.._LINK_MAX */
-enum {
- XENIX_LINK_MAX = 126, /* ?? */
- SYSV_LINK_MAX = 126, /* 127? 251? */
- V7_LINK_MAX = 126, /* ?? */
- COH_LINK_MAX = 10000,
-};
-
-
-static inline void dirty_sb(struct super_block *sb)
-{
- struct sysv_sb_info *sbi = SYSV_SB(sb);
-
- mark_buffer_dirty(sbi->s_bh1);
- if (sbi->s_bh1 != sbi->s_bh2)
- mark_buffer_dirty(sbi->s_bh2);
-}
-
-
-/* ialloc.c */
-extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned,
- struct buffer_head **);
-extern struct inode * sysv_new_inode(const struct inode *, umode_t);
-extern void sysv_free_inode(struct inode *);
-extern unsigned long sysv_count_free_inodes(struct super_block *);
-
-/* balloc.c */
-extern sysv_zone_t sysv_new_block(struct super_block *);
-extern void sysv_free_block(struct super_block *, sysv_zone_t);
-extern unsigned long sysv_count_free_blocks(struct super_block *);
-
-/* itree.c */
-void sysv_truncate(struct inode *);
-int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len);
-
-/* inode.c */
-extern struct inode *sysv_iget(struct super_block *, unsigned int);
-extern int sysv_write_inode(struct inode *, struct writeback_control *wbc);
-extern int sysv_sync_inode(struct inode *);
-extern void sysv_set_inode(struct inode *, dev_t);
-extern int sysv_getattr(struct mnt_idmap *, const struct path *,
- struct kstat *, u32, unsigned int);
-extern int sysv_init_icache(void);
-extern void sysv_destroy_icache(void);
-
-
-/* dir.c */
-struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **);
-int sysv_add_link(struct dentry *, struct inode *);
-int sysv_delete_entry(struct sysv_dir_entry *, struct folio *);
-int sysv_make_empty(struct inode *, struct inode *);
-int sysv_empty_dir(struct inode *);
-int sysv_set_link(struct sysv_dir_entry *, struct folio *,
- struct inode *);
-struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **);
-ino_t sysv_inode_by_name(struct dentry *);
-
-
-extern const struct inode_operations sysv_file_inode_operations;
-extern const struct inode_operations sysv_dir_inode_operations;
-extern const struct file_operations sysv_file_operations;
-extern const struct file_operations sysv_dir_operations;
-extern const struct address_space_operations sysv_aops;
-extern const struct super_operations sysv_sops;
-
-
-enum {
- BYTESEX_LE,
- BYTESEX_PDP,
- BYTESEX_BE,
-};
-
-static inline u32 PDP_swab(u32 x)
-{
-#ifdef __LITTLE_ENDIAN
- return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16);
-#else
-#ifdef __BIG_ENDIAN
- return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8);
-#else
-#error BYTESEX
-#endif
-#endif
-}
-
-static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- return PDP_swab((__force __u32)n);
- else if (sbi->s_bytesex == BYTESEX_LE)
- return le32_to_cpu((__force __le32)n);
- else
- return be32_to_cpu((__force __be32)n);
-}
-
-static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- return (__force __fs32)PDP_swab(n);
- else if (sbi->s_bytesex == BYTESEX_LE)
- return (__force __fs32)cpu_to_le32(n);
- else
- return (__force __fs32)cpu_to_be32(n);
-}
-
-static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d)
-{
- if (sbi->s_bytesex == BYTESEX_PDP)
- *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d);
- else if (sbi->s_bytesex == BYTESEX_LE)
- le32_add_cpu((__le32 *)n, d);
- else
- be32_add_cpu((__be32 *)n, d);
- return *n;
-}
-
-static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- return le16_to_cpu((__force __le16)n);
- else
- return be16_to_cpu((__force __be16)n);
-}
-
-static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- return (__force __fs16)cpu_to_le16(n);
- else
- return (__force __fs16)cpu_to_be16(n);
-}
-
-static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d)
-{
- if (sbi->s_bytesex != BYTESEX_BE)
- le16_add_cpu((__le16 *)n, d);
- else
- be16_add_cpu((__be16 *)n, d);
- return *n;
-}
-
-#endif /* _SYSV_H */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 9f7eb451a60f..c68f28d9c426 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
ALARM_REALTIME : ALARM_BOOTTIME,
timerfd_alarmproc);
} else {
- hrtimer_init(&ctx->t.tmr, clockid, htmode);
+ hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode);
hrtimer_set_expires(&ctx->t.tmr, texp);
- ctx->t.tmr.function = timerfd_tmrproc;
}
if (texp != 0) {
@@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
ALARM_REALTIME : ALARM_BOOTTIME,
timerfd_alarmproc);
else
- hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
+ hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS);
ctx->moffs = ktime_mono_to_real(0);
@@ -439,15 +438,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
return ufd;
}
- file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx,
- O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
+ file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx,
+ O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
fd_install(ufd, file);
return ufd;
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 53214499e384..0c023941a316 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -109,9 +109,9 @@ static char *get_dname(struct dentry *dentry)
return name;
}
-static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
- struct inode *inode, struct dentry *dentry,
- umode_t mode)
+static struct dentry *tracefs_syscall_mkdir(struct mnt_idmap *idmap,
+ struct inode *inode, struct dentry *dentry,
+ umode_t mode)
{
struct tracefs_inode *ti;
char *name;
@@ -119,7 +119,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
name = get_dname(dentry);
if (!name)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/*
* This is a new directory that does not take the default of
@@ -141,7 +141,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
kfree(name);
- return ret;
+ return ERR_PTR(ret);
}
static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
@@ -465,9 +465,20 @@ static int tracefs_d_revalidate(struct inode *inode, const struct qstr *name,
return !(ei && ei->is_freed);
}
+static int tracefs_d_delete(const struct dentry *dentry)
+{
+ /*
+ * We want to keep eventfs dentries around but not tracefs
+ * ones. eventfs dentries have content in d_fsdata.
+ * Use d_fsdata to determine if it's a eventfs dentry or not.
+ */
+ return dentry->d_fsdata == NULL;
+}
+
static const struct dentry_operations tracefs_dentry_operations = {
.d_revalidate = tracefs_d_revalidate,
.d_release = tracefs_d_release,
+ .d_delete = tracefs_d_delete,
};
static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -480,7 +491,7 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
return err;
sb->s_op = &tracefs_super_operations;
- sb->s_d_op = &tracefs_dentry_operations;
+ set_default_d_op(sb, &tracefs_dentry_operations);
return 0;
}
@@ -551,20 +562,9 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent)
if (!parent)
parent = tracefs_mount->mnt_root;
- inode_lock(d_inode(parent));
- if (unlikely(IS_DEADDIR(d_inode(parent))))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry) && d_inode(dentry)) {
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
- }
-
- if (IS_ERR(dentry)) {
- inode_unlock(d_inode(parent));
+ dentry = simple_start_creating(parent, name);
+ if (IS_ERR(dentry))
simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- }
return dentry;
}
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index 0b48cbab8a3d..059a02691edd 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -15,9 +15,15 @@
* decompression.
*/
-#include <linux/crypto.h>
+#include <crypto/acompress.h>
+#include <linux/highmem.h>
#include "ubifs.h"
+union ubifs_in_ptr {
+ const void *buf;
+ struct folio *folio;
+};
+
/* Fake description object for the "none" compressor */
static struct ubifs_compressor none_compr = {
.compr_type = UBIFS_COMPR_NONE,
@@ -26,11 +32,8 @@ static struct ubifs_compressor none_compr = {
};
#ifdef CONFIG_UBIFS_FS_LZO
-static DEFINE_MUTEX(lzo_mutex);
-
static struct ubifs_compressor lzo_compr = {
.compr_type = UBIFS_COMPR_LZO,
- .comp_mutex = &lzo_mutex,
.name = "lzo",
.capi_name = "lzo",
};
@@ -42,13 +45,8 @@ static struct ubifs_compressor lzo_compr = {
#endif
#ifdef CONFIG_UBIFS_FS_ZLIB
-static DEFINE_MUTEX(deflate_mutex);
-static DEFINE_MUTEX(inflate_mutex);
-
static struct ubifs_compressor zlib_compr = {
.compr_type = UBIFS_COMPR_ZLIB,
- .comp_mutex = &deflate_mutex,
- .decomp_mutex = &inflate_mutex,
.name = "zlib",
.capi_name = "deflate",
};
@@ -60,13 +58,8 @@ static struct ubifs_compressor zlib_compr = {
#endif
#ifdef CONFIG_UBIFS_FS_ZSTD
-static DEFINE_MUTEX(zstd_enc_mutex);
-static DEFINE_MUTEX(zstd_dec_mutex);
-
static struct ubifs_compressor zstd_compr = {
.compr_type = UBIFS_COMPR_ZSTD,
- .comp_mutex = &zstd_enc_mutex,
- .decomp_mutex = &zstd_dec_mutex,
.name = "zstd",
.capi_name = "zstd",
};
@@ -80,6 +73,63 @@ static struct ubifs_compressor zstd_compr = {
/* All UBIFS compressors */
struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
+static void ubifs_compress_common(int *compr_type, union ubifs_in_ptr in_ptr,
+ size_t in_offset, int in_len, bool in_folio,
+ void *out_buf, int *out_len)
+{
+ struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
+ int dlen = *out_len;
+ int err;
+
+ if (*compr_type == UBIFS_COMPR_NONE)
+ goto no_compr;
+
+ /* If the input data is small, do not even try to compress it */
+ if (in_len < UBIFS_MIN_COMPR_LEN)
+ goto no_compr;
+
+ dlen = min(dlen, in_len - UBIFS_MIN_COMPRESS_DIFF);
+
+ do {
+ ACOMP_REQUEST_ON_STACK(req, compr->cc);
+ DECLARE_CRYPTO_WAIT(wait);
+
+ acomp_request_set_callback(req, 0, NULL, NULL);
+ if (in_folio)
+ acomp_request_set_src_folio(req, in_ptr.folio,
+ in_offset, in_len);
+ else
+ acomp_request_set_src_dma(req, in_ptr.buf, in_len);
+ acomp_request_set_dst_dma(req, out_buf, dlen);
+ err = crypto_acomp_compress(req);
+ dlen = req->dlen;
+ if (err != -EAGAIN)
+ break;
+
+ req = ACOMP_REQUEST_CLONE(req, GFP_NOFS | __GFP_NOWARN);
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+ err = crypto_acomp_compress(req);
+ err = crypto_wait_req(err, &wait);
+ dlen = req->dlen;
+ acomp_request_free(req);
+ } while (0);
+
+ *out_len = dlen;
+ if (err)
+ goto no_compr;
+
+ return;
+
+no_compr:
+ if (in_folio)
+ memcpy_from_folio(out_buf, in_ptr.folio, in_offset, in_len);
+ else
+ memcpy(out_buf, in_ptr.buf, in_len);
+ *out_len = in_len;
+ *compr_type = UBIFS_COMPR_NONE;
+}
+
/**
* ubifs_compress - compress data.
* @c: UBIFS file-system description object
@@ -102,61 +152,51 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
void ubifs_compress(const struct ubifs_info *c, const void *in_buf,
int in_len, void *out_buf, int *out_len, int *compr_type)
{
- int err;
- struct ubifs_compressor *compr = ubifs_compressors[*compr_type];
-
- if (*compr_type == UBIFS_COMPR_NONE)
- goto no_compr;
-
- /* If the input data is small, do not even try to compress it */
- if (in_len < UBIFS_MIN_COMPR_LEN)
- goto no_compr;
-
- if (compr->comp_mutex)
- mutex_lock(compr->comp_mutex);
- err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
- (unsigned int *)out_len);
- if (compr->comp_mutex)
- mutex_unlock(compr->comp_mutex);
- if (unlikely(err)) {
- ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed",
- in_len, compr->name, err);
- goto no_compr;
- }
+ union ubifs_in_ptr in_ptr = { .buf = in_buf };
- /*
- * If the data compressed only slightly, it is better to leave it
- * uncompressed to improve read speed.
- */
- if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
- goto no_compr;
-
- return;
-
-no_compr:
- memcpy(out_buf, in_buf, in_len);
- *out_len = in_len;
- *compr_type = UBIFS_COMPR_NONE;
+ ubifs_compress_common(compr_type, in_ptr, 0, in_len, false,
+ out_buf, out_len);
}
/**
- * ubifs_decompress - decompress data.
+ * ubifs_compress_folio - compress folio.
* @c: UBIFS file-system description object
- * @in_buf: data to decompress
- * @in_len: length of the data to decompress
- * @out_buf: output buffer where decompressed data should
- * @out_len: output length is returned here
- * @compr_type: type of compression
+ * @in_folio: data to compress
+ * @in_offset: offset into @in_folio
+ * @in_len: length of the data to compress
+ * @out_buf: output buffer where compressed data should be stored
+ * @out_len: output buffer length is returned here
+ * @compr_type: type of compression to use on enter, actually used compression
+ * type on exit
*
- * This function decompresses data from buffer @in_buf into buffer @out_buf.
- * The length of the uncompressed data is returned in @out_len. This functions
- * returns %0 on success or a negative error code on failure.
+ * This function compresses input folio @in_folio of length @in_len and
+ * stores the result in the output buffer @out_buf and the resulting length
+ * in @out_len. If the input buffer does not compress, it is just copied
+ * to the @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE
+ * or if compression error occurred.
+ *
+ * Note, if the input buffer was not compressed, it is copied to the output
+ * buffer and %UBIFS_COMPR_NONE is returned in @compr_type.
*/
-int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
- int in_len, void *out_buf, int *out_len, int compr_type)
+void ubifs_compress_folio(const struct ubifs_info *c, struct folio *in_folio,
+ size_t in_offset, int in_len, void *out_buf,
+ int *out_len, int *compr_type)
+{
+ union ubifs_in_ptr in_ptr = { .folio = in_folio };
+
+ ubifs_compress_common(compr_type, in_ptr, in_offset, in_len, true,
+ out_buf, out_len);
+}
+
+static int ubifs_decompress_common(const struct ubifs_info *c,
+ const void *in_buf, int in_len,
+ void *out_ptr, size_t out_offset,
+ int *out_len, bool out_folio,
+ int compr_type)
{
- int err;
struct ubifs_compressor *compr;
+ int dlen = *out_len;
+ int err;
if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) {
ubifs_err(c, "invalid compression type %d", compr_type);
@@ -171,17 +211,39 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
}
if (compr_type == UBIFS_COMPR_NONE) {
- memcpy(out_buf, in_buf, in_len);
+ if (out_folio)
+ memcpy_to_folio(out_ptr, out_offset, in_buf, in_len);
+ else
+ memcpy(out_ptr, in_buf, in_len);
*out_len = in_len;
return 0;
}
- if (compr->decomp_mutex)
- mutex_lock(compr->decomp_mutex);
- err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
- (unsigned int *)out_len);
- if (compr->decomp_mutex)
- mutex_unlock(compr->decomp_mutex);
+ do {
+ ACOMP_REQUEST_ON_STACK(req, compr->cc);
+ DECLARE_CRYPTO_WAIT(wait);
+
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &wait);
+ acomp_request_set_src_dma(req, in_buf, in_len);
+ if (out_folio)
+ acomp_request_set_dst_folio(req, out_ptr, out_offset,
+ dlen);
+ else
+ acomp_request_set_dst_dma(req, out_ptr, dlen);
+ err = crypto_acomp_decompress(req);
+ dlen = req->dlen;
+ if (err != -EAGAIN)
+ break;
+
+ req = ACOMP_REQUEST_CLONE(req, GFP_NOFS | __GFP_NOWARN);
+ err = crypto_acomp_decompress(req);
+ err = crypto_wait_req(err, &wait);
+ dlen = req->dlen;
+ acomp_request_free(req);
+ } while (0);
+
+ *out_len = dlen;
if (err)
ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d",
in_len, compr->name, err);
@@ -190,6 +252,49 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
}
/**
+ * ubifs_decompress - decompress data.
+ * @c: UBIFS file-system description object
+ * @in_buf: data to decompress
+ * @in_len: length of the data to decompress
+ * @out_buf: output buffer where decompressed data should
+ * @out_len: output length is returned here
+ * @compr_type: type of compression
+ *
+ * This function decompresses data from buffer @in_buf into buffer @out_buf.
+ * The length of the uncompressed data is returned in @out_len. This functions
+ * returns %0 on success or a negative error code on failure.
+ */
+int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
+ int in_len, void *out_buf, int *out_len, int compr_type)
+{
+ return ubifs_decompress_common(c, in_buf, in_len, out_buf, 0, out_len,
+ false, compr_type);
+}
+
+/**
+ * ubifs_decompress_folio - decompress folio.
+ * @c: UBIFS file-system description object
+ * @in_buf: data to decompress
+ * @in_len: length of the data to decompress
+ * @out_folio: output folio where decompressed data should
+ * @out_offset: offset into @out_folio
+ * @out_len: output length is returned here
+ * @compr_type: type of compression
+ *
+ * This function decompresses data from buffer @in_buf into folio
+ * @out_folio. The length of the uncompressed data is returned in
+ * @out_len. This functions returns %0 on success or a negative error
+ * code on failure.
+ */
+int ubifs_decompress_folio(const struct ubifs_info *c, const void *in_buf,
+ int in_len, struct folio *out_folio,
+ size_t out_offset, int *out_len, int compr_type)
+{
+ return ubifs_decompress_common(c, in_buf, in_len, out_folio,
+ out_offset, out_len, true, compr_type);
+}
+
+/**
* compr_init - initialize a compressor.
* @compr: compressor description object
*
@@ -199,7 +304,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf,
static int __init compr_init(struct ubifs_compressor *compr)
{
if (compr->capi_name) {
- compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0);
+ compr->cc = crypto_alloc_acomp(compr->capi_name, 0, 0);
if (IS_ERR(compr->cc)) {
pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld",
current->pid, compr->name, PTR_ERR(compr->cc));
@@ -218,7 +323,7 @@ static int __init compr_init(struct ubifs_compressor *compr)
static void compr_exit(struct ubifs_compressor *compr)
{
if (compr->capi_name)
- crypto_free_comp(compr->cc);
+ crypto_free_acomp(compr->cc);
}
/**
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 921f9033d0d2..fb5ac358077b 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -51,7 +51,7 @@ int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
memset(p + in_len, 0, pad_len - in_len);
err = fscrypt_encrypt_block_inplace(inode, virt_to_page(p), pad_len,
- offset_in_page(p), block, GFP_NOFS);
+ offset_in_page(p), block);
if (err) {
ubifs_err(c, "fscrypt_encrypt_block_inplace() failed: %d", err);
return err;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index fda82f3e16e8..3c3d3ad4fa6c 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1002,8 +1002,8 @@ out_fname:
return err;
}
-static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -1023,7 +1023,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
err = ubifs_budget_space(c, &req);
if (err)
- return err;
+ return ERR_PTR(err);
err = ubifs_prepare_create(dir, dentry, &nm);
if (err)
@@ -1060,7 +1060,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
ubifs_release_budget(c, &req);
d_instantiate(dentry, inode);
fscrypt_free_filename(&nm);
- return 0;
+ return NULL;
out_cancel:
dir->i_size -= sz_change;
@@ -1074,7 +1074,7 @@ out_fname:
fscrypt_free_filename(&nm);
out_budg:
ubifs_release_budget(c, &req);
- return err;
+ return ERR_PTR(err);
}
static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5130123005e4..e75a6cec67be 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -42,8 +42,8 @@
#include <linux/slab.h>
#include <linux/migrate.h>
-static int read_block(struct inode *inode, void *addr, unsigned int block,
- struct ubifs_data_node *dn)
+static int read_block(struct inode *inode, struct folio *folio, size_t offset,
+ unsigned int block, struct ubifs_data_node *dn)
{
struct ubifs_info *c = inode->i_sb->s_fs_info;
int err, len, out_len;
@@ -55,7 +55,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
if (err) {
if (err == -ENOENT)
/* Not found, so it must be a hole */
- memset(addr, 0, UBIFS_BLOCK_SIZE);
+ folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE);
return err;
}
@@ -74,8 +74,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
}
out_len = UBIFS_BLOCK_SIZE;
- err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
- le16_to_cpu(dn->compr_type));
+ err = ubifs_decompress_folio(c, &dn->data, dlen, folio, offset,
+ &out_len, le16_to_cpu(dn->compr_type));
if (err || len != out_len)
goto dump;
@@ -85,7 +85,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
* appending data). Ensure that the remainder is zeroed out.
*/
if (len < UBIFS_BLOCK_SIZE)
- memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+ folio_zero_range(folio, offset + len, UBIFS_BLOCK_SIZE - len);
return 0;
@@ -98,27 +98,25 @@ dump:
static int do_readpage(struct folio *folio)
{
- void *addr;
int err = 0, i;
unsigned int block, beyond;
struct ubifs_data_node *dn = NULL;
struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
loff_t i_size = i_size_read(inode);
+ size_t offset = 0;
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
inode->i_ino, folio->index, i_size, folio->flags);
ubifs_assert(c, !folio_test_checked(folio));
ubifs_assert(c, !folio->private);
- addr = kmap_local_folio(folio, 0);
-
block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
if (block >= beyond) {
/* Reading beyond inode */
folio_set_checked(folio);
- addr = folio_zero_tail(folio, 0, addr);
+ folio_zero_range(folio, 0, folio_size(folio));
goto out;
}
@@ -135,9 +133,9 @@ static int do_readpage(struct folio *folio)
if (block >= beyond) {
/* Reading beyond inode */
err = -ENOENT;
- memset(addr, 0, UBIFS_BLOCK_SIZE);
+ folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE);
} else {
- ret = read_block(inode, addr, block, dn);
+ ret = read_block(inode, folio, offset, block, dn);
if (ret) {
err = ret;
if (err != -ENOENT)
@@ -147,17 +145,13 @@ static int do_readpage(struct folio *folio)
int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);
if (ilen && ilen < dlen)
- memset(addr + ilen, 0, dlen - ilen);
+ folio_zero_range(folio, offset + ilen, dlen - ilen);
}
}
if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio)))
break;
block += 1;
- addr += UBIFS_BLOCK_SIZE;
- if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
- kunmap_local(addr - UBIFS_BLOCK_SIZE);
- addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
- }
+ offset += UBIFS_BLOCK_SIZE;
}
if (err) {
@@ -177,8 +171,6 @@ out:
kfree(dn);
if (!err)
folio_mark_uptodate(folio);
- flush_dcache_folio(folio);
- kunmap_local(addr);
return err;
}
@@ -412,7 +404,8 @@ static int allocate_budget(struct ubifs_info *c, struct folio *folio,
* there is a plenty of flash space and the budget will be acquired quickly,
* without forcing write-back. The slow path does not make this assumption.
*/
-static int ubifs_write_begin(struct file *file, struct address_space *mapping,
+static int ubifs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
@@ -522,8 +515,9 @@ static void cancel_budget(struct ubifs_info *c, struct folio *folio,
}
}
-static int ubifs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
+static int ubifs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping, loff_t pos,
+ unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
@@ -602,18 +596,16 @@ static int populate_page(struct ubifs_info *c, struct folio *folio,
struct inode *inode = folio->mapping->host;
loff_t i_size = i_size_read(inode);
unsigned int page_block;
- void *addr, *zaddr;
+ size_t offset = 0;
pgoff_t end_index;
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
inode->i_ino, folio->index, i_size, folio->flags);
- addr = zaddr = kmap_local_folio(folio, 0);
-
end_index = (i_size - 1) >> PAGE_SHIFT;
if (!i_size || folio->index > end_index) {
hole = 1;
- addr = folio_zero_tail(folio, 0, addr);
+ folio_zero_range(folio, 0, folio_size(folio));
goto out_hole;
}
@@ -623,7 +615,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio,
if (nn >= bu->cnt) {
hole = 1;
- memset(addr, 0, UBIFS_BLOCK_SIZE);
+ folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE);
} else if (key_block(c, &bu->zbranch[nn].key) == page_block) {
struct ubifs_data_node *dn;
@@ -645,13 +637,15 @@ static int populate_page(struct ubifs_info *c, struct folio *folio,
goto out_err;
}
- err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len,
- le16_to_cpu(dn->compr_type));
+ err = ubifs_decompress_folio(
+ c, &dn->data, dlen, folio, offset, &out_len,
+ le16_to_cpu(dn->compr_type));
if (err || len != out_len)
goto out_err;
if (len < UBIFS_BLOCK_SIZE)
- memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+ folio_zero_range(folio, offset + len,
+ UBIFS_BLOCK_SIZE - len);
nn += 1;
read = (i << UBIFS_BLOCK_SHIFT) + len;
@@ -660,23 +654,19 @@ static int populate_page(struct ubifs_info *c, struct folio *folio,
continue;
} else {
hole = 1;
- memset(addr, 0, UBIFS_BLOCK_SIZE);
+ folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE);
}
if (++i >= UBIFS_BLOCKS_PER_PAGE)
break;
- addr += UBIFS_BLOCK_SIZE;
+ offset += UBIFS_BLOCK_SIZE;
page_block += 1;
- if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
- kunmap_local(addr - UBIFS_BLOCK_SIZE);
- addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
- }
}
if (end_index == folio->index) {
int len = i_size & (PAGE_SIZE - 1);
if (len && len < read)
- memset(zaddr + len, 0, read - len);
+ folio_zero_range(folio, len, read - len);
}
out_hole:
@@ -686,14 +676,10 @@ out_hole:
}
folio_mark_uptodate(folio);
- flush_dcache_folio(folio);
- kunmap_local(addr);
*n = nn;
return 0;
out_err:
- flush_dcache_folio(folio);
- kunmap_local(addr);
ubifs_err(c, "bad data node (block %u, inode %lu)",
page_block, inode->i_ino);
return -EINVAL;
@@ -898,7 +884,6 @@ static int do_writepage(struct folio *folio, size_t len)
{
int err = 0, blen;
unsigned int block;
- void *addr;
size_t offset = 0;
union ubifs_key key;
struct inode *inode = folio->mapping->host;
@@ -913,26 +898,19 @@ static int do_writepage(struct folio *folio, size_t len)
folio_start_writeback(folio);
- addr = kmap_local_folio(folio, offset);
block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
for (;;) {
blen = min_t(size_t, len, UBIFS_BLOCK_SIZE);
data_key_init(c, &key, inode->i_ino, block);
- err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
+ err = ubifs_jnl_write_data(c, inode, &key, folio, offset, blen);
if (err)
break;
len -= blen;
if (!len)
break;
block += 1;
- addr += blen;
- if (folio_test_highmem(folio) && !offset_in_page(addr)) {
- kunmap_local(addr - blen);
- offset += PAGE_SIZE;
- addr = kmap_local_folio(folio, offset);
- }
+ offset += blen;
}
- kunmap_local(addr);
if (err) {
mapping_set_error(folio->mapping, err);
ubifs_err(c, "cannot write folio %lu of inode %lu, error %d",
@@ -1001,8 +979,7 @@ static int do_writepage(struct folio *folio, size_t len)
* on the page lock and it would not write the truncated inode node to the
* journal before we have finished.
*/
-static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc,
- void *data)
+static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc)
{
struct inode *inode = folio->mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -1074,7 +1051,12 @@ out_unlock:
static int ubifs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- return write_cache_pages(mapping, wbc, ubifs_writepage, NULL);
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error)))
+ error = ubifs_writepage(folio, wbc);
+ return error;
}
/**
@@ -1603,17 +1585,17 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
.page_mkwrite = ubifs_vm_page_mkwrite,
};
-static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int ubifs_file_mmap_prepare(struct vm_area_desc *desc)
{
int err;
- err = generic_file_mmap(file, vma);
+ err = generic_file_mmap_prepare(desc);
if (err)
return err;
- vma->vm_ops = &ubifs_file_vm_ops;
+ desc->vm_ops = &ubifs_file_vm_ops;
if (IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
- file_accessed(file);
+ file_accessed(desc->file);
return 0;
}
@@ -1676,7 +1658,7 @@ const struct file_operations ubifs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = ubifs_write_iter,
- .mmap = ubifs_file_mmap,
+ .mmap_prepare = ubifs_file_mmap_prepare,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
.splice_read = filemap_splice_read,
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 01d8eb170382..a79f229df475 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
wbuf->c = c;
wbuf->next_ino = 0;
- hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- wbuf->timer.function = wbuf_timer_callback_nolock;
+ hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
return 0;
}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 2c99349cf537..79536b2e3d7a 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -130,7 +130,7 @@ static int setflags(struct inode *inode, int flags)
return err;
}
-int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
int flags = ubifs2ioctl(ubifs_inode(inode)->flags);
@@ -145,7 +145,7 @@ int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
}
int ubifs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa)
+ struct dentry *dentry, struct file_kattr *fa)
{
struct inode *inode = d_inode(dentry);
int flags = fa->flags;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 36ba79fbd2ff..e28ab4395e5c 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -845,14 +845,16 @@ out_ro:
* @c: UBIFS file-system description object
* @inode: inode the data node belongs to
* @key: node key
- * @buf: buffer to write
+ * @folio: buffer to write
+ * @offset: offset to write at
* @len: data length (must not exceed %UBIFS_BLOCK_SIZE)
*
* This function writes a data node to the journal. Returns %0 if the data node
* was successfully written, and a negative error code in case of failure.
*/
int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
- const union ubifs_key *key, const void *buf, int len)
+ const union ubifs_key *key, struct folio *folio,
+ size_t offset, int len)
{
struct ubifs_data_node *data;
int err, lnum, offs, compr_type, out_len, compr_len, auth_len;
@@ -896,7 +898,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
compr_type = ui->compr_type;
out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ;
- ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type);
+ ubifs_compress_folio(c, folio, offset, len, &data->data, &compr_len,
+ &compr_type);
ubifs_assert(c, compr_len <= UBIFS_BLOCK_SIZE);
if (encrypted) {
@@ -982,7 +985,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink);
if (kill_xattrs && ui->xattr_cnt > ubifs_xattr_max_cnt(c)) {
- ubifs_err(c, "Cannot delete inode, it has too much xattrs!");
+ ubifs_err(c, "Cannot delete inode, it has too many xattrs!");
err = -EPERM;
ubifs_ro_mode(c, err);
return err;
@@ -1625,7 +1628,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
int err, dlen, compr_type, out_len, data_size;
out_len = le32_to_cpu(dn->size);
- buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS);
+ buf = kmalloc(out_len, GFP_NOFS);
if (!buf)
return -ENOMEM;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 3375bbe0508c..5db45c9e26ee 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -124,13 +124,6 @@
#define OLD_ZNODE_AGE 20
#define YOUNG_ZNODE_AGE 5
-/*
- * Some compressors, like LZO, may end up with more data then the input buffer.
- * So UBIFS always allocates larger output buffer, to be sure the compressor
- * will not corrupt memory in case of worst case compression.
- */
-#define WORST_COMPR_FACTOR 2
-
#ifdef CONFIG_FS_ENCRYPTION
#define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT
#else
@@ -141,7 +134,7 @@
* How much memory is needed for a buffer where we compress a data node.
*/
#define COMPRESSED_DATA_NODE_BUF_SZ \
- (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
+ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE)
/* Maximum expected tree height for use by bottom_up_buf */
#define BOTTOM_UP_HEIGHT 64
@@ -270,6 +263,8 @@ enum {
ASSACT_PANIC,
};
+struct folio;
+
/**
* struct ubifs_old_idx - index node obsoleted since last commit start.
* @rb: rb-tree node
@@ -835,16 +830,12 @@ struct ubifs_node_range {
* struct ubifs_compressor - UBIFS compressor description structure.
* @compr_type: compressor type (%UBIFS_COMPR_LZO, etc)
* @cc: cryptoapi compressor handle
- * @comp_mutex: mutex used during compression
- * @decomp_mutex: mutex used during decompression
* @name: compressor name
* @capi_name: cryptoapi compressor name
*/
struct ubifs_compressor {
int compr_type;
- struct crypto_comp *cc;
- struct mutex *comp_mutex;
- struct mutex *decomp_mutex;
+ struct crypto_acomp *cc;
const char *name;
const char *capi_name;
};
@@ -1795,7 +1786,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent, int in_orphan);
int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
- const union ubifs_key *key, const void *buf, int len);
+ const union ubifs_key *key, struct folio *folio,
+ size_t offset, int len);
int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode);
int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir,
@@ -2081,9 +2073,9 @@ int ubifs_recover_size(struct ubifs_info *c, bool in_place);
void ubifs_destroy_size_tree(struct ubifs_info *c);
/* ioctl.c */
-int ubifs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int ubifs_fileattr_get(struct dentry *dentry, struct file_kattr *fa);
int ubifs_fileattr_set(struct mnt_idmap *idmap,
- struct dentry *dentry, struct fileattr *fa);
+ struct dentry *dentry, struct file_kattr *fa);
long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
void ubifs_set_inode_flags(struct inode *inode);
#ifdef CONFIG_COMPAT
@@ -2095,8 +2087,14 @@ int __init ubifs_compressors_init(void);
void ubifs_compressors_exit(void);
void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len,
void *out_buf, int *out_len, int *compr_type);
+void ubifs_compress_folio(const struct ubifs_info *c, struct folio *folio,
+ size_t offset, int in_len, void *out_buf,
+ int *out_len, int *compr_type);
int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
void *out, int *out_len, int compr_type);
+int ubifs_decompress_folio(const struct ubifs_info *c, const void *buf,
+ int len, struct folio *folio, size_t offset,
+ int *out_len, int compr_type);
/* sysfs.c */
int ubifs_sysfs_init(void);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 412fe7c4d348..0d76c4f37b3e 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -69,7 +69,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
goto out_unlock;
}
- block_commit_write(&folio->page, 0, end);
+ block_commit_write(folio, 0, end);
out_dirty:
folio_mark_dirty(folio);
folio_wait_stable(folio);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 70c907fe8af9..f24aa98e6869 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -181,19 +181,23 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int udf_adinicb_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+static int udf_adinicb_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct inode *inode = folio->mapping->host;
+ struct inode *inode = mapping->host;
struct udf_inode_info *iinfo = UDF_I(inode);
+ struct folio *folio = NULL;
+ int error = 0;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ BUG_ON(!folio_test_locked(folio));
+ BUG_ON(folio->index != 0);
+ memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio,
+ 0, i_size_read(inode));
+ folio_unlock(folio);
+ }
- BUG_ON(!folio_test_locked(folio));
- BUG_ON(folio->index != 0);
- memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0,
- i_size_read(inode));
- folio_unlock(folio);
mark_inode_dirty(inode);
-
return 0;
}
@@ -203,9 +207,9 @@ static int udf_writepages(struct address_space *mapping,
struct inode *inode = mapping->host;
struct udf_inode_info *iinfo = UDF_I(inode);
- if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
- return mpage_writepages(mapping, wbc, udf_get_block_wb);
- return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL);
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+ return udf_adinicb_writepages(mapping, wbc);
+ return mpage_writepages(mapping, wbc, udf_get_block_wb);
}
static void udf_adinicb_read_folio(struct folio *folio)
@@ -244,10 +248,12 @@ static void udf_readahead(struct readahead_control *rac)
mpage_readahead(rac, udf_get_block);
}
-static int udf_write_begin(struct file *file, struct address_space *mapping,
+static int udf_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len,
struct folio **foliop, void **fsdata)
{
+ struct file *file = iocb->ki_filp;
struct udf_inode_info *iinfo = UDF_I(file_inode(file));
struct folio *folio;
int ret;
@@ -271,15 +277,16 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
-static int udf_write_end(struct file *file, struct address_space *mapping,
+static int udf_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct folio *folio, void *fsdata)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = file_inode(iocb->ki_filp);
loff_t last_pos;
if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB)
- return generic_write_end(file, mapping, pos, len, copied, folio,
+ return generic_write_end(iocb, mapping, pos, len, copied, folio,
fsdata);
last_pos = pos + copied;
if (last_pos > inode->i_size)
@@ -810,6 +817,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map)
}
map->oflags = UDF_BLK_MAPPED;
map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
+ ret = 0;
goto out_free;
}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2cb49b6b0716..5f2e9a892bff 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -419,8 +419,8 @@ static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir,
return udf_add_nondir(dentry, inode);
}
-static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
struct udf_fileident_iter iter;
@@ -430,7 +430,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inode = udf_new_inode(dir, S_IFDIR | mode);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
iinfo = UDF_I(inode);
inode->i_op = &udf_dir_inode_operations;
@@ -439,7 +439,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (err) {
clear_nlink(inode);
discard_new_inode(inode);
- return err;
+ return ERR_PTR(err);
}
set_nlink(inode, 2);
iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
@@ -456,7 +456,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
if (err) {
clear_nlink(inode);
discard_new_inode(inode);
- return err;
+ return ERR_PTR(err);
}
iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
@@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
mark_inode_dirty(dir);
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
}
static int empty_dir(struct inode *dir)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1c8a736b3309..b2f168b0a0d1 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1440,7 +1440,7 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
struct genericPartitionMap *gpm;
uint16_t ident;
struct buffer_head *bh;
- unsigned int table_len;
+ unsigned int table_len, part_map_count;
int ret;
bh = udf_read_tagged(sb, block, block, &ident);
@@ -1461,7 +1461,16 @@ static int udf_load_logicalvol(struct super_block *sb, sector_t block,
"logical volume");
if (ret)
goto out_bh;
- ret = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
+
+ part_map_count = le32_to_cpu(lvd->numPartitionMaps);
+ if (part_map_count > table_len / sizeof(struct genericPartitionMap1)) {
+ udf_err(sb, "error loading logical volume descriptor: "
+ "Too many partition maps (%u > %u)\n", part_map_count,
+ table_len / (unsigned)sizeof(struct genericPartitionMap1));
+ ret = -EIO;
+ goto out_bh;
+ }
+ ret = udf_sb_alloc_partition_maps(sb, part_map_count);
if (ret)
goto out_bh;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 4f33a4a48886..b4071c9cf8c9 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -115,7 +115,7 @@ void udf_truncate_tail_extent(struct inode *inode)
}
/* This inode entry is in-memory only and thus we don't have to mark
* the inode dirty */
- if (ret == 0)
+ if (ret >= 0)
iinfo->i_lenExtents = inode->i_size;
brelse(epos.bh);
}
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 88d0062cfdb9..0388a1bae326 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -48,7 +48,7 @@ static void ufs_commit_chunk(struct folio *folio, loff_t pos, unsigned len)
struct inode *dir = mapping->host;
inode_inc_iversion(dir);
- block_write_end(NULL, mapping, pos, len, len, folio, NULL);
+ block_write_end(pos, len, len, folio);
if (pos+len > dir->i_size) {
i_size_write(dir, pos+len);
mark_inode_dirty(dir);
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 487ad1fc2de6..c2a391c17df7 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -38,7 +38,7 @@ const struct file_operations ufs_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = generic_file_mmap,
+ .mmap_prepare = generic_file_mmap_prepare,
.open = generic_file_open,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7dc38fdef2ea..8361c00e8fa6 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -474,9 +474,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
}
}
-static int ufs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct folio **foliop, void **fsdata)
+static int ufs_write_begin(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len,
+ struct folio **foliop, void **fsdata)
{
int ret;
@@ -487,13 +488,14 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
return ret;
}
-static int ufs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
+static int ufs_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct folio *folio, void *fsdata)
{
int ret;
- ret = generic_write_end(file, mapping, pos, len, copied, folio, fsdata);
+ ret = generic_write_end(iocb, mapping, pos, len, copied, folio, fsdata);
if (ret < len)
ufs_write_failed(mapping, pos + len);
return ret;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 38a024c8cccd..5b3c85c93242 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -166,8 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
return error;
}
-static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
- struct dentry * dentry, umode_t mode)
+static struct dentry *ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
+ struct dentry * dentry, umode_t mode)
{
struct inode * inode;
int err;
@@ -194,7 +194,7 @@ static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir,
goto out_fail;
d_instantiate_new(dentry, inode);
- return 0;
+ return NULL;
out_fail:
inode_dec_link_count(inode);
@@ -202,7 +202,7 @@ out_fail:
discard_new_inode(inode);
out_dir:
inode_dec_link_count(dir);
- return err;
+ return ERR_PTR(err);
}
static int ufs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 762699c1bcf6..6e4585169f94 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -83,11 +83,11 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/log2.h>
-#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/iversion.h>
@@ -289,7 +289,7 @@ void ufs_error (struct super_block * sb, const char * function,
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) {
+ switch (UFS_SB(sb)->s_on_err) {
case UFS_MOUNT_ONERROR_PANIC:
panic("panic (device %s): %s: %pV\n",
sb->s_id, function, &vaf);
@@ -342,124 +342,74 @@ void ufs_warning (struct super_block * sb, const char * function,
va_end(args);
}
-enum {
- Opt_type_old = UFS_MOUNT_UFSTYPE_OLD,
- Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86,
- Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN,
- Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS,
- Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD,
- Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2,
- Opt_type_hp = UFS_MOUNT_UFSTYPE_HP,
- Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD,
- Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP,
- Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP,
- Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC,
- Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK,
- Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT,
- Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR,
- Opt_err
+enum { Opt_type, Opt_onerror };
+
+static const struct constant_table ufs_param_ufstype[] = {
+ {"old", UFS_MOUNT_UFSTYPE_OLD},
+ {"sunx86", UFS_MOUNT_UFSTYPE_SUNx86},
+ {"sun", UFS_MOUNT_UFSTYPE_SUN},
+ {"sunos", UFS_MOUNT_UFSTYPE_SUNOS},
+ {"44bsd", UFS_MOUNT_UFSTYPE_44BSD},
+ {"ufs2", UFS_MOUNT_UFSTYPE_UFS2},
+ {"5xbsd", UFS_MOUNT_UFSTYPE_UFS2},
+ {"hp", UFS_MOUNT_UFSTYPE_HP},
+ {"nextstep-cd", UFS_MOUNT_UFSTYPE_NEXTSTEP_CD},
+ {"nextstep", UFS_MOUNT_UFSTYPE_NEXTSTEP},
+ {"openstep", UFS_MOUNT_UFSTYPE_OPENSTEP},
+ {}
};
-static const match_table_t tokens = {
- {Opt_type_old, "ufstype=old"},
- {Opt_type_sunx86, "ufstype=sunx86"},
- {Opt_type_sun, "ufstype=sun"},
- {Opt_type_sunos, "ufstype=sunos"},
- {Opt_type_44bsd, "ufstype=44bsd"},
- {Opt_type_ufs2, "ufstype=ufs2"},
- {Opt_type_ufs2, "ufstype=5xbsd"},
- {Opt_type_hp, "ufstype=hp"},
- {Opt_type_nextstepcd, "ufstype=nextstep-cd"},
- {Opt_type_nextstep, "ufstype=nextstep"},
- {Opt_type_openstep, "ufstype=openstep"},
-/*end of possible ufs types */
- {Opt_onerror_panic, "onerror=panic"},
- {Opt_onerror_lock, "onerror=lock"},
- {Opt_onerror_umount, "onerror=umount"},
- {Opt_onerror_repair, "onerror=repair"},
- {Opt_err, NULL}
+static const struct constant_table ufs_param_onerror[] = {
+ {"panic", UFS_MOUNT_ONERROR_PANIC},
+ {"lock", UFS_MOUNT_ONERROR_LOCK},
+ {"umount", UFS_MOUNT_ONERROR_UMOUNT},
+ {"repair", UFS_MOUNT_ONERROR_REPAIR},
+ {}
};
-static int ufs_parse_options (char * options, unsigned * mount_options)
+static const struct fs_parameter_spec ufs_param_spec[] = {
+ fsparam_enum ("ufstype", Opt_type, ufs_param_ufstype),
+ fsparam_enum ("onerror", Opt_onerror, ufs_param_onerror),
+ {}
+};
+
+struct ufs_fs_context {
+ unsigned int flavour, on_err;
+};
+
+static int ufs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char * p;
-
+ struct ufs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
UFSD("ENTER\n");
-
- if (!options)
- return 1;
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
+ opt = fs_parse(fc, ufs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_type_old:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_OLD);
- break;
- case Opt_type_sunx86:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_SUNx86);
- break;
- case Opt_type_sun:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_SUN);
- break;
- case Opt_type_sunos:
- ufs_clear_opt(*mount_options, UFSTYPE);
- ufs_set_opt(*mount_options, UFSTYPE_SUNOS);
- break;
- case Opt_type_44bsd:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_44BSD);
- break;
- case Opt_type_ufs2:
- ufs_clear_opt(*mount_options, UFSTYPE);
- ufs_set_opt(*mount_options, UFSTYPE_UFS2);
- break;
- case Opt_type_hp:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_HP);
- break;
- case Opt_type_nextstepcd:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD);
- break;
- case Opt_type_nextstep:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP);
- break;
- case Opt_type_openstep:
- ufs_clear_opt (*mount_options, UFSTYPE);
- ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP);
- break;
- case Opt_onerror_panic:
- ufs_clear_opt (*mount_options, ONERROR);
- ufs_set_opt (*mount_options, ONERROR_PANIC);
- break;
- case Opt_onerror_lock:
- ufs_clear_opt (*mount_options, ONERROR);
- ufs_set_opt (*mount_options, ONERROR_LOCK);
- break;
- case Opt_onerror_umount:
- ufs_clear_opt (*mount_options, ONERROR);
- ufs_set_opt (*mount_options, ONERROR_UMOUNT);
- break;
- case Opt_onerror_repair:
- pr_err("Unable to do repair on error, will lock lock instead\n");
- ufs_clear_opt (*mount_options, ONERROR);
- ufs_set_opt (*mount_options, ONERROR_REPAIR);
- break;
- default:
- pr_err("Invalid option: \"%s\" or missing value\n", p);
+ switch (opt) {
+ case Opt_type:
+ if (ctx->flavour == result.uint_32) /* no-op */
return 0;
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ pr_err("ufstype can't be changed during remount\n");
+ return -EINVAL;
}
+ if (ctx->flavour) {
+ pr_err("conflicting ufstype options\n");
+ return -EINVAL;
+ }
+ ctx->flavour = result.uint_32;
+ break;
+ case Opt_onerror:
+ ctx->on_err = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
/*
@@ -474,7 +424,7 @@ static void ufs_setup_cstotal(struct super_block *sb)
struct ufs_super_block_first *usb1;
struct ufs_super_block_second *usb2;
struct ufs_super_block_third *usb3;
- unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
+ unsigned mtype = sbi->s_flavour;
UFSD("ENTER, mtype=%u\n", mtype);
usb1 = ubh_get_usb_first(uspi);
@@ -580,7 +530,7 @@ failed:
*/
static void ufs_put_cstotal(struct super_block *sb)
{
- unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
+ unsigned mtype = UFS_SB(sb)->s_flavour;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
struct ufs_super_block_first *usb1;
struct ufs_super_block_second *usb2;
@@ -764,8 +714,10 @@ static u64 ufs_max_bytes(struct super_block *sb)
return res << uspi->s_bshift;
}
-static int ufs_fill_super(struct super_block *sb, void *data, int silent)
+static int ufs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct ufs_fs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct ufs_sb_info * sbi;
struct ufs_sb_private_info * uspi;
struct ufs_super_block_first * usb1;
@@ -803,24 +755,18 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
- /*
- * Set default mount options
- * Parse mount options
- */
- sbi->s_mount_opt = 0;
- ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK);
- if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) {
- pr_err("wrong mount options\n");
- goto failed;
- }
- if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) {
+
+ sbi->s_flavour = ctx->flavour;
+ sbi->s_on_err = ctx->on_err;
+
+ if (!sbi->s_flavour) {
if (!silent)
pr_err("You didn't specify the type of your ufs filesystem\n\n"
"mount -t ufs -o ufstype="
"sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n"
">>>WARNING<<< Wrong ufstype may corrupt your filesystem, "
"default is ufstype=old\n");
- ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD);
+ sbi->s_flavour = UFS_MOUNT_UFSTYPE_OLD;
}
uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL);
@@ -836,7 +782,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_min = S32_MIN;
sb->s_time_max = S32_MAX;
- switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) {
+ switch (sbi->s_flavour) {
case UFS_MOUNT_UFSTYPE_44BSD:
UFSD("ufstype=44bsd\n");
uspi->s_fsize = block_size = 512;
@@ -1035,9 +981,9 @@ again:
goto magic_found;
}
- if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP)
- || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD)
- || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP))
+ if ((sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP
+ || sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD
+ || sbi->s_flavour == UFS_MOUNT_UFSTYPE_OPENSTEP)
&& uspi->s_sbbase < 256) {
ubh_brelse_uspi(uspi);
ubh = NULL;
@@ -1237,8 +1183,8 @@ magic_found:
uspi->s_bpf = uspi->s_fsize << 3;
uspi->s_bpfshift = uspi->s_fshift + 3;
uspi->s_bpfmask = uspi->s_bpf - 1;
- if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD ||
- (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2)
+ if (sbi->s_flavour == UFS_MOUNT_UFSTYPE_44BSD ||
+ sbi->s_flavour == UFS_MOUNT_UFSTYPE_UFS2)
uspi->s_maxsymlinklen =
fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
@@ -1290,13 +1236,15 @@ failed_nomem:
return -ENOMEM;
}
-static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
+static int ufs_reconfigure(struct fs_context *fc)
{
struct ufs_sb_private_info * uspi;
struct ufs_super_block_first * usb1;
struct ufs_super_block_third * usb3;
- unsigned new_mount_opt, ufstype;
- unsigned flags;
+ struct ufs_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+ unsigned int ufstype;
+ unsigned int flags;
sync_filesystem(sb);
mutex_lock(&UFS_SB(sb)->s_lock);
@@ -1305,27 +1253,10 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
usb1 = ubh_get_usb_first(uspi);
usb3 = ubh_get_usb_third(uspi);
- /*
- * Allow the "check" option to be passed as a remount option.
- * It is not possible to change ufstype option during remount
- */
- ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
- new_mount_opt = 0;
- ufs_set_opt (new_mount_opt, ONERROR_LOCK);
- if (!ufs_parse_options (data, &new_mount_opt)) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
- return -EINVAL;
- }
- if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
- new_mount_opt |= ufstype;
- } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
- pr_err("ufstype can't be changed during remount\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
- return -EINVAL;
- }
+ ufstype = UFS_SB(sb)->s_flavour;
- if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) {
- UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) {
+ UFS_SB(sb)->s_on_err = ctx->on_err;
mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
@@ -1333,7 +1264,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
/*
* fs was mouted as rw, remounting ro
*/
- if (*mount_flags & SB_RDONLY) {
+ if (fc->sb_flags & SB_RDONLY) {
ufs_put_super_internal(sb);
usb1->fs_time = ufs_get_seconds(sb);
if ((flags & UFS_ST_MASK) == UFS_ST_SUN
@@ -1369,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
sb->s_flags &= ~SB_RDONLY;
#endif
}
- UFS_SB(sb)->s_mount_opt = new_mount_opt;
+ UFS_SB(sb)->s_on_err = ctx->on_err;
mutex_unlock(&UFS_SB(sb)->s_lock);
return 0;
}
@@ -1377,19 +1308,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
static int ufs_show_options(struct seq_file *seq, struct dentry *root)
{
struct ufs_sb_info *sbi = UFS_SB(root->d_sb);
- unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE;
- const struct match_token *tp = tokens;
+ unsigned mval = sbi->s_flavour;
+ const struct constant_table *tp;
- while (tp->token != Opt_onerror_panic && tp->token != mval)
+ tp = ufs_param_ufstype;
+ while (tp->value && tp->value != mval)
++tp;
- BUG_ON(tp->token == Opt_onerror_panic);
- seq_printf(seq, ",%s", tp->pattern);
+ seq_printf(seq, ",ufstype=%s", tp->name);
- mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR;
- while (tp->token != Opt_err && tp->token != mval)
+ tp = ufs_param_onerror;
+ mval = sbi->s_on_err;
+ while (tp->value && tp->value != mval)
++tp;
- BUG_ON(tp->token == Opt_err);
- seq_printf(seq, ",%s", tp->pattern);
+ seq_printf(seq, ",onerror=%s", tp->name);
return 0;
}
@@ -1483,21 +1414,57 @@ static const struct super_operations ufs_super_ops = {
.put_super = ufs_put_super,
.sync_fs = ufs_sync_fs,
.statfs = ufs_statfs,
- .remount_fs = ufs_remount,
.show_options = ufs_show_options,
};
-static struct dentry *ufs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int ufs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, ufs_fill_super);
+}
+
+static void ufs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations ufs_context_ops = {
+ .parse_param = ufs_parse_param,
+ .get_tree = ufs_get_tree,
+ .reconfigure = ufs_reconfigure,
+ .free = ufs_free_fc,
+};
+
+static int ufs_init_fs_context(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super);
+ struct ufs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct ufs_sb_info *sbi = UFS_SB(sb);
+
+ ctx->flavour = sbi->s_flavour;
+ ctx->on_err = sbi->s_on_err;
+ } else {
+ ctx->flavour = 0;
+ ctx->on_err = UFS_MOUNT_ONERROR_LOCK;
+ }
+
+ fc->fs_private = ctx;
+ fc->ops = &ufs_context_ops;
+
+ return 0;
}
static struct file_system_type ufs_fs_type = {
.owner = THIS_MODULE,
.name = "ufs",
- .mount = ufs_mount,
.kill_sb = kill_block_super,
+ .init_fs_context = ufs_init_fs_context,
+ .parameters = ufs_param_spec,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("ufs");
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index e7df65dd4351..788e025056b2 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,7 +24,8 @@ struct ufs_sb_info {
struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED];
unsigned s_cgno[UFS_MAX_GROUP_LOADED];
unsigned short s_cg_loaded;
- unsigned s_mount_opt;
+ unsigned s_flavour;
+ unsigned s_on_err;
struct super_block *sb;
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
@@ -52,13 +53,11 @@ struct ufs_inode_info {
};
/* mount options */
-#define UFS_MOUNT_ONERROR 0x0000000F
#define UFS_MOUNT_ONERROR_PANIC 0x00000001
#define UFS_MOUNT_ONERROR_LOCK 0x00000002
#define UFS_MOUNT_ONERROR_UMOUNT 0x00000004
#define UFS_MOUNT_ONERROR_REPAIR 0x00000008
-#define UFS_MOUNT_UFSTYPE 0x0000FFF0
#define UFS_MOUNT_UFSTYPE_OLD 0x00000010
#define UFS_MOUNT_UFSTYPE_44BSD 0x00000020
#define UFS_MOUNT_UFSTYPE_SUN 0x00000040
@@ -70,10 +69,6 @@ struct ufs_inode_info {
#define UFS_MOUNT_UFSTYPE_UFS2 0x00001000
#define UFS_MOUNT_UFSTYPE_SUNOS 0x00002000
-#define ufs_clear_opt(o,opt) o &= ~UFS_MOUNT_##opt
-#define ufs_set_opt(o,opt) o |= UFS_MOUNT_##opt
-#define ufs_test_opt(o,opt) ((o) & UFS_MOUNT_##opt)
-
/*
* Debug code
*/
diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index da786a687fdc..4ad2c36550f1 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -10,6 +10,7 @@ config UNICODE
be a separate loadable module that gets requested only when a file
system actually use it.
-config UNICODE_NORMALIZATION_SELFTEST
+config UNICODE_NORMALIZATION_KUNIT_TEST
tristate "Test UTF-8 normalization support"
- depends on UNICODE
+ depends on UNICODE && KUNIT
+ default KUNIT_ALL_TESTS
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index e309afe2b2bb..d95be7fb9f6b 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),)
obj-y += unicode.o
endif
obj-$(CONFIG_UNICODE) += utf8data.o
-obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
+obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o
unicode-y := utf8-norm.o utf8-core.o
diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig
new file mode 100644
index 000000000000..62dd5c171f9c
--- /dev/null
+++ b/fs/unicode/tests/.kunitconfig
@@ -0,0 +1,3 @@
+CONFIG_KUNIT=y
+CONFIG_UNICODE=y
+CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c
index 5ddaf27b21a6..5063e8138aec 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/tests/utf8_kunit.c
@@ -1,34 +1,14 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Kernel module for testing utf-8 support.
+ * KUnit tests for utf-8 support.
*
* Copyright 2017 Collabora Ltd.
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/printk.h>
#include <linux/unicode.h>
-#include <linux/dcache.h>
-
-#include "utf8n.h"
-
-static unsigned int failed_tests;
-static unsigned int total_tests;
-
-#define _test(cond, func, line, fmt, ...) do { \
- total_tests++; \
- if (!cond) { \
- failed_tests++; \
- pr_err("test %s:%d Failed: %s%s", \
- func, line, #cond, (fmt?":":".")); \
- if (fmt) \
- pr_err(fmt, ##__VA_ARGS__); \
- } \
- } while (0)
-#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
-#define test(cond) _test(cond, __func__, __LINE__, "")
+#include <kunit/test.h>
+
+#include "../utf8n.h"
static const struct {
/* UTF-8 strings in this vector _must_ be NULL-terminated. */
@@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
}
-static void check_utf8_nfdi(struct unicode_map *um)
+static void check_utf8_nfdi(struct kunit *test)
{
int i;
struct utf8cursor u8c;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
int len = strlen(nfdi_test_data[i].str);
int nlen = strlen(nfdi_test_data[i].dec);
int j = 0;
unsigned char c;
+ int ret;
+
+ KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen);
+ KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len),
+ nlen);
- test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
- test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
- nlen));
- if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
- pr_err("can't create cursor\n");
+ ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str);
+ KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
- test_f((c == nfdi_test_data[i].dec[j]),
- "Unexpected byte 0x%x should be 0x%x\n",
- c, nfdi_test_data[i].dec[j]);
+ KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j],
+ "Unexpected byte 0x%x should be 0x%x\n",
+ c, nfdi_test_data[i].dec[j]);
j++;
}
- test((j == nlen));
+ KUNIT_EXPECT_EQ(test, j, nlen);
}
}
-static void check_utf8_nfdicf(struct unicode_map *um)
+static void check_utf8_nfdicf(struct kunit *test)
{
int i;
struct utf8cursor u8c;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
int len = strlen(nfdicf_test_data[i].str);
int nlen = strlen(nfdicf_test_data[i].ncf);
int j = 0;
+ int ret;
unsigned char c;
- test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
- nlen));
- test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
- nlen));
+ KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str),
+ nlen);
+ KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len),
+ nlen);
- if (utf8cursor(&u8c, um, UTF8_NFDICF,
- nfdicf_test_data[i].str) < 0)
- pr_err("can't create cursor\n");
+ ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str);
+ KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
while ((c = utf8byte(&u8c)) > 0) {
- test_f((c == nfdicf_test_data[i].ncf[j]),
- "Unexpected byte 0x%x should be 0x%x\n",
- c, nfdicf_test_data[i].ncf[j]);
+ KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j],
+ "Unexpected byte 0x%x should be 0x%x\n",
+ c, nfdicf_test_data[i].ncf[j]);
j++;
}
- test((j == nlen));
+ KUNIT_EXPECT_EQ(test, j, nlen);
}
}
-static void check_utf8_comparisons(struct unicode_map *table)
+static void check_utf8_comparisons(struct kunit *test)
{
int i;
+ struct unicode_map *um = test->priv;
for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
const struct qstr s1 = {.name = nfdi_test_data[i].str,
@@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table)
const struct qstr s2 = {.name = nfdi_test_data[i].dec,
.len = sizeof(nfdi_test_data[i].dec)};
- test_f(!utf8_strncmp(table, &s1, &s2),
- "%s %s comparison mismatch\n", s1.name, s2.name);
+ /* strncmp returns 0 when strings are equal */
+ KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0,
+ "%s %s comparison mismatch\n", s1.name, s2.name);
}
for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
@@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table)
const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
.len = sizeof(nfdicf_test_data[i].ncf)};
- test_f(!utf8_strncasecmp(table, &s1, &s2),
- "%s %s comparison mismatch\n", s1.name, s2.name);
+ /* strncasecmp returns 0 when strings are equal */
+ KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0,
+ "%s %s comparison mismatch\n", s1.name, s2.name);
}
}
-static void check_supported_versions(struct unicode_map *um)
+static void check_supported_versions(struct kunit *test)
{
+ struct unicode_map *um = test->priv;
/* Unicode 7.0.0 should be supported. */
- test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
/* Unicode 9.0.0 should be supported. */
- test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
/* Unicode 1x.0.0 (the latest version) should be supported. */
- test(utf8version_is_supported(um, UTF8_LATEST));
+ KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST));
/* Next versions don't exist. */
- test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
- test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
- test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
+ KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
}
-static int __init init_test_ucd(void)
+static struct kunit_case unicode_normalization_test_cases[] = {
+ KUNIT_CASE(check_supported_versions),
+ KUNIT_CASE(check_utf8_comparisons),
+ KUNIT_CASE(check_utf8_nfdicf),
+ KUNIT_CASE(check_utf8_nfdi),
+ {}
+};
+
+static int init_test_ucd(struct kunit *test)
{
- struct unicode_map *um;
+ struct unicode_map *um = utf8_load(UTF8_LATEST);
- failed_tests = 0;
- total_tests = 0;
+ test->priv = um;
- um = utf8_load(UTF8_LATEST);
- if (IS_ERR(um)) {
- pr_err("%s: Unable to load utf8 table.\n", __func__);
- return PTR_ERR(um);
- }
+ KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0,
+ "%s: Unable to load utf8 table.\n", __func__);
- check_supported_versions(um);
- check_utf8_nfdi(um);
- check_utf8_nfdicf(um);
- check_utf8_comparisons(um);
-
- if (!failed_tests)
- pr_info("All %u tests passed\n", total_tests);
- else
- pr_err("%u out of %u tests failed\n", failed_tests,
- total_tests);
- utf8_unload(um);
return 0;
}
-static void __exit exit_test_ucd(void)
+static void exit_test_ucd(struct kunit *test)
{
+ utf8_unload(test->priv);
}
-module_init(init_test_ucd);
-module_exit(exit_test_ucd);
+static struct kunit_suite unicode_normalization_test_suite = {
+ .name = "unicode_normalization",
+ .test_cases = unicode_normalization_test_cases,
+ .init = init_test_ucd,
+ .exit = exit_test_ucd,
+};
+
+kunit_test_suite(unicode_normalization_test_suite);
+
MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
-MODULE_DESCRIPTION("Kernel module for testing utf-8 support");
+MODULE_DESCRIPTION("KUnit tests for utf-8 support.");
MODULE_LICENSE("GPL");
diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c
index 768f8ab448b8..7b998c99c88d 100644
--- a/fs/unicode/utf8-norm.c
+++ b/fs/unicode/utf8-norm.c
@@ -586,7 +586,7 @@ ccc_mismatch:
}
}
-#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE
+#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST)
EXPORT_SYMBOL_GPL(utf8version_is_supported);
EXPORT_SYMBOL_GPL(utf8nlen);
EXPORT_SYMBOL_GPL(utf8ncursor);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 97c4d71115d8..54c6cc7fe9c6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -165,14 +165,14 @@ static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
if (refcount_dec_and_test(&ctx->refcount)) {
- VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
- VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
- VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_pending_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_pending_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fault_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fault_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->event_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->event_wqh));
+ VM_WARN_ON_ONCE(spin_is_locked(&ctx->fd_wqh.lock));
+ VM_WARN_ON_ONCE(waitqueue_active(&ctx->fd_wqh));
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}
@@ -304,7 +304,7 @@ again:
goto out;
ret = false;
- if (!pmd_present(_pmd) || pmd_devmap(_pmd))
+ if (!pmd_present(_pmd))
goto out;
if (pmd_trans_huge(_pmd)) {
@@ -383,12 +383,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (!ctx)
goto out;
- BUG_ON(ctx->mm != mm);
+ VM_WARN_ON_ONCE(ctx->mm != mm);
/* Any unrecognized flag is a bug. */
- VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
+ VM_WARN_ON_ONCE(reason & ~__VM_UFFD_FLAGS);
/* 0 or > 1 flags set is a bug; we expect exactly 1. */
- VM_BUG_ON(!reason || (reason & (reason - 1)));
+ VM_WARN_ON_ONCE(!reason || (reason & (reason - 1)));
if (ctx->features & UFFD_FEATURE_SIGBUS)
goto out;
@@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
goto out;
/*
- * If it's already released don't get it. This avoids to loop
- * in __get_user_pages if userfaultfd_release waits on the
- * caller of handle_userfault to release the mmap_lock.
- */
- if (unlikely(READ_ONCE(ctx->released))) {
- /*
- * Don't return VM_FAULT_SIGBUS in this case, so a non
- * cooperative manager can close the uffd after the
- * last UFFDIO_COPY, without risking to trigger an
- * involuntary SIGBUS if the process was starting the
- * userfaultfd while the userfaultfd was still armed
- * (but after the last UFFDIO_COPY). If the uffd
- * wasn't already closed when the userfault reached
- * this point, that would normally be solved by
- * userfaultfd_must_wait returning 'false'.
- *
- * If we were to return VM_FAULT_SIGBUS here, the non
- * cooperative manager would be instead forced to
- * always call UFFDIO_UNREGISTER before it can safely
- * close the uffd.
- */
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- /*
* Check that we can return VM_FAULT_RETRY.
*
* NOTE: it should become possible to return VM_FAULT_RETRY
@@ -437,12 +411,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
* to be sure not to return SIGBUS erroneously on
* nowait invocations.
*/
- BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
+ VM_WARN_ON_ONCE(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
if (printk_ratelimit()) {
- printk(KERN_WARNING
- "FAULT_FLAG_ALLOW_RETRY missing %x\n",
- vmf->flags);
+ pr_warn("FAULT_FLAG_ALLOW_RETRY missing %x\n",
+ vmf->flags);
dump_stack();
}
#endif
@@ -457,6 +430,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
goto out;
+ if (unlikely(READ_ONCE(ctx->released))) {
+ /*
+ * If a concurrent release is detected, do not return
+ * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
+ * return VM_FAULT_RETRY with lock released proactively.
+ *
+ * If we were to return VM_FAULT_SIGBUS here, the non
+ * cooperative manager would be instead forced to
+ * always call UFFDIO_UNREGISTER before it can safely
+ * close the uffd, to avoid involuntary SIGBUS triggered.
+ *
+ * If we were to return VM_FAULT_NOPAGE, it would work for
+ * the fault path, in which the lock will be released
+ * later. However for GUP, faultin_page() does nothing
+ * special on NOPAGE, so GUP would spin retrying without
+ * releasing the mmap read lock, causing possible livelock.
+ *
+ * Here only VM_FAULT_RETRY would make sure the mmap lock
+ * be released immediately, so that the thread concurrently
+ * releasing the userfault would always make progress.
+ */
+ release_fault_lock(vmf);
+ goto out;
+ }
+
/* take the reference before dropping the mmap_lock */
userfaultfd_ctx_get(ctx);
@@ -603,7 +601,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
*/
out:
atomic_dec(&ctx->mmap_changing);
- VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&ctx->mmap_changing) < 0);
userfaultfd_ctx_put(ctx);
}
@@ -711,7 +709,7 @@ void dup_userfaultfd_fail(struct list_head *fcs)
struct userfaultfd_ctx *ctx = fctx->new;
atomic_dec(&octx->mmap_changing);
- VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+ VM_WARN_ON_ONCE(atomic_read(&octx->mmap_changing) < 0);
userfaultfd_ctx_put(octx);
userfaultfd_ctx_put(ctx);
@@ -752,11 +750,6 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
if (!ctx)
return;
- if (to & ~PAGE_MASK) {
- userfaultfd_ctx_put(ctx);
- return;
- }
-
msg_init(&ewq.msg);
ewq.msg.event = UFFD_EVENT_REMAP;
@@ -767,6 +760,16 @@ void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
userfaultfd_event_wait_completion(ctx, &ewq);
}
+void mremap_userfaultfd_fail(struct vm_userfaultfd_ctx *vm_ctx)
+{
+ struct userfaultfd_ctx *ctx = vm_ctx->ctx;
+
+ if (!ctx)
+ return;
+
+ userfaultfd_ctx_put(ctx);
+}
+
bool userfaultfd_remove(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
@@ -1244,7 +1247,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
int ret;
struct uffdio_register uffdio_register;
struct uffdio_register __user *user_uffdio_register;
- unsigned long vm_flags;
+ vm_flags_t vm_flags;
bool found;
bool basic_ioctls;
unsigned long start, end;
@@ -1318,8 +1321,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
/* check not compatible vmas */
ret = -EINVAL;
@@ -1373,7 +1376,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
wp_async);
@@ -1465,8 +1468,16 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
do {
cond_resched();
- BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
- !!(cur->vm_flags & __VM_UFFD_FLAGS));
+ VM_WARN_ON_ONCE(!!cur->vm_userfaultfd_ctx.ctx ^
+ !!(cur->vm_flags & __VM_UFFD_FLAGS));
+
+ /*
+ * Prevent unregistering through a different userfaultfd than
+ * the one used for registration.
+ */
+ if (cur->vm_userfaultfd_ctx.ctx &&
+ cur->vm_userfaultfd_ctx.ctx != ctx)
+ goto out_unlock;
/*
* Check not compatible vmas, not strictly required
@@ -1480,7 +1491,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
found = true;
} for_each_vma_range(vmi, cur, end);
- BUG_ON(!found);
+ VM_WARN_ON_ONCE(!found);
vma_iter_set(&vmi, start);
prev = vma_prev(&vmi);
@@ -1491,16 +1502,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));
-
- /*
- * Nothing to do: this vma is already registered into this
- * userfaultfd and with the right tracking mode too.
- */
+ /* VMA not registered with userfaultfd. */
if (!vma->vm_userfaultfd_ctx.ctx)
goto skip;
- WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
if (vma->vm_start > start)
start = vma->vm_start;
@@ -1565,7 +1573,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
* len == 0 means wake all and we don't want to wake all here,
* so check it again to be sure.
*/
- VM_BUG_ON(!range.len);
+ VM_WARN_ON_ONCE(!range.len);
wake_userfault(ctx, &range);
ret = 0;
@@ -1586,8 +1594,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
@@ -1619,7 +1630,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
return -EFAULT;
if (ret < 0)
goto out;
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
/* len == 0 would wake all */
range.len = ret;
if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
@@ -1642,8 +1653,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
@@ -1671,7 +1685,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (ret < 0)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
range.start = uffdio_zeropage.range.start;
@@ -1745,8 +1759,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
user_uffdio_continue = (struct uffdio_continue __user *)arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_continue, user_uffdio_continue,
@@ -1780,7 +1797,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
range.start = uffdio_continue.range.start;
@@ -1802,8 +1819,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
user_uffdio_poison = (struct uffdio_poison __user *)arg;
ret = -EAGAIN;
- if (atomic_read(&ctx->mmap_changing))
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
+ return -EFAULT;
goto out;
+ }
ret = -EFAULT;
if (copy_from_user(&uffdio_poison, user_uffdio_poison,
@@ -1834,7 +1854,7 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;
/* len == 0 would wake all */
- BUG_ON(!ret);
+ VM_WARN_ON_ONCE(!ret);
range.len = ret;
if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
range.start = uffdio_poison.range.start;
@@ -1871,8 +1891,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
user_uffdio_move = (struct uffdio_move __user *) arg;
- if (atomic_read(&ctx->mmap_changing))
- return -EAGAIN;
+ ret = -EAGAIN;
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ goto out;
+ }
if (copy_from_user(&uffdio_move, user_uffdio_move,
/* don't copy "move" last field */
@@ -2091,12 +2115,10 @@ static int new_userfaultfd(int flags)
struct file *file;
int fd;
- BUG_ON(!current->mm);
+ VM_WARN_ON_ONCE(!current->mm);
/* Check the UFFD_* constants for consistency. */
BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
- BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
- BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);
if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
return -EINVAL;
diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c
index a859ac9b74ba..770e29ec3557 100644
--- a/fs/vboxsf/dir.c
+++ b/fs/vboxsf/dir.c
@@ -303,11 +303,11 @@ static int vboxsf_dir_mkfile(struct mnt_idmap *idmap,
return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL);
}
-static int vboxsf_dir_mkdir(struct mnt_idmap *idmap,
- struct inode *parent, struct dentry *dentry,
- umode_t mode)
+static struct dentry *vboxsf_dir_mkdir(struct mnt_idmap *idmap,
+ struct inode *parent, struct dentry *dentry,
+ umode_t mode)
{
- return vboxsf_dir_create(parent, dentry, mode, true, true, NULL);
+ return ERR_PTR(vboxsf_dir_create(parent, dentry, mode, true, true, NULL));
}
static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry,
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index b780deb81b02..4bebd947314a 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -165,13 +165,13 @@ static const struct vm_operations_struct vboxsf_file_vm_ops = {
.map_pages = filemap_map_pages,
};
-static int vboxsf_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int vboxsf_file_mmap_prepare(struct vm_area_desc *desc)
{
int err;
- err = generic_file_mmap(file, vma);
+ err = generic_file_mmap_prepare(desc);
if (!err)
- vma->vm_ops = &vboxsf_file_vm_ops;
+ desc->vm_ops = &vboxsf_file_vm_ops;
return err;
}
@@ -213,7 +213,7 @@ const struct file_operations vboxsf_reg_fops = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
- .mmap = vboxsf_file_mmap,
+ .mmap_prepare = vboxsf_file_mmap_prepare,
.open = vboxsf_file_open,
.release = vboxsf_file_release,
.fsync = noop_fsync,
@@ -262,48 +262,51 @@ static struct vboxsf_handle *vboxsf_get_write_handle(struct vboxsf_inode *sf_i)
return sf_handle;
}
-static int vboxsf_writepage(struct page *page, struct writeback_control *wbc)
+static int vboxsf_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = mapping->host;
+ struct folio *folio = NULL;
struct vboxsf_inode *sf_i = VBOXSF_I(inode);
struct vboxsf_handle *sf_handle;
- loff_t off = page_offset(page);
loff_t size = i_size_read(inode);
- u32 nwrite = PAGE_SIZE;
- u8 *buf;
- int err;
-
- if (off + PAGE_SIZE > size)
- nwrite = size & ~PAGE_MASK;
+ int error;
sf_handle = vboxsf_get_write_handle(sf_i);
if (!sf_handle)
return -EBADF;
- buf = kmap(page);
- err = vboxsf_write(sf_handle->root, sf_handle->handle,
- off, &nwrite, buf);
- kunmap(page);
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ loff_t off = folio_pos(folio);
+ u32 nwrite = folio_size(folio);
+ u8 *buf;
- kref_put(&sf_handle->refcount, vboxsf_handle_release);
+ if (nwrite > size - off)
+ nwrite = size - off;
- if (err == 0) {
- /* mtime changed */
- sf_i->force_restat = 1;
- } else {
- ClearPageUptodate(page);
+ buf = kmap_local_folio(folio, 0);
+ error = vboxsf_write(sf_handle->root, sf_handle->handle,
+ off, &nwrite, buf);
+ kunmap_local(buf);
+
+ folio_unlock(folio);
}
- unlock_page(page);
- return err;
+ kref_put(&sf_handle->refcount, vboxsf_handle_release);
+
+ /* mtime changed */
+ if (error == 0)
+ sf_i->force_restat = 1;
+ return error;
}
-static int vboxsf_write_end(struct file *file, struct address_space *mapping,
+static int vboxsf_write_end(const struct kiocb *iocb,
+ struct address_space *mapping,
loff_t pos, unsigned int len, unsigned int copied,
struct folio *folio, void *fsdata)
{
struct inode *inode = mapping->host;
- struct vboxsf_handle *sf_handle = file->private_data;
+ struct vboxsf_handle *sf_handle = iocb->ki_filp->private_data;
size_t from = offset_in_folio(folio, pos);
u32 nwritten = len;
u8 *buf;
@@ -347,10 +350,11 @@ out:
*/
const struct address_space_operations vboxsf_reg_aops = {
.read_folio = vboxsf_read_folio,
- .writepage = vboxsf_writepage,
+ .writepages = vboxsf_writepages,
.dirty_folio = filemap_dirty_folio,
.write_begin = simple_write_begin,
.write_end = vboxsf_write_end,
+ .migrate_folio = filemap_migrate_folio,
};
static const char *vboxsf_get_link(struct dentry *dentry, struct inode *inode,
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index e95b8a48d8a0..241647b060ee 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -21,7 +21,7 @@
#define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */
-static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375";
+static const unsigned char VBSF_MOUNT_SIGNATURE[4] __nonstring = "\000\377\376\375";
static int follow_symlinks;
module_param(follow_symlinks, int, 0444);
@@ -189,7 +189,7 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_blocksize = 1024;
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &vboxsf_super_ops;
- sb->s_d_op = &vboxsf_dentry_ops;
+ set_default_d_op(sb, &vboxsf_dentry_ops);
iroot = iget_locked(sb, 0);
if (!iroot) {
diff --git a/fs/verity/Kconfig b/fs/verity/Kconfig
index e1036e535352..76d1c5971b82 100644
--- a/fs/verity/Kconfig
+++ b/fs/verity/Kconfig
@@ -2,15 +2,9 @@
config FS_VERITY
bool "FS Verity (read-only file-based authenticity protection)"
- select CRYPTO
select CRYPTO_HASH_INFO
- # SHA-256 is implied as it's intended to be the default hash algorithm.
- # To avoid bloat, other wanted algorithms must be selected explicitly.
- # Note that CRYPTO_SHA256 denotes the generic C implementation, but
- # some architectures provided optimized implementations of the same
- # algorithm that may be used instead. In this case, CRYPTO_SHA256 may
- # be omitted even if SHA-256 is being used.
- imply CRYPTO_SHA256
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_SHA512
help
This option enables fs-verity. fs-verity is the dm-verity
mechanism implemented at the file level. On supported
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index c284f46d1b53..503268cf4296 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -7,7 +7,7 @@
#include "fsverity_private.h"
-#include <crypto/hash.h>
+#include <linux/export.h>
#include <linux/mount.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
@@ -24,7 +24,6 @@ static int hash_one_block(struct inode *inode,
struct block_buffer *cur)
{
struct block_buffer *next = cur + 1;
- int err;
/*
* Safety check to prevent a buffer overflow in case of a filesystem bug
@@ -37,10 +36,8 @@ static int hash_one_block(struct inode *inode,
/* Zero-pad the block if it's shorter than the block size. */
memset(&cur->data[cur->filled], 0, params->block_size - cur->filled);
- err = fsverity_hash_block(params, inode, cur->data,
- &next->data[next->filled]);
- if (err)
- return err;
+ fsverity_hash_block(params, inode, cur->data,
+ &next->data[next->filled]);
next->filled += params->digest_size;
cur->filled = 0;
return 0;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index b3506f56e180..5fe854a5b9ad 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -20,7 +20,6 @@
/* A hash algorithm supported by fs-verity */
struct fsverity_hash_alg {
- struct crypto_shash *tfm; /* hash tfm, allocated on demand */
const char *name; /* crypto API name, e.g. sha256 */
unsigned int digest_size; /* digest size in bytes, e.g. 32 for SHA-256 */
unsigned int block_size; /* block size in bytes, e.g. 64 for SHA-256 */
@@ -31,10 +30,16 @@ struct fsverity_hash_alg {
enum hash_algo algo_id;
};
+union fsverity_hash_ctx {
+ struct sha256_ctx sha256;
+ struct sha512_ctx sha512;
+};
+
/* Merkle tree parameters: hash algorithm, initial hash state, and topology */
struct merkle_tree_params {
const struct fsverity_hash_alg *hash_alg; /* the hash algorithm */
- const u8 *hashstate; /* initial hash state or NULL */
+ /* initial hash state if salted, NULL if unsalted */
+ const union fsverity_hash_ctx *hashstate;
unsigned int digest_size; /* same as hash_alg->digest_size */
unsigned int block_size; /* size of data and tree blocks */
unsigned int hashes_per_block; /* number of hashes per tree block */
@@ -76,16 +81,17 @@ struct fsverity_info {
/* hash_algs.c */
-extern struct fsverity_hash_alg fsverity_hash_algs[];
+extern const struct fsverity_hash_alg fsverity_hash_algs[];
const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
unsigned int num);
-const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
- const u8 *salt, size_t salt_size);
-int fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out);
-int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
- const void *data, size_t size, u8 *out);
+union fsverity_hash_ctx *
+fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+ const u8 *salt, size_t salt_size);
+void fsverity_hash_block(const struct merkle_tree_params *params,
+ const struct inode *inode, const void *data, u8 *out);
+void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+ const void *data, size_t size, u8 *out);
void __init fsverity_check_hash_algs(void);
/* init.c */
diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c
index 6b08b1d9a7d7..9bb3c6344907 100644
--- a/fs/verity/hash_algs.c
+++ b/fs/verity/hash_algs.c
@@ -7,10 +7,8 @@
#include "fsverity_private.h"
-#include <crypto/hash.h>
-
/* The hash algorithms supported by fs-verity */
-struct fsverity_hash_alg fsverity_hash_algs[] = {
+const struct fsverity_hash_alg fsverity_hash_algs[] = {
[FS_VERITY_HASH_ALG_SHA256] = {
.name = "sha256",
.digest_size = SHA256_DIGEST_SIZE,
@@ -25,106 +23,42 @@ struct fsverity_hash_alg fsverity_hash_algs[] = {
},
};
-static DEFINE_MUTEX(fsverity_hash_alg_init_mutex);
-
/**
- * fsverity_get_hash_alg() - validate and prepare a hash algorithm
+ * fsverity_get_hash_alg() - get a hash algorithm by number
* @inode: optional inode for logging purposes
* @num: the hash algorithm number
*
- * Get the struct fsverity_hash_alg for the given hash algorithm number, and
- * ensure it has a hash transform ready to go. The hash transforms are
- * allocated on-demand so that we don't waste resources unnecessarily, and
- * because the crypto modules may be initialized later than fs/verity/.
+ * Get the struct fsverity_hash_alg for the given hash algorithm number.
*
- * Return: pointer to the hash alg on success, else an ERR_PTR()
+ * Return: pointer to the hash alg if it's known, otherwise NULL.
*/
const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode,
unsigned int num)
{
- struct fsverity_hash_alg *alg;
- struct crypto_shash *tfm;
- int err;
-
if (num >= ARRAY_SIZE(fsverity_hash_algs) ||
!fsverity_hash_algs[num].name) {
fsverity_warn(inode, "Unknown hash algorithm number: %u", num);
- return ERR_PTR(-EINVAL);
- }
- alg = &fsverity_hash_algs[num];
-
- /* pairs with smp_store_release() below */
- if (likely(smp_load_acquire(&alg->tfm) != NULL))
- return alg;
-
- mutex_lock(&fsverity_hash_alg_init_mutex);
-
- if (alg->tfm != NULL)
- goto out_unlock;
-
- tfm = crypto_alloc_shash(alg->name, 0, 0);
- if (IS_ERR(tfm)) {
- if (PTR_ERR(tfm) == -ENOENT) {
- fsverity_warn(inode,
- "Missing crypto API support for hash algorithm \"%s\"",
- alg->name);
- alg = ERR_PTR(-ENOPKG);
- goto out_unlock;
- }
- fsverity_err(inode,
- "Error allocating hash algorithm \"%s\": %ld",
- alg->name, PTR_ERR(tfm));
- alg = ERR_CAST(tfm);
- goto out_unlock;
+ return NULL;
}
-
- err = -EINVAL;
- if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm)))
- goto err_free_tfm;
- if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm)))
- goto err_free_tfm;
-
- pr_info("%s using implementation \"%s\"\n",
- alg->name, crypto_shash_driver_name(tfm));
-
- /* pairs with smp_load_acquire() above */
- smp_store_release(&alg->tfm, tfm);
- goto out_unlock;
-
-err_free_tfm:
- crypto_free_shash(tfm);
- alg = ERR_PTR(err);
-out_unlock:
- mutex_unlock(&fsverity_hash_alg_init_mutex);
- return alg;
+ return &fsverity_hash_algs[num];
}
/**
* fsverity_prepare_hash_state() - precompute the initial hash state
* @alg: hash algorithm
* @salt: a salt which is to be prepended to all data to be hashed
- * @salt_size: salt size in bytes, possibly 0
+ * @salt_size: salt size in bytes
*
- * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed
- * initial hash state on success or an ERR_PTR() on failure.
+ * Return: the kmalloc()'ed initial hash state, or NULL if out of memory.
*/
-const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
- const u8 *salt, size_t salt_size)
+union fsverity_hash_ctx *
+fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
+ const u8 *salt, size_t salt_size)
{
- u8 *hashstate = NULL;
- SHASH_DESC_ON_STACK(desc, alg->tfm);
u8 *padded_salt = NULL;
size_t padded_salt_size;
- int err;
-
- desc->tfm = alg->tfm;
-
- if (salt_size == 0)
- return NULL;
-
- hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL);
- if (!hashstate)
- return ERR_PTR(-ENOMEM);
+ union fsverity_hash_ctx ctx;
+ void *res = NULL;
/*
* Zero-pad the salt to the next multiple of the input size of the hash
@@ -135,30 +69,26 @@ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg,
*/
padded_salt_size = round_up(salt_size, alg->block_size);
padded_salt = kzalloc(padded_salt_size, GFP_KERNEL);
- if (!padded_salt) {
- err = -ENOMEM;
- goto err_free;
- }
+ if (!padded_salt)
+ return NULL;
memcpy(padded_salt, salt, salt_size);
- err = crypto_shash_init(desc);
- if (err)
- goto err_free;
-
- err = crypto_shash_update(desc, padded_salt, padded_salt_size);
- if (err)
- goto err_free;
-
- err = crypto_shash_export(desc, hashstate);
- if (err)
- goto err_free;
-out:
- kfree(padded_salt);
- return hashstate;
-err_free:
- kfree(hashstate);
- hashstate = ERR_PTR(err);
- goto out;
+ switch (alg->algo_id) {
+ case HASH_ALGO_SHA256:
+ sha256_init(&ctx.sha256);
+ sha256_update(&ctx.sha256, padded_salt, padded_salt_size);
+ res = kmemdup(&ctx.sha256, sizeof(ctx.sha256), GFP_KERNEL);
+ break;
+ case HASH_ALGO_SHA512:
+ sha512_init(&ctx.sha512);
+ sha512_update(&ctx.sha512, padded_salt, padded_salt_size);
+ res = kmemdup(&ctx.sha512, sizeof(ctx.sha512), GFP_KERNEL);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ kfree(padded_salt);
+ return res;
}
/**
@@ -170,31 +100,32 @@ err_free:
*
* Hash a single data or hash block. The hash is salted if a salt is specified
* in the Merkle tree parameters.
- *
- * Return: 0 on success, -errno on failure
*/
-int fsverity_hash_block(const struct merkle_tree_params *params,
- const struct inode *inode, const void *data, u8 *out)
+void fsverity_hash_block(const struct merkle_tree_params *params,
+ const struct inode *inode, const void *data, u8 *out)
{
- SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm);
- int err;
-
- desc->tfm = params->hash_alg->tfm;
-
- if (params->hashstate) {
- err = crypto_shash_import(desc, params->hashstate);
- if (err) {
- fsverity_err(inode,
- "Error %d importing hash state", err);
- return err;
- }
- err = crypto_shash_finup(desc, data, params->block_size, out);
- } else {
- err = crypto_shash_digest(desc, data, params->block_size, out);
+ union fsverity_hash_ctx ctx;
+
+ if (!params->hashstate) {
+ fsverity_hash_buffer(params->hash_alg, data, params->block_size,
+ out);
+ return;
+ }
+
+ switch (params->hash_alg->algo_id) {
+ case HASH_ALGO_SHA256:
+ ctx.sha256 = params->hashstate->sha256;
+ sha256_update(&ctx.sha256, data, params->block_size);
+ sha256_final(&ctx.sha256, out);
+ return;
+ case HASH_ALGO_SHA512:
+ ctx.sha512 = params->hashstate->sha512;
+ sha512_update(&ctx.sha512, data, params->block_size);
+ sha512_final(&ctx.sha512, out);
+ return;
+ default:
+ BUG();
}
- if (err)
- fsverity_err(inode, "Error %d computing block hash", err);
- return err;
}
/**
@@ -203,13 +134,20 @@ int fsverity_hash_block(const struct merkle_tree_params *params,
* @data: the data to hash
* @size: size of data to hash, in bytes
* @out: output digest, size 'alg->digest_size' bytes
- *
- * Return: 0 on success, -errno on failure
*/
-int fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
- const void *data, size_t size, u8 *out)
+void fsverity_hash_buffer(const struct fsverity_hash_alg *alg,
+ const void *data, size_t size, u8 *out)
{
- return crypto_shash_tfm_digest(alg->tfm, data, size, out);
+ switch (alg->algo_id) {
+ case HASH_ALGO_SHA256:
+ sha256(data, size, out);
+ return;
+ case HASH_ALGO_SHA512:
+ sha512(data, size, out);
+ return;
+ default:
+ BUG();
+ }
}
void __init fsverity_check_hash_algs(void)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 175d2f1bc089..388734132f01 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -9,6 +9,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
+#include <linux/export.h>
#include <linux/uaccess.h>
/**
diff --git a/fs/verity/open.c b/fs/verity/open.c
index fdeb95eca3af..c561e130cd0c 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -7,6 +7,7 @@
#include "fsverity_private.h"
+#include <linux/export.h>
#include <linux/mm.h>
#include <linux/slab.h>
@@ -42,18 +43,18 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
memset(params, 0, sizeof(*params));
hash_alg = fsverity_get_hash_alg(inode, hash_algorithm);
- if (IS_ERR(hash_alg))
- return PTR_ERR(hash_alg);
+ if (!hash_alg)
+ return -EINVAL;
params->hash_alg = hash_alg;
params->digest_size = hash_alg->digest_size;
- params->hashstate = fsverity_prepare_hash_state(hash_alg, salt,
- salt_size);
- if (IS_ERR(params->hashstate)) {
- err = PTR_ERR(params->hashstate);
- params->hashstate = NULL;
- fsverity_err(inode, "Error %d preparing hash state", err);
- goto out_err;
+ if (salt_size) {
+ params->hashstate =
+ fsverity_prepare_hash_state(hash_alg, salt, salt_size);
+ if (!params->hashstate) {
+ err = -ENOMEM;
+ goto out_err;
+ }
}
/*
@@ -158,18 +159,15 @@ out_err:
* Compute the file digest by hashing the fsverity_descriptor excluding the
* builtin signature and with the sig_size field set to 0.
*/
-static int compute_file_digest(const struct fsverity_hash_alg *hash_alg,
- struct fsverity_descriptor *desc,
- u8 *file_digest)
+static void compute_file_digest(const struct fsverity_hash_alg *hash_alg,
+ struct fsverity_descriptor *desc,
+ u8 *file_digest)
{
__le32 sig_size = desc->sig_size;
- int err;
desc->sig_size = 0;
- err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
+ fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest);
desc->sig_size = sig_size;
-
- return err;
}
/*
@@ -201,12 +199,7 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size);
- err = compute_file_digest(vi->tree_params.hash_alg, desc,
- vi->file_digest);
- if (err) {
- fsverity_err(inode, "Error %d computing file digest", err);
- goto fail;
- }
+ compute_file_digest(vi->tree_params.hash_alg, desc, vi->file_digest);
err = fsverity_verify_signature(vi, desc->signature,
le32_to_cpu(desc->sig_size));
diff --git a/fs/verity/read_metadata.c b/fs/verity/read_metadata.c
index f58432772d9e..cba5d6af4e04 100644
--- a/fs/verity/read_metadata.c
+++ b/fs/verity/read_metadata.c
@@ -8,6 +8,7 @@
#include "fsverity_private.h"
#include <linux/backing-dev.h>
+#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 4fcad0825a12..a1f00c3fd3b2 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -7,8 +7,8 @@
#include "fsverity_private.h"
-#include <crypto/hash.h>
#include <linux/bio.h>
+#include <linux/export.h>
static struct workqueue_struct *fsverity_read_workqueue;
@@ -202,8 +202,7 @@ descend:
unsigned long hblock_idx = hblocks[level - 1].index;
unsigned int hoffset = hblocks[level - 1].hoffset;
- if (fsverity_hash_block(params, inode, haddr, real_hash) != 0)
- goto error;
+ fsverity_hash_block(params, inode, haddr, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
/*
@@ -222,8 +221,7 @@ descend:
}
/* Finally, verify the data block. */
- if (fsverity_hash_block(params, inode, data, real_hash) != 0)
- goto error;
+ fsverity_hash_block(params, inode, data, real_hash);
if (memcmp(want_hash, real_hash, hsize) != 0)
goto corrupted;
return true;
diff --git a/fs/xattr.c b/fs/xattr.c
index 02bee149ad96..8851a5ef34f5 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -215,7 +215,7 @@ EXPORT_SYMBOL(__vfs_setxattr);
*
* returns the result of the internal setxattr or setsecurity operations.
*
- * This function requires the caller to lock the inode's i_mutex before it
+ * This function requires the caller to lock the inode's i_rwsem before it
* is executed. It also assumes that the caller will make the appropriate
* permission checks.
*/
@@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname,
return error;
filename = getname_maybe_null(pathname, at_flags);
- if (!filename) {
+ if (!filename && dfd >= 0) {
CLASS(fd, f)(dfd);
if (fd_empty(f))
error = -EBADF;
@@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname,
return error;
filename = getname_maybe_null(pathname, at_flags);
- if (!filename) {
+ if (!filename && dfd >= 0) {
CLASS(fd, f)(dfd);
if (fd_empty(f))
return -EBADF;
@@ -1428,6 +1428,15 @@ static bool xattr_is_trusted(const char *name)
return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
+static bool xattr_is_maclabel(const char *name)
+{
+ const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+
+ return !strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) &&
+ security_ismaclabel(suffix);
+}
+
/**
* simple_xattr_list - list all xattr objects
* @inode: inode from which to get the xattrs
@@ -1460,6 +1469,18 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
if (err)
return err;
+ err = security_inode_listsecurity(inode, buffer, remaining_size);
+ if (err < 0)
+ return err;
+
+ if (buffer) {
+ if (remaining_size < err)
+ return -ERANGE;
+ buffer += err;
+ }
+ remaining_size -= err;
+ err = 0;
+
read_lock(&xattrs->lock);
for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) {
xattr = rb_entry(rbp, struct simple_xattr, rb_node);
@@ -1468,6 +1489,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
if (!trusted && xattr_is_trusted(xattr->name))
continue;
+ /* skip MAC labels; these are provided by LSM above */
+ if (xattr_is_maclabel(xattr->name))
+ continue;
+
err = xattr_list_one(&buffer, &remaining_size, xattr->name);
if (err)
break;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index fffd6fffdce0..ae0ca6858496 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -3,7 +3,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
- select LIBCRC32C
+ select CRC32
select FS_IOMAP
help
XFS is a high performance journaling filesystem which originated
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7afa51e41427..5bf501cf8271 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \
xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
xfs_rtbitmap.o \
xfs_rtgroup.o \
+ xfs_zones.o \
)
# highlevel code
@@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_quotaops.o
# xfs_rtbitmap is shared with libxfs
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_zone_alloc.o \
+ xfs_zone_gc.o \
+ xfs_zone_info.o \
+ xfs_zone_space_resv.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index b59cb461e096..e6ba914f6d06 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -301,7 +301,7 @@ xfs_get_aghdr_buf(
struct xfs_buf *bp;
int error;
- error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp);
+ error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3d33e17f2e5c..000cc7f4a3ce 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -33,8 +33,6 @@ struct kmem_cache *xfs_extfree_item_cache;
struct workqueue_struct *xfs_alloc_wq;
-#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
-
#define XFSA_FIXUP_BNO_OK 1
#define XFSA_FIXUP_CNT_OK 2
@@ -410,8 +408,8 @@ xfs_alloc_compute_diff(
if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
if (newlen1 < newlen2 ||
(newlen1 == newlen2 &&
- XFS_ABSDIFF(newbno1, wantbno) >
- XFS_ABSDIFF(newbno2, wantbno)))
+ abs_diff(newbno1, wantbno) >
+ abs_diff(newbno2, wantbno)))
newbno1 = newbno2;
} else if (newbno2 != NULLAGBLOCK)
newbno1 = newbno2;
@@ -427,7 +425,7 @@ xfs_alloc_compute_diff(
} else
newbno1 = freeend - wantlen;
*newbnop = newbno1;
- return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
+ return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno);
}
/*
@@ -3446,16 +3444,41 @@ xfs_alloc_read_agf(
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
}
+
#ifdef DEBUG
- else if (!xfs_is_shutdown(mp)) {
- ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
- ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
- ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
- ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
- ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
- ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
+ /*
+ * It's possible for the AGF to be out of sync if the block device is
+ * silently dropping writes. This can happen in fstests with dmflakey
+ * enabled, which allows the buffer to be cleaned and reclaimed by
+ * memory pressure and then re-read from disk here. We will get a
+ * stale version of the AGF from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
+ */
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+ ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks);
+ ok &= pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks);
+ ok &= pag->pagf_flcount == be32_to_cpu(agf->agf_flcount);
+ ok &= pag->pagf_longest == be32_to_cpu(agf->agf_longest);
+ ok &= pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level);
+ ok &= pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
+ xfs_trans_brelse(tp, agfbp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
}
-#endif
+#endif /* DEBUG */
+
if (agfbpp)
*agfbpp = agfbp;
else
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a4ac37ba5d51..fa1f03c1331e 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -186,35 +186,32 @@ xfs_allocbt_init_ptr_from_cur(
ptr->s = agf->agf_cnt_root;
}
-STATIC int64_t
-xfs_bnobt_key_diff(
+STATIC int
+xfs_bnobt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
const struct xfs_alloc_rec *kp = &key->alloc;
- return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return cmp_int(be32_to_cpu(kp->ar_startblock),
+ rec->ar_startblock);
}
-STATIC int64_t
-xfs_cntbt_key_diff(
+STATIC int
+xfs_cntbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a;
const struct xfs_alloc_rec *kp = &key->alloc;
- int64_t diff;
- diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
- if (diff)
- return diff;
-
- return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+ return cmp_int(be32_to_cpu(kp->ar_blockcount), rec->ar_blockcount) ?:
+ cmp_int(be32_to_cpu(kp->ar_startblock), rec->ar_startblock);
}
-STATIC int64_t
-xfs_bnobt_diff_two_keys(
+STATIC int
+xfs_bnobt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -222,29 +219,24 @@ xfs_bnobt_diff_two_keys(
{
ASSERT(!mask || mask->alloc.ar_startblock);
- return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return cmp_int(be32_to_cpu(k1->alloc.ar_startblock),
+ be32_to_cpu(k2->alloc.ar_startblock));
}
-STATIC int64_t
-xfs_cntbt_diff_two_keys(
+STATIC int
+xfs_cntbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
const union xfs_btree_key *mask)
{
- int64_t diff;
-
ASSERT(!mask || (mask->alloc.ar_blockcount &&
mask->alloc.ar_startblock));
- diff = be32_to_cpu(k1->alloc.ar_blockcount) -
- be32_to_cpu(k2->alloc.ar_blockcount);
- if (diff)
- return diff;
-
- return be32_to_cpu(k1->alloc.ar_startblock) -
- be32_to_cpu(k2->alloc.ar_startblock);
+ return cmp_int(be32_to_cpu(k1->alloc.ar_blockcount),
+ be32_to_cpu(k2->alloc.ar_blockcount)) ?:
+ cmp_int(be32_to_cpu(k1->alloc.ar_startblock),
+ be32_to_cpu(k2->alloc.ar_startblock));
}
static xfs_failaddr_t
@@ -438,9 +430,9 @@ const struct xfs_btree_ops xfs_bnobt_ops = {
.init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_bnobt_key_diff,
+ .cmp_key_with_cur = xfs_bnobt_cmp_key_with_cur,
.buf_ops = &xfs_bnobt_buf_ops,
- .diff_two_keys = xfs_bnobt_diff_two_keys,
+ .cmp_two_keys = xfs_bnobt_cmp_two_keys,
.keys_inorder = xfs_bnobt_keys_inorder,
.recs_inorder = xfs_bnobt_recs_inorder,
.keys_contiguous = xfs_allocbt_keys_contiguous,
@@ -468,9 +460,9 @@ const struct xfs_btree_ops xfs_cntbt_ops = {
.init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
- .key_diff = xfs_cntbt_key_diff,
+ .cmp_key_with_cur = xfs_cntbt_cmp_key_with_cur,
.buf_ops = &xfs_cntbt_buf_ops,
- .diff_two_keys = xfs_cntbt_diff_two_keys,
+ .cmp_two_keys = xfs_cntbt_cmp_two_keys,
.keys_inorder = xfs_cntbt_keys_inorder,
.recs_inorder = xfs_cntbt_recs_inorder,
.keys_contiguous = NULL, /* not needed right now */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 40ad22fb808b..d954f9b8071f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -34,13 +34,13 @@
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
-#include "xfs_icache.h"
#include "xfs_iomap.h"
#include "xfs_health.h"
#include "xfs_bmap_item.h"
#include "xfs_symlink_remote.h"
#include "xfs_inode_util.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_bmap_intent_cache;
@@ -171,18 +171,16 @@ xfs_bmbt_update(
* Compute the worst-case number of indirect blocks that will be used
* for ip's delayed extent of length "len".
*/
-STATIC xfs_filblks_t
+xfs_filblks_t
xfs_bmap_worst_indlen(
- xfs_inode_t *ip, /* incore inode pointer */
- xfs_filblks_t len) /* delayed extent length */
+ struct xfs_inode *ip, /* incore inode pointer */
+ xfs_filblks_t len) /* delayed extent length */
{
- int level; /* btree level number */
- int maxrecs; /* maximum record count at this level */
- xfs_mount_t *mp; /* mount structure */
- xfs_filblks_t rval; /* return value */
+ struct xfs_mount *mp = ip->i_mount;
+ int maxrecs = mp->m_bmap_dmxr[0];
+ int level;
+ xfs_filblks_t rval;
- mp = ip->i_mount;
- maxrecs = mp->m_bmap_dmxr[0];
for (level = 0, rval = 0;
level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK);
level++) {
@@ -2572,146 +2570,6 @@ done:
}
/*
- * Convert a hole to a delayed allocation.
- */
-STATIC void
-xfs_bmap_add_extent_hole_delay(
- xfs_inode_t *ip, /* incore inode pointer */
- int whichfork,
- struct xfs_iext_cursor *icur,
- xfs_bmbt_irec_t *new) /* new data to add to file extents */
-{
- struct xfs_ifork *ifp; /* inode fork pointer */
- xfs_bmbt_irec_t left; /* left neighbor extent entry */
- xfs_filblks_t newlen=0; /* new indirect size */
- xfs_filblks_t oldlen=0; /* old indirect size */
- xfs_bmbt_irec_t right; /* right neighbor extent entry */
- uint32_t state = xfs_bmap_fork_to_state(whichfork);
- xfs_filblks_t temp; /* temp for indirect calculations */
-
- ifp = xfs_ifork_ptr(ip, whichfork);
- ASSERT(isnullstartblock(new->br_startblock));
-
- /*
- * Check and set flags if this segment has a left neighbor
- */
- if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
- state |= BMAP_LEFT_VALID;
- if (isnullstartblock(left.br_startblock))
- state |= BMAP_LEFT_DELAY;
- }
-
- /*
- * Check and set flags if the current (right) segment exists.
- * If it doesn't exist, we're converting the hole at end-of-file.
- */
- if (xfs_iext_get_extent(ifp, icur, &right)) {
- state |= BMAP_RIGHT_VALID;
- if (isnullstartblock(right.br_startblock))
- state |= BMAP_RIGHT_DELAY;
- }
-
- /*
- * Set contiguity flags on the left and right neighbors.
- * Don't let extents get too large, even if the pieces are contiguous.
- */
- if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
- left.br_startoff + left.br_blockcount == new->br_startoff &&
- left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
- state |= BMAP_LEFT_CONTIG;
-
- if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
- new->br_startoff + new->br_blockcount == right.br_startoff &&
- new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
- (!(state & BMAP_LEFT_CONTIG) ||
- (left.br_blockcount + new->br_blockcount +
- right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
- state |= BMAP_RIGHT_CONTIG;
-
- /*
- * Switch out based on the contiguity flags.
- */
- switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
- case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with delayed allocations
- * on the left and on the right.
- * Merge all three into a single extent record.
- */
- temp = left.br_blockcount + new->br_blockcount +
- right.br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_startblock = nullstartblock(newlen);
- left.br_blockcount = temp;
-
- xfs_iext_remove(ip, icur, state);
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_LEFT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the left.
- * Merge the new allocation with the left neighbor.
- */
- temp = left.br_blockcount + new->br_blockcount;
-
- oldlen = startblockval(left.br_startblock) +
- startblockval(new->br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- left.br_blockcount = temp;
- left.br_startblock = nullstartblock(newlen);
-
- xfs_iext_prev(ifp, icur);
- xfs_iext_update_extent(ip, state, icur, &left);
- break;
-
- case BMAP_RIGHT_CONTIG:
- /*
- * New allocation is contiguous with a delayed allocation
- * on the right.
- * Merge the new allocation with the right neighbor.
- */
- temp = new->br_blockcount + right.br_blockcount;
- oldlen = startblockval(new->br_startblock) +
- startblockval(right.br_startblock);
- newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
- oldlen);
- right.br_startoff = new->br_startoff;
- right.br_startblock = nullstartblock(newlen);
- right.br_blockcount = temp;
- xfs_iext_update_extent(ip, state, icur, &right);
- break;
-
- case 0:
- /*
- * New allocation is not contiguous with another
- * delayed allocation.
- * Insert a new entry.
- */
- oldlen = newlen = 0;
- xfs_iext_insert(ip, icur, new, state);
- break;
- }
- if (oldlen != newlen) {
- ASSERT(oldlen > newlen);
- xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
-
- /*
- * Nothing to do for disk quota accounting here.
- */
- xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
- }
-}
-
-/*
* Convert a hole to a real allocation.
*/
STATIC int /* error */
@@ -3454,6 +3312,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
+
+ /* Try to align start block to any minimum allocation alignment */
+ if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
+ args->alignment = align;
+
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,
@@ -3563,12 +3426,12 @@ xfs_bmap_btalloc_at_eof(
int error;
/*
- * If there are already extents in the file, try an exact EOF block
- * allocation to extend the file as a contiguous extent. If that fails,
- * or it's the first allocation in a file, just try for a stripe aligned
- * allocation.
+ * If there are already extents in the file, and xfs_bmap_adjacent() has
+ * given a better blkno, try an exact EOF block allocation to extend the
+ * file as a contiguous extent. If that fails, or it's the first
+ * allocation in a file, just try for a stripe aligned allocation.
*/
- if (ap->offset) {
+ if (ap->eof) {
xfs_extlen_t nextminlen = 0;
/*
@@ -3736,7 +3599,8 @@ xfs_bmap_btalloc_best_length(
int error;
ap->blkno = XFS_INO_TO_FSB(args->mp, ap->ip->i_ino);
- xfs_bmap_adjacent(ap);
+ if (!xfs_bmap_adjacent(ap))
+ ap->eof = false;
/*
* Search for an allocation group with a single extent large enough for
@@ -4038,144 +3902,6 @@ xfs_bmapi_read(
return 0;
}
-/*
- * Add a delayed allocation extent to an inode. Blocks are reserved from the
- * global pool and the extent inserted into the inode in-core extent tree.
- *
- * On entry, got refers to the first extent beyond the offset of the extent to
- * allocate or eof is specified if no such extent exists. On return, got refers
- * to the extent record that was inserted to the inode fork.
- *
- * Note that the allocated extent may have been merged with contiguous extents
- * during insertion into the inode fork. Thus, got does not reflect the current
- * state of the inode fork on return. If necessary, the caller can use lastx to
- * look up the updated record in the inode fork.
- */
-int
-xfs_bmapi_reserve_delalloc(
- struct xfs_inode *ip,
- int whichfork,
- xfs_fileoff_t off,
- xfs_filblks_t len,
- xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got,
- struct xfs_iext_cursor *icur,
- int eof)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
- xfs_extlen_t alen;
- xfs_extlen_t indlen;
- uint64_t fdblocks;
- int error;
- xfs_fileoff_t aoff;
- bool use_cowextszhint =
- whichfork == XFS_COW_FORK && !prealloc;
-
-retry:
- /*
- * Cap the alloc length. Keep track of prealloc so we know whether to
- * tag the inode before we return.
- */
- aoff = off;
- alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
- if (!eof)
- alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
- if (prealloc && alen >= len)
- prealloc = alen - len;
-
- /*
- * If we're targetting the COW fork but aren't creating a speculative
- * posteof preallocation, try to expand the reservation to align with
- * the COW extent size hint if there's sufficient free space.
- *
- * Unlike the data fork, the CoW cancellation functions will free all
- * the reservations at inactivation, so we don't require that every
- * delalloc reservation have a dirty pagecache.
- */
- if (use_cowextszhint) {
- struct xfs_bmbt_irec prev;
- xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
-
- if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
- prev.br_startoff = NULLFILEOFF;
-
- error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
- 1, 0, &aoff, &alen);
- ASSERT(!error);
- }
-
- /*
- * Make a transaction-less quota reservation for delayed allocation
- * blocks. This number gets adjusted later. We return if we haven't
- * allocated blocks already inside this loop.
- */
- error = xfs_quota_reserve_blkres(ip, alen);
- if (error)
- goto out;
-
- /*
- * Split changing sb for alen and indlen since they could be coming
- * from different places.
- */
- indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
- ASSERT(indlen > 0);
-
- fdblocks = indlen;
- if (XFS_IS_REALTIME_INODE(ip)) {
- error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
- if (error)
- goto out_unreserve_quota;
- } else {
- fdblocks += alen;
- }
-
- error = xfs_dec_fdblocks(mp, fdblocks, false);
- if (error)
- goto out_unreserve_frextents;
-
- ip->i_delayed_blks += alen;
- xfs_mod_delalloc(ip, alen, indlen);
-
- got->br_startoff = aoff;
- got->br_startblock = nullstartblock(indlen);
- got->br_blockcount = alen;
- got->br_state = XFS_EXT_NORM;
-
- xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
-
- /*
- * Tag the inode if blocks were preallocated. Note that COW fork
- * preallocation can occur at the start or end of the extent, even when
- * prealloc == 0, so we must also check the aligned offset and length.
- */
- if (whichfork == XFS_DATA_FORK && prealloc)
- xfs_inode_set_eofblocks_tag(ip);
- if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
- xfs_inode_set_cowblocks_tag(ip);
-
- return 0;
-
-out_unreserve_frextents:
- if (XFS_IS_REALTIME_INODE(ip))
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
-out_unreserve_quota:
- if (XFS_IS_QUOTA_ON(mp))
- xfs_quota_unreserve_blkres(ip, alen);
-out:
- if (error == -ENOSPC || error == -EDQUOT) {
- trace_xfs_delalloc_enospc(ip, off, len);
-
- if (prealloc || use_cowextszhint) {
- /* retry without any preallocation */
- use_cowextszhint = false;
- prealloc = 0;
- goto retry;
- }
- }
- return error;
-}
-
static int
xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
@@ -4947,7 +4673,8 @@ xfs_bmap_del_extent_delay(
int whichfork,
struct xfs_iext_cursor *icur,
struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del)
+ struct xfs_bmbt_irec *del,
+ uint32_t bflags) /* bmapi flags */
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -5067,10 +4794,18 @@ xfs_bmap_del_extent_delay(
da_diff = da_old - da_new;
fdblocks = da_diff;
- if (isrt)
- xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount));
- else
+ if (bflags & XFS_BMAPI_REMAP) {
+ ;
+ } else if (isrt) {
+ xfs_rtbxlen_t rtxlen;
+
+ rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount);
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_add_available(mp, rtxlen);
+ xfs_add_frextents(mp, rtxlen);
+ } else {
fdblocks += del->br_blockcount;
+ }
xfs_add_fdblocks(mp, fdblocks);
xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff);
@@ -5669,7 +5404,8 @@ __xfs_bunmapi(
delete:
if (wasdel) {
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, flags);
} else {
error = xfs_bmap_del_extent_real(ip, tp, &icur, cur,
&del, &tmp_logflags, whichfork,
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 4b721d935994..d5f2729305fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN (1u << 11)
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
- { XFS_BMAPI_NORMAP, "NORMAP" }
+ { XFS_BMAPI_NORMAP, "NORMAP" },\
+ { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)
@@ -204,7 +208,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extnum_t nexts, int *done);
void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
- struct xfs_bmbt_irec *del);
+ struct xfs_bmbt_irec *del, uint32_t bflags);
void xfs_bmap_del_extent_cow(struct xfs_inode *ip,
struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got,
struct xfs_bmbt_irec *del);
@@ -219,10 +223,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
bool *done, xfs_fileoff_t stop_fsb);
int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t split_offset);
-int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
- xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
- struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
- int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
@@ -233,6 +233,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
int fork);
int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap,
struct xfs_alloc_arg *args);
+xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len);
enum xfs_bmap_intent_type {
XFS_BMAP_MAP = 1,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 908d7b050e9c..188feac04b60 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -369,38 +369,26 @@ xfs_bmbt_init_rec_from_cur(
xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
}
-STATIC int64_t
-xfs_bmbt_key_diff(
+STATIC int
+xfs_bmbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- return (int64_t)be64_to_cpu(key->bmbt.br_startoff) -
- cur->bc_rec.b.br_startoff;
+ return cmp_int(be64_to_cpu(key->bmbt.br_startoff),
+ cur->bc_rec.b.br_startoff);
}
-STATIC int64_t
-xfs_bmbt_diff_two_keys(
+STATIC int
+xfs_bmbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
const union xfs_btree_key *mask)
{
- uint64_t a = be64_to_cpu(k1->bmbt.br_startoff);
- uint64_t b = be64_to_cpu(k2->bmbt.br_startoff);
-
ASSERT(!mask || mask->bmbt.br_startoff);
- /*
- * Note: This routine previously casted a and b to int64 and subtracted
- * them to generate a result. This lead to problems if b was the
- * "maximum" key value (all ones) being signed incorrectly, hence this
- * somewhat less efficient version.
- */
- if (a > b)
- return 1;
- if (b > a)
- return -1;
- return 0;
+ return cmp_int(be64_to_cpu(k1->bmbt.br_startoff),
+ be64_to_cpu(k2->bmbt.br_startoff));
}
static xfs_failaddr_t
@@ -647,8 +635,8 @@ const struct xfs_btree_ops xfs_bmbt_ops = {
.init_key_from_rec = xfs_bmbt_init_key_from_rec,
.init_high_key_from_rec = xfs_bmbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
- .key_diff = xfs_bmbt_key_diff,
- .diff_two_keys = xfs_bmbt_diff_two_keys,
+ .cmp_key_with_cur = xfs_bmbt_cmp_key_with_cur,
+ .cmp_two_keys = xfs_bmbt_cmp_two_keys,
.buf_ops = &xfs_bmbt_buf_ops,
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 299ce7fd11b0..a61211d253f1 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1985,7 +1985,7 @@ xfs_btree_lookup(
int *stat) /* success/failure */
{
struct xfs_btree_block *block; /* current btree block */
- int64_t diff; /* difference for the current key */
+ int cmp_r; /* current key comparison result */
int error; /* error return value */
int keyno; /* current key number */
int level; /* level in the btree */
@@ -2013,13 +2013,13 @@ xfs_btree_lookup(
* on the lookup record, then follow the corresponding block
* pointer down to the next level.
*/
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ for (level = cur->bc_nlevels - 1, cmp_r = 1; level >= 0; level--) {
/* Get the block we need to do the lookup on. */
error = xfs_btree_lookup_get_block(cur, level, pp, &block);
if (error)
goto error0;
- if (diff == 0) {
+ if (cmp_r == 0) {
/*
* If we already had a key match at a higher level, we
* know we need to use the first entry in this block.
@@ -2065,15 +2065,16 @@ xfs_btree_lookup(
keyno, block, &key);
/*
- * Compute difference to get next direction:
+ * Compute comparison result to get next
+ * direction:
* - less than, move right
* - greater than, move left
* - equal, we're done
*/
- diff = cur->bc_ops->key_diff(cur, kp);
- if (diff < 0)
+ cmp_r = cur->bc_ops->cmp_key_with_cur(cur, kp);
+ if (cmp_r < 0)
low = keyno + 1;
- else if (diff > 0)
+ else if (cmp_r > 0)
high = keyno - 1;
else
break;
@@ -2089,7 +2090,7 @@ xfs_btree_lookup(
* If we moved left, need the previous key number,
* unless there isn't one.
*/
- if (diff > 0 && --keyno < 1)
+ if (cmp_r > 0 && --keyno < 1)
keyno = 1;
pp = xfs_btree_ptr_addr(cur, keyno, block);
@@ -2102,7 +2103,7 @@ xfs_btree_lookup(
}
/* Done with the search. See if we need to adjust the results. */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
+ if (dir != XFS_LOOKUP_LE && cmp_r < 0) {
keyno++;
/*
* If ge search and we went off the end of the block, but it's
@@ -2125,14 +2126,14 @@ xfs_btree_lookup(
*stat = 1;
return 0;
}
- } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ } else if (dir == XFS_LOOKUP_LE && cmp_r > 0)
keyno--;
cur->bc_levels[0].ptr = keyno;
/* Return if we succeeded or not. */
if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
*stat = 0;
- else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ else if (dir != XFS_LOOKUP_EQ || cmp_r == 0)
*stat = 1;
else
*stat = 0;
@@ -5058,7 +5059,7 @@ xfs_btree_simple_query_range(
int error;
ASSERT(cur->bc_ops->init_high_key_from_rec);
- ASSERT(cur->bc_ops->diff_two_keys);
+ ASSERT(cur->bc_ops->cmp_two_keys);
/*
* Find the leftmost record. The btree cursor must be set
@@ -5352,15 +5353,15 @@ xfs_btree_count_blocks(
}
/* Compare two btree pointers. */
-int64_t
-xfs_btree_diff_two_ptrs(
+int
+xfs_btree_cmp_two_ptrs(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *a,
const union xfs_btree_ptr *b)
{
if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
- return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
- return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
+ return cmp_int(be64_to_cpu(a->l), be64_to_cpu(b->l));
+ return cmp_int(be32_to_cpu(a->s), be32_to_cpu(b->s));
}
struct xfs_btree_has_records {
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 355b304696e6..60e78572e725 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -171,20 +171,23 @@ struct xfs_btree_ops {
void (*init_high_key_from_rec)(union xfs_btree_key *key,
const union xfs_btree_rec *rec);
- /* difference between key value and cursor value */
- int64_t (*key_diff)(struct xfs_btree_cur *cur,
- const union xfs_btree_key *key);
+ /*
+ * Compare key value and cursor value -- positive if key > cur,
+ * negative if key < cur, and zero if equal.
+ */
+ int (*cmp_key_with_cur)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key);
/*
- * Difference between key2 and key1 -- positive if key1 > key2,
- * negative if key1 < key2, and zero if equal. If the @mask parameter
- * is non NULL, each key field to be used in the comparison must
- * contain a nonzero value.
+ * Compare key1 and key2 -- positive if key1 > key2, negative if
+ * key1 < key2, and zero if equal. If the @mask parameter is non NULL,
+ * each key field to be used in the comparison must contain a nonzero
+ * value.
*/
- int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
- const union xfs_btree_key *key1,
- const union xfs_btree_key *key2,
- const union xfs_btree_key *mask);
+ int (*cmp_two_keys)(struct xfs_btree_cur *cur,
+ const union xfs_btree_key *key1,
+ const union xfs_btree_key *key2,
+ const union xfs_btree_key *mask);
const struct xfs_buf_ops *buf_ops;
@@ -516,9 +519,9 @@ struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur,
int level, struct xfs_buf **bpp);
bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr);
-int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur,
- const union xfs_btree_ptr *a,
- const union xfs_btree_ptr *b);
+int xfs_btree_cmp_two_ptrs(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *a,
+ const union xfs_btree_ptr *b);
void xfs_btree_get_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block,
union xfs_btree_ptr *ptr, int lr);
@@ -546,7 +549,7 @@ xfs_btree_keycmp_lt(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) < 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) < 0;
}
static inline bool
@@ -555,7 +558,7 @@ xfs_btree_keycmp_gt(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) > 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) > 0;
}
static inline bool
@@ -564,7 +567,7 @@ xfs_btree_keycmp_eq(
const union xfs_btree_key *key1,
const union xfs_btree_key *key2)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, NULL) == 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, NULL) == 0;
}
static inline bool
@@ -602,7 +605,7 @@ xfs_btree_masked_keycmp_lt(
const union xfs_btree_key *key2,
const union xfs_btree_key *mask)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) < 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) < 0;
}
static inline bool
@@ -612,7 +615,7 @@ xfs_btree_masked_keycmp_gt(
const union xfs_btree_key *key2,
const union xfs_btree_key *mask)
{
- return cur->bc_ops->diff_two_keys(cur, key1, key2, mask) > 0;
+ return cur->bc_ops->cmp_two_keys(cur, key1, key2, mask) > 0;
}
static inline bool
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index b1007fb661ba..779dac59b1f3 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -112,7 +112,7 @@ typedef struct xfs_sb {
uint16_t sb_sectsize; /* volume sector size, bytes */
uint16_t sb_inodesize; /* inode size, bytes */
uint16_t sb_inopblock; /* inodes per block */
- char sb_fname[XFSLABEL_MAX]; /* file system name */
+ char sb_fname[XFSLABEL_MAX] __nonstring; /* file system name */
uint8_t sb_blocklog; /* log2 of sb_blocksize */
uint8_t sb_sectlog; /* log2 of sb_sectsize */
uint8_t sb_inodelog; /* log2 of sb_inodesize */
@@ -178,9 +178,10 @@ typedef struct xfs_sb {
xfs_rgnumber_t sb_rgcount; /* number of realtime groups */
xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */
-
uint8_t sb_rgblklog; /* rt group number shift */
uint8_t sb_pad[7]; /* zeroes */
+ xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */
+ xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */
/* must be padded to 64 bit alignment */
} xfs_sb_t;
@@ -270,9 +271,10 @@ struct xfs_dsb {
__be64 sb_metadirino; /* metadata directory tree root */
__be32 sb_rgcount; /* # of realtime groups */
__be32 sb_rgextents; /* size of rtgroup in rtx */
-
__u8 sb_rgblklog; /* rt group number shift */
__u8 sb_pad[7]; /* zeroes */
+ __be64 sb_rtstart; /* start of internal RT section (FSB) */
+ __be64 sb_rtreserved; /* reserved (zoned) RT blocks */
/*
* The size of this structure must be padded to 64 bit alignment.
@@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature(
#define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */
#define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */
#define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */
+#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */
+#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */
+
#define XFS_SB_FEAT_INCOMPAT_ALL \
(XFS_SB_FEAT_INCOMPAT_FTYPE | \
XFS_SB_FEAT_INCOMPAT_SPINODES | \
@@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature(
XFS_SB_FEAT_INCOMPAT_NREXT64 | \
XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \
XFS_SB_FEAT_INCOMPAT_PARENT | \
- XFS_SB_FEAT_INCOMPAT_METADIR)
+ XFS_SB_FEAT_INCOMPAT_METADIR | \
+ XFS_SB_FEAT_INCOMPAT_ZONED | \
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)
#define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL
static inline bool
@@ -952,7 +959,12 @@ struct xfs_dinode {
__be64 di_changecount; /* number of attribute changes */
__be64 di_lsn; /* flush sequence */
__be64 di_flags2; /* more random flags */
- __be32 di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ __be32 di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ __be32 di_used_blocks;
+ };
__u8 di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 2c3171262b44..12463ba766da 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -189,7 +189,9 @@ struct xfs_fsop_geom {
uint32_t checked; /* o: checked fs & rt metadata */
__u32 rgextents; /* rt extents in a realtime group */
__u32 rgcount; /* number of realtime groups */
- __u64 reserved[16]; /* reserved space */
+ __u64 rtstart; /* start of internal rt section */
+ __u64 rtreserved; /* RT (zoned) reserved blocks */
+ __u64 reserved[14]; /* reserved space */
};
#define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */
@@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */
#define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */
#define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */
+#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */
/*
* Minimum and maximum sizes need for growth checks.
@@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry {
#define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range)
/* XFS_IOC_GETFSUUID ---------- deprecated 140 */
+/*
+ * Devices supported by a single XFS file system. Reported in fsmaps fmr_device
+ * when using internal RT devices.
+ */
+enum xfs_device {
+ XFS_DEV_DATA = 1,
+ XFS_DEV_LOG = 2,
+ XFS_DEV_RT = 3,
+};
#ifndef HAVE_BBMACROS
/*
diff --git a/fs/xfs/libxfs/xfs_group.c b/fs/xfs/libxfs/xfs_group.c
index e9d76bcdc820..792f76d2e2a0 100644
--- a/fs/xfs/libxfs/xfs_group.c
+++ b/fs/xfs/libxfs/xfs_group.c
@@ -163,7 +163,8 @@ xfs_group_free(
xfs_defer_drain_free(&xg->xg_intents_drain);
#ifdef __KERNEL__
- kfree(xg->xg_busy_extents);
+ if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type))
+ kfree(xg->xg_busy_extents);
#endif
if (uninit)
@@ -171,7 +172,8 @@ xfs_group_free(
/* drop the mount's active reference */
xfs_group_rele(xg);
- XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) != 0);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) > 0);
+ XFS_IS_CORRUPT(mp, atomic_read(&xg->xg_active_ref) < 0);
kfree_rcu_mightsleep(xg);
}
@@ -189,9 +191,11 @@ xfs_group_insert(
xg->xg_type = type;
#ifdef __KERNEL__
- xg->xg_busy_extents = xfs_extent_busy_alloc();
- if (!xg->xg_busy_extents)
- return -ENOMEM;
+ if (xfs_group_has_extent_busy(mp, type)) {
+ xg->xg_busy_extents = xfs_extent_busy_alloc();
+ if (!xg->xg_busy_extents)
+ return -ENOMEM;
+ }
spin_lock_init(&xg->xg_state_lock);
xfs_hooks_init(&xg->xg_rmap_update_hooks);
#endif
@@ -210,7 +214,8 @@ xfs_group_insert(
out_drain:
xfs_defer_drain_free(&xg->xg_intents_drain);
#ifdef __KERNEL__
- kfree(xg->xg_busy_extents);
+ if (xfs_group_has_extent_busy(xg->xg_mount, xg->xg_type))
+ kfree(xg->xg_busy_extents);
#endif
return error;
}
diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h
index 242b05627c7a..4423932a2313 100644
--- a/fs/xfs/libxfs/xfs_group.h
+++ b/fs/xfs/libxfs/xfs_group.h
@@ -19,10 +19,23 @@ struct xfs_group {
#ifdef __KERNEL__
/* -- kernel only structures below this line -- */
- /*
- * Track freed but not yet committed extents.
- */
- struct xfs_extent_busy_tree *xg_busy_extents;
+ union {
+ /*
+ * For perags and non-zoned RT groups:
+ * Track freed but not yet committed extents.
+ */
+ struct xfs_extent_busy_tree *xg_busy_extents;
+
+ /*
+ * For zoned RT groups:
+ * List of groups that need a zone reset.
+ *
+ * The zonegc code forces a log flush of the rtrmap inode before
+ * resetting the write pointer, so there is no need for
+ * individual busy extent tracking.
+ */
+ struct xfs_group *xg_next_reset;
+ };
/*
* Bitsets of per-ag metadata that have been checked and/or are sick.
@@ -107,9 +120,15 @@ xfs_gbno_to_daddr(
xfs_agblock_t gbno)
{
struct xfs_mount *mp = xg->xg_mount;
- uint32_t blocks = mp->m_groups[xg->xg_type].blocks;
+ struct xfs_groups *g = &mp->m_groups[xg->xg_type];
+ xfs_fsblock_t fsbno;
+
+ if (g->has_daddr_gaps)
+ fsbno = xfs_gbno_to_fsb(xg, gbno);
+ else
+ fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno;
- return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno);
+ return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno);
}
static inline uint32_t
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f3a840a425f5..750111634d9f 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -364,7 +364,7 @@ xfs_ialloc_inode_init(
(j * M_IGEO(mp)->blocks_per_cluster));
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
mp->m_bsize * M_IGEO(mp)->blocks_per_cluster,
- XBF_UNMAPPED, &fbuf);
+ 0, &fbuf);
if (error)
return error;
@@ -1927,7 +1927,7 @@ xfs_dialloc(
* that we can immediately allocate, but then we allow allocation on the
* second pass if we fail to find an AG with free inodes in it.
*/
- if (percpu_counter_read_positive(&mp->m_fdblocks) <
+ if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) <
mp->m_low_space[XFS_LOWSP_1_PCNT]) {
ok_alloc = false;
low_space = true;
@@ -2801,12 +2801,35 @@ xfs_ialloc_read_agi(
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
}
+#ifdef DEBUG
/*
- * It's possible for these to be out of sync if
- * we are in the middle of a forced shutdown.
+ * It's possible for the AGF to be out of sync if the block device is
+ * silently dropping writes. This can happen in fstests with dmflakey
+ * enabled, which allows the buffer to be cleaned and reclaimed by
+ * memory pressure and then re-read from disk here. We will get a
+ * stale version of the AGF from disk, and nothing good can happen from
+ * here. Hence if we detect this situation, immediately shut down the
+ * filesystem.
+ *
+ * This can also happen if we are already in the middle of a forced
+ * shutdown, so don't bother checking if we are already shut down.
*/
- ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- xfs_is_shutdown(pag_mount(pag)));
+ if (!xfs_is_shutdown(pag_mount(pag))) {
+ bool ok = true;
+
+ ok &= pag->pagi_freecount == be32_to_cpu(agi->agi_freecount);
+ ok &= pag->pagi_count == be32_to_cpu(agi->agi_count);
+
+ if (XFS_IS_CORRUPT(pag_mount(pag), !ok)) {
+ xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
+ xfs_trans_brelse(tp, agibp);
+ xfs_force_shutdown(pag_mount(pag),
+ SHUTDOWN_CORRUPT_ONDISK);
+ return -EFSCORRUPTED;
+ }
+ }
+#endif /* DEBUG */
+
if (agibpp)
*agibpp = agibp;
else
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 6f270d8f4270..100afdd66cdd 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -265,17 +265,17 @@ xfs_finobt_init_ptr_from_cur(
ptr->s = agi->agi_free_root;
}
-STATIC int64_t
-xfs_inobt_key_diff(
+STATIC int
+xfs_inobt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
- return (int64_t)be32_to_cpu(key->inobt.ir_startino) -
- cur->bc_rec.i.ir_startino;
+ return cmp_int(be32_to_cpu(key->inobt.ir_startino),
+ cur->bc_rec.i.ir_startino);
}
-STATIC int64_t
-xfs_inobt_diff_two_keys(
+STATIC int
+xfs_inobt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -283,8 +283,8 @@ xfs_inobt_diff_two_keys(
{
ASSERT(!mask || mask->inobt.ir_startino);
- return (int64_t)be32_to_cpu(k1->inobt.ir_startino) -
- be32_to_cpu(k2->inobt.ir_startino);
+ return cmp_int(be32_to_cpu(k1->inobt.ir_startino),
+ be32_to_cpu(k2->inobt.ir_startino));
}
static xfs_failaddr_t
@@ -430,9 +430,9 @@ const struct xfs_btree_ops xfs_inobt_ops = {
.init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
- .key_diff = xfs_inobt_key_diff,
+ .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur,
.buf_ops = &xfs_inobt_buf_ops,
- .diff_two_keys = xfs_inobt_diff_two_keys,
+ .cmp_two_keys = xfs_inobt_cmp_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
.keys_contiguous = xfs_inobt_keys_contiguous,
@@ -460,9 +460,9 @@ const struct xfs_btree_ops xfs_finobt_ops = {
.init_high_key_from_rec = xfs_inobt_init_high_key_from_rec,
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
- .key_diff = xfs_inobt_key_diff,
+ .cmp_key_with_cur = xfs_inobt_cmp_key_with_cur,
.buf_ops = &xfs_finobt_buf_ops,
- .diff_two_keys = xfs_inobt_diff_two_keys,
+ .cmp_two_keys = xfs_inobt_cmp_two_keys,
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
.keys_contiguous = xfs_inobt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index f24fa628fecf..aa13fc00afd7 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -137,7 +137,7 @@ xfs_imap_to_bp(
int error;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+ imap->im_len, 0, bpp, &xfs_inode_buf_ops);
if (xfs_metadata_is_sick(error))
xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
XFS_SICK_AG_INODES);
@@ -252,7 +252,10 @@ xfs_inode_from_disk(
be64_to_cpu(from->di_changecount));
ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime);
ip->i_diflags2 = be64_to_cpu(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = be32_to_cpu(from->di_cowextsize);
+ BUILD_BUG_ON(sizeof(from->di_cowextsize) !=
+ sizeof(from->di_used_blocks));
}
error = xfs_iformat_data_fork(ip, from);
@@ -349,6 +352,7 @@ xfs_inode_to_disk(
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime);
to->di_flags2 = cpu_to_be64(ip->i_diflags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(ip->i_cowextsize);
to->di_ino = cpu_to_be64(ip->i_ino);
to->di_lsn = cpu_to_be64(lsn);
@@ -752,11 +756,18 @@ xfs_dinode_verify(
!xfs_has_rtreflink(mp))
return __this_address;
- /* COW extent size hint validation */
- fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
- mode, flags, flags2);
- if (fa)
- return fa;
+ if (xfs_has_zoned(mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
+ if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
+ return __this_address;
+ } else {
+ /* COW extent size hint validation */
+ fa = xfs_inode_validate_cowextsize(mp,
+ be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa)
+ return fa;
+ }
/* bigtime iflag can only happen on bigtime filesystems */
if (xfs_dinode_has_bigtime(dip) &&
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index deb0b7c00a1f..48fe49a5f050 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -322,6 +322,7 @@ xfs_inode_init(
if (xfs_has_v3inodes(mp)) {
inode_set_iversion(inode, 1);
+ /* also covers the di_used_blocks union arm: */
ip->i_cowextsize = 0;
times |= XFS_ICHGTIME_CREATE;
}
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index a472ac2e45d0..0d637c276db0 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -475,7 +475,12 @@ struct xfs_log_dinode {
xfs_lsn_t di_lsn;
uint64_t di_flags2; /* more random flags */
- uint32_t di_cowextsize; /* basic cow extent size for file */
+ union {
+ /* basic cow extent size for (regular) file */
+ uint32_t di_cowextsize;
+ /* used blocks in RTG for (zoned) rtrmap inode */
+ uint32_t di_used_blocks;
+ };
uint8_t di_pad2[12]; /* more padding for future expansion */
/* fields only written to during inode creation */
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 66c7916fb5cd..95de23095030 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -104,7 +104,7 @@ struct xlog_recover_item {
struct list_head ri_list;
int ri_cnt; /* count of regions found */
int ri_total; /* total regions */
- struct xfs_log_iovec *ri_buf; /* ptr to regions buffer */
+ struct kvec *ri_buf; /* ptr to regions buffer */
const struct xlog_recover_item_ops *ri_ops;
};
@@ -117,7 +117,7 @@ struct xlog_recover {
struct list_head r_itemq; /* q for items */
};
-#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr)
+#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].iov_base)
#define XLOG_RECOVER_CRCPASS 0
#define XLOG_RECOVER_PASS1 1
diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index d3bd6a86c8fe..34bba96d30ca 100644
--- a/fs/xfs/libxfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
@@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks(
*/
if (xfs_want_minlogsize_fixes(&mp->m_sb)) {
xfs_trans_resv_calc(mp, resv);
+ resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
return;
}
@@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks(
xfs_trans_resv_calc(mp, resv);
+ /* Copy the dynamic transaction reservation types from the running fs */
+ resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend;
+
if (xfs_has_reflink(mp)) {
/*
* In the early days of reflink, typical log operation counts
diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c
index 2f5f554a36d4..225923e463c4 100644
--- a/fs/xfs/libxfs/xfs_metafile.c
+++ b/fs/xfs/libxfs/xfs_metafile.c
@@ -21,6 +21,9 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
+#include "xfs_rtgroup.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_rtrefcount_btree.h"
static const struct {
enum xfs_metafile_type mtype;
@@ -74,12 +77,11 @@ xfs_metafile_clear_iflag(
}
/*
- * Is the amount of space that could be allocated towards a given metadata
- * file at or beneath a certain threshold?
+ * Is the metafile reservations at or beneath a certain threshold?
*/
static inline bool
xfs_metafile_resv_can_cover(
- struct xfs_inode *ip,
+ struct xfs_mount *mp,
int64_t rhs)
{
/*
@@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover(
* global free block count. Take care of the first case to avoid
* touching the per-cpu counter.
*/
- if (ip->i_delayed_blks >= rhs)
+ if (mp->m_metafile_resv_avail >= rhs)
return true;
/*
* There aren't enough blocks left in the inode's reservation, but it
* isn't critical unless there also isn't enough free space.
*/
- return __percpu_counter_compare(&ip->i_mount->m_fdblocks,
- rhs - ip->i_delayed_blks, 2048) >= 0;
+ return xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
+ rhs - mp->m_metafile_resv_avail, 2048) >= 0;
}
/*
- * Is this metadata file critically low on blocks? For now we'll define that
- * as the number of blocks we can get our hands on being less than 10% of what
- * we reserved or less than some arbitrary number (maximum btree height).
+ * Is the metafile reservation critically low on blocks? For now we'll define
+ * that as the number of blocks we can get our hands on being less than 10% of
+ * what we reserved or less than some arbitrary number (maximum btree height).
*/
bool
xfs_metafile_resv_critical(
- struct xfs_inode *ip)
+ struct xfs_mount *mp)
{
- uint64_t asked_low_water;
+ ASSERT(xfs_has_metadir(mp));
- if (!ip)
- return false;
-
- ASSERT(xfs_is_metadir_inode(ip));
- trace_xfs_metafile_resv_critical(ip, 0);
+ trace_xfs_metafile_resv_critical(mp, 0);
- if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels))
+ if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels))
return true;
- asked_low_water = div_u64(ip->i_meta_resv_asked, 10);
- if (!xfs_metafile_resv_can_cover(ip, asked_low_water))
+ if (!xfs_metafile_resv_can_cover(mp,
+ div_u64(mp->m_metafile_resv_target, 10)))
return true;
- return XFS_TEST_ERROR(false, ip->i_mount,
- XFS_ERRTAG_METAFILE_RESV_CRITICAL);
+ return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL);
}
/* Allocate a block from the metadata file's reservation. */
@@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space(
struct xfs_inode *ip,
struct xfs_alloc_arg *args)
{
+ struct xfs_mount *mp = ip->i_mount;
int64_t len = args->len;
ASSERT(xfs_is_metadir_inode(ip));
ASSERT(args->resv == XFS_AG_RESV_METAFILE);
- trace_xfs_metafile_resv_alloc_space(ip, args->len);
+ trace_xfs_metafile_resv_alloc_space(mp, args->len);
/*
* Allocate the blocks from the metadata inode's block reservation
* and update the ondisk sb counter.
*/
- if (ip->i_delayed_blks > 0) {
+ mutex_lock(&mp->m_metafile_resv_lock);
+ if (mp->m_metafile_resv_avail > 0) {
int64_t from_resv;
- from_resv = min_t(int64_t, len, ip->i_delayed_blks);
- ip->i_delayed_blks -= from_resv;
+ from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail);
+ mp->m_metafile_resv_avail -= from_resv;
xfs_mod_delalloc(ip, 0, -from_resv);
xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS,
-from_resv);
@@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space(
xfs_trans_mod_sb(args->tp, field, -len);
}
+ mp->m_metafile_resv_used += args->len;
+ mutex_unlock(&mp->m_metafile_resv_lock);
+
ip->i_nblocks += args->len;
xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE);
}
@@ -186,26 +188,33 @@ xfs_metafile_resv_free_space(
struct xfs_trans *tp,
xfs_filblks_t len)
{
+ struct xfs_mount *mp = ip->i_mount;
int64_t to_resv;
ASSERT(xfs_is_metadir_inode(ip));
- trace_xfs_metafile_resv_free_space(ip, len);
+
+ trace_xfs_metafile_resv_free_space(mp, len);
ip->i_nblocks -= len;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ mutex_lock(&mp->m_metafile_resv_lock);
+ mp->m_metafile_resv_used -= len;
+
/*
* Add the freed blocks back into the inode's delalloc reservation
* until it reaches the maximum size. Update the ondisk fdblocks only.
*/
- to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks);
+ to_resv = mp->m_metafile_resv_target -
+ (mp->m_metafile_resv_used + mp->m_metafile_resv_avail);
if (to_resv > 0) {
to_resv = min_t(int64_t, to_resv, len);
- ip->i_delayed_blks += to_resv;
+ mp->m_metafile_resv_avail += to_resv;
xfs_mod_delalloc(ip, 0, to_resv);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv);
len -= to_resv;
}
+ mutex_unlock(&mp->m_metafile_resv_lock);
/*
* Everything else goes back to the filesystem, so update the in-core
@@ -215,61 +224,99 @@ xfs_metafile_resv_free_space(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len);
}
-/* Release a metadata file's space reservation. */
+static void
+__xfs_metafile_resv_free(
+ struct xfs_mount *mp)
+{
+ if (mp->m_metafile_resv_avail) {
+ xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail);
+ xfs_add_fdblocks(mp, mp->m_metafile_resv_avail);
+ }
+ mp->m_metafile_resv_avail = 0;
+ mp->m_metafile_resv_used = 0;
+ mp->m_metafile_resv_target = 0;
+}
+
+/* Release unused metafile space reservation. */
void
xfs_metafile_resv_free(
- struct xfs_inode *ip)
+ struct xfs_mount *mp)
{
- /* Non-btree metadata inodes don't need space reservations. */
- if (!ip || !ip->i_meta_resv_asked)
+ if (!xfs_has_metadir(mp))
return;
- ASSERT(xfs_is_metadir_inode(ip));
- trace_xfs_metafile_resv_free(ip, 0);
+ trace_xfs_metafile_resv_free(mp, 0);
- if (ip->i_delayed_blks) {
- xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks);
- xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks);
- ip->i_delayed_blks = 0;
- }
- ip->i_meta_resv_asked = 0;
+ mutex_lock(&mp->m_metafile_resv_lock);
+ __xfs_metafile_resv_free(mp);
+ mutex_unlock(&mp->m_metafile_resv_lock);
}
-/* Set up a metadata file's space reservation. */
+/* Set up a metafile space reservation. */
int
xfs_metafile_resv_init(
- struct xfs_inode *ip,
- xfs_filblks_t ask)
+ struct xfs_mount *mp)
{
+ struct xfs_rtgroup *rtg = NULL;
+ xfs_filblks_t used = 0, target = 0;
xfs_filblks_t hidden_space;
- xfs_filblks_t used;
- int error;
+ xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4;
+ int error = 0;
- if (!ip || ip->i_meta_resv_asked > 0)
+ if (!xfs_has_metadir(mp))
return 0;
- ASSERT(xfs_is_metadir_inode(ip));
+ /*
+ * Free any previous reservation to have a clean slate.
+ */
+ mutex_lock(&mp->m_metafile_resv_lock);
+ __xfs_metafile_resv_free(mp);
+
+ /*
+ * Currently the only btree metafiles that require reservations are the
+ * rtrmap and the rtrefcount. Anything new will have to be added here
+ * as well.
+ */
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ if (xfs_has_rtrmapbt(mp)) {
+ used += rtg_rmap(rtg)->i_nblocks;
+ target += xfs_rtrmapbt_calc_reserves(mp);
+ }
+ if (xfs_has_rtreflink(mp)) {
+ used += rtg_refcount(rtg)->i_nblocks;
+ target += xfs_rtrefcountbt_calc_reserves(mp);
+ }
+ }
+
+ if (!target)
+ goto out_unlock;
/*
- * Space taken by all other metadata btrees are accounted on-disk as
+ * Space taken by the per-AG metadata btrees are accounted on-disk as
* used space. We therefore only hide the space that is reserved but
* not used by the trees.
*/
- used = ip->i_nblocks;
- if (used > ask)
- ask = used;
- hidden_space = ask - used;
+ if (used > target)
+ target = used;
+ else if (target > dblocks_avail)
+ target = dblocks_avail;
+ hidden_space = target - used;
- error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true);
+ error = xfs_dec_fdblocks(mp, hidden_space, true);
if (error) {
- trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_);
- return error;
+ trace_xfs_metafile_resv_init_error(mp, 0);
+ goto out_unlock;
}
- xfs_mod_delalloc(ip, 0, hidden_space);
- ip->i_delayed_blks = hidden_space;
- ip->i_meta_resv_asked = ask;
+ xfs_mod_sb_delalloc(mp, hidden_space);
+
+ mp->m_metafile_resv_target = target;
+ mp->m_metafile_resv_used = used;
+ mp->m_metafile_resv_avail = hidden_space;
+
+ trace_xfs_metafile_resv_init(mp, target);
- trace_xfs_metafile_resv_init(ip, ask);
- return 0;
+out_unlock:
+ mutex_unlock(&mp->m_metafile_resv_lock);
+ return error;
}
diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h
index 95af4b52e5a7..ae6f9e779b98 100644
--- a/fs/xfs/libxfs/xfs_metafile.h
+++ b/fs/xfs/libxfs/xfs_metafile.h
@@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip);
/* Space reservations for metadata inodes. */
struct xfs_alloc_arg;
-bool xfs_metafile_resv_critical(struct xfs_inode *ip);
+bool xfs_metafile_resv_critical(struct xfs_mount *mp);
void xfs_metafile_resv_alloc_space(struct xfs_inode *ip,
struct xfs_alloc_arg *args);
void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp,
xfs_filblks_t len);
-void xfs_metafile_resv_free(struct xfs_inode *ip);
-int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask);
+void xfs_metafile_resv_free(struct xfs_mount *mp);
+int xfs_metafile_resv_init(struct xfs_mount *mp);
/* Code specific to kernel/userspace; must be provided externally. */
diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index a85ecddaa48e..5ed44fdf7491 100644
--- a/fs/xfs/libxfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void)
16299260424LL);
/* superblock field checks we got from xfs/122 */
- XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288);
- XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304);
XFS_CHECK_SB_OFFSET(sb_magicnum, 0);
XFS_CHECK_SB_OFFSET(sb_blocksize, 4);
XFS_CHECK_SB_OFFSET(sb_dblocks, 8);
@@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_SB_OFFSET(sb_rgextents, 276);
XFS_CHECK_SB_OFFSET(sb_rgblklog, 280);
XFS_CHECK_SB_OFFSET(sb_pad, 281);
+ XFS_CHECK_SB_OFFSET(sb_rtstart, 288);
+ XFS_CHECK_SB_OFFSET(sb_rtreserved, 296);
}
#endif /* __XFS_ONDISK_H */
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index cebe83f7842a..897784037483 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -2099,9 +2099,7 @@ xfs_refcount_recover_cow_leftovers(
* recording the CoW debris we cancel the (empty) transaction
* and everything goes away cleanly.
*/
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
if (isrt) {
xfs_rtgroup_lock(to_rtg(xg), XFS_RTGLOCK_REFCOUNT);
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 54505fee1852..06da3ca14727 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -174,8 +174,8 @@ xfs_refcountbt_init_ptr_from_cur(
ptr->s = agf->agf_refcount_root;
}
-STATIC int64_t
-xfs_refcountbt_key_diff(
+STATIC int
+xfs_refcountbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
@@ -185,11 +185,11 @@ xfs_refcountbt_key_diff(
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
- return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+ return cmp_int(be32_to_cpu(kp->rc_startblock), start);
}
-STATIC int64_t
-xfs_refcountbt_diff_two_keys(
+STATIC int
+xfs_refcountbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -197,8 +197,8 @@ xfs_refcountbt_diff_two_keys(
{
ASSERT(!mask || mask->refc.rc_startblock);
- return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ return cmp_int(be32_to_cpu(k1->refc.rc_startblock),
+ be32_to_cpu(k2->refc.rc_startblock));
}
STATIC xfs_failaddr_t
@@ -339,9 +339,9 @@ const struct xfs_btree_ops xfs_refcountbt_ops = {
.init_high_key_from_rec = xfs_refcountbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_refcountbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_refcountbt_init_ptr_from_cur,
- .key_diff = xfs_refcountbt_key_diff,
+ .cmp_key_with_cur = xfs_refcountbt_cmp_key_with_cur,
.buf_ops = &xfs_refcountbt_buf_ops,
- .diff_two_keys = xfs_refcountbt_diff_two_keys,
+ .cmp_two_keys = xfs_refcountbt_cmp_two_keys,
.keys_inorder = xfs_refcountbt_keys_inorder,
.recs_inorder = xfs_refcountbt_recs_inorder,
.keys_contiguous = xfs_refcountbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 2cab694ac58a..bf16aee50d73 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -243,38 +243,22 @@ static inline uint64_t offset_keymask(uint64_t offset)
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
-STATIC int64_t
-xfs_rmapbt_key_diff(
+STATIC int
+xfs_rmapbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
const struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
- d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
- if (d)
- return d;
-
- x = be64_to_cpu(kp->rm_owner);
- y = rec->rm_owner;
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
-
- x = offset_keymask(be64_to_cpu(kp->rm_offset));
- y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
- return 0;
+ return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
+ cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
+ cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
+ offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}
-STATIC int64_t
-xfs_rmapbt_diff_two_keys(
+STATIC int
+xfs_rmapbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -282,36 +266,31 @@ xfs_rmapbt_diff_two_keys(
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ int d;
/* Doesn't make sense to mask off the physical space part */
ASSERT(!mask || mask->rmap.rm_startblock);
- d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ d = cmp_int(be32_to_cpu(kp1->rm_startblock),
+ be32_to_cpu(kp2->rm_startblock));
if (d)
return d;
if (!mask || mask->rmap.rm_owner) {
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(be64_to_cpu(kp1->rm_owner),
+ be64_to_cpu(kp2->rm_owner));
+ if (d)
+ return d;
}
if (!mask || mask->rmap.rm_offset) {
/* Doesn't make sense to allow offset but not owner */
ASSERT(!mask || mask->rmap.rm_owner);
- x = offset_keymask(be64_to_cpu(kp1->rm_offset));
- y = offset_keymask(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
+ offset_keymask(be64_to_cpu(kp2->rm_offset)));
+ if (d)
+ return d;
}
return 0;
@@ -515,9 +494,9 @@ const struct xfs_btree_ops xfs_rmapbt_ops = {
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rmapbt_init_ptr_from_cur,
- .key_diff = xfs_rmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rmapbt_buf_ops,
- .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rmapbt_cmp_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
.keys_contiguous = xfs_rmapbt_keys_contiguous,
@@ -632,9 +611,9 @@ const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
.init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = xfs_rmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rmapbt_mem_buf_ops,
- .diff_two_keys = xfs_rmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rmapbt_cmp_two_keys,
.keys_inorder = xfs_rmapbt_keys_inorder,
.recs_inorder = xfs_rmapbt_recs_inorder,
.keys_contiguous = xfs_rmapbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 770adf60dd73..5057536e586c 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1123,6 +1123,7 @@ xfs_rtfree_blocks(
xfs_extlen_t mod;
int error;
+ ASSERT(!xfs_has_zoned(mp));
ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN);
mod = xfs_blen_to_rtxoff(mp, rtlen);
@@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range(
end = min(end, rtg->rtg_extents - 1);
+ if (xfs_has_zoned(mp))
+ return -EINVAL;
+
/* Iterate the bitmap, looking for discrepancies. */
while (start <= end) {
struct xfs_rtalloc_rec rec;
@@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len(
struct xfs_mount *mp,
xfs_rtbxlen_t rtextents)
{
+ if (xfs_has_zoned(mp))
+ return 0;
return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp));
}
@@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount(
xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp);
unsigned long long rsumwords;
+ if (xfs_has_zoned(mp)) {
+ *rsumlevels = 0;
+ return 0;
+ }
+
*rsumlevels = xfs_compute_rextslog(rextents) + 1;
rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels);
return howmany_64(rsumwords, mp->m_blockwsize);
diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c
index d84d32f1b48f..9186c58e83d5 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.c
+++ b/fs/xfs/libxfs/xfs_rtgroup.c
@@ -194,15 +194,17 @@ xfs_rtgroup_lock(
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) ||
!(rtglock_flags & XFS_RTGLOCK_BITMAP));
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- /*
- * Lock both realtime free space metadata inodes for a freespace
- * update.
- */
- xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
- xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
- } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ /*
+ * Lock both realtime free space metadata inodes for a
+ * freespace update.
+ */
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
}
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
@@ -228,11 +230,13 @@ xfs_rtgroup_unlock(
if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg))
xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL);
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
- xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
- xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
- } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
- xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ if (!xfs_has_zoned(rtg_mount(rtg))) {
+ if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL);
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL);
+ } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) {
+ xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED);
+ }
}
}
@@ -249,7 +253,8 @@ xfs_rtgroup_trans_join(
ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS));
ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED));
- if (rtglock_flags & XFS_RTGLOCK_BITMAP) {
+ if (!xfs_has_zoned(rtg_mount(rtg)) &&
+ (rtglock_flags & XFS_RTGLOCK_BITMAP)) {
xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL);
}
@@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry(
/* Fill out form. */
memset(rgeo, 0, sizeof(*rgeo));
rgeo->rg_number = rtg_rgno(rtg);
- rgeo->rg_length = rtg_group(rtg)->xg_block_count;
+ rgeo->rg_length = rtg_blocks(rtg);
xfs_rtgroup_geom_health(rtg, rgeo);
return 0;
}
@@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_BITMAP,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtbitmap_create,
},
[XFS_RTGI_SUMMARY] = {
@@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = {
.sick = XFS_SICK_RG_SUMMARY,
.fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) |
(1U << XFS_DINODE_FMT_BTREE),
+ .enabled = xfs_has_nonzoned,
.create = xfs_rtsummary_create,
},
[XFS_RTGI_RMAP] = {
diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h
index 03f39d4e43fc..d36a6ae0abe5 100644
--- a/fs/xfs/libxfs/xfs_rtgroup.h
+++ b/fs/xfs/libxfs/xfs_rtgroup.h
@@ -37,15 +37,33 @@ struct xfs_rtgroup {
xfs_rtxnum_t rtg_extents;
/*
- * Cache of rt summary level per bitmap block with the invariant that
- * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0,
- * or 0 if rsum[i][bbno] == 0 for all i.
- *
+ * For bitmap based RT devices this points to a cache of rt summary
+ * level per bitmap block with the invariant that rtg_rsum_cache[bbno]
+ * > the maximum i for which rsum[i][bbno] != 0, or 0 if
+ * rsum[i][bbno] == 0 for all i.
* Reads and writes are serialized by the rsumip inode lock.
+ *
+ * For zoned RT devices this points to the open zone structure for
+ * a group that is open for writers, or is NULL.
*/
- uint8_t *rtg_rsum_cache;
+ union {
+ uint8_t *rtg_rsum_cache;
+ struct xfs_open_zone *rtg_open_zone;
+ };
};
+/*
+ * For zoned RT devices this is set on groups that have no written blocks
+ * and can be picked by the allocator for opening.
+ */
+#define XFS_RTG_FREE XA_MARK_0
+
+/*
+ * For zoned RT devices this is set on groups that are fully written and that
+ * have unused blocks. Used by the garbage collection to pick targets.
+ */
+#define XFS_RTG_RECLAIMABLE XA_MARK_1
+
static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg)
{
return container_of(xg, struct xfs_rtgroup, rtg_group);
@@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg)
return rtg->rtg_group.xg_gno;
}
+static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg)
+{
+ return rtg->rtg_group.xg_block_count;
+}
+
static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg)
{
return rtg->rtg_inodes[XFS_RTGI_BITMAP];
@@ -222,10 +245,14 @@ xfs_rtb_to_daddr(
xfs_rtblock_t rtbno)
{
struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
- xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
- uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks;
- return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask));
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
+ xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno);
+
+ rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask);
+ }
+
+ return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno);
}
static inline xfs_rtblock_t
@@ -233,10 +260,11 @@ xfs_daddr_to_rtb(
struct xfs_mount *mp,
xfs_daddr_t daddr)
{
- xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ xfs_rfsblock_t bno;
- if (xfs_has_rtgroups(mp)) {
- struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb;
+ if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) {
xfs_rgnumber_t rgno;
uint32_t rgbno;
diff --git a/fs/xfs/libxfs/xfs_rtrefcount_btree.c b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
index 3db5e7a4a945..ac11e94b42ae 100644
--- a/fs/xfs/libxfs/xfs_rtrefcount_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrefcount_btree.c
@@ -156,8 +156,8 @@ xfs_rtrefcountbt_init_ptr_from_cur(
ptr->l = 0;
}
-STATIC int64_t
-xfs_rtrefcountbt_key_diff(
+STATIC int
+xfs_rtrefcountbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
@@ -167,11 +167,11 @@ xfs_rtrefcountbt_key_diff(
start = xfs_refcount_encode_startblock(irec->rc_startblock,
irec->rc_domain);
- return (int64_t)be32_to_cpu(kp->rc_startblock) - start;
+ return cmp_int(be32_to_cpu(kp->rc_startblock), start);
}
-STATIC int64_t
-xfs_rtrefcountbt_diff_two_keys(
+STATIC int
+xfs_rtrefcountbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -179,8 +179,8 @@ xfs_rtrefcountbt_diff_two_keys(
{
ASSERT(!mask || mask->refc.rc_startblock);
- return (int64_t)be32_to_cpu(k1->refc.rc_startblock) -
- be32_to_cpu(k2->refc.rc_startblock);
+ return cmp_int(be32_to_cpu(k1->refc.rc_startblock),
+ be32_to_cpu(k2->refc.rc_startblock));
}
static xfs_failaddr_t
@@ -387,9 +387,9 @@ const struct xfs_btree_ops xfs_rtrefcountbt_ops = {
.init_high_key_from_rec = xfs_rtrefcountbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrefcountbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rtrefcountbt_init_ptr_from_cur,
- .key_diff = xfs_rtrefcountbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrefcountbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrefcountbt_buf_ops,
- .diff_two_keys = xfs_rtrefcountbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrefcountbt_cmp_two_keys,
.keys_inorder = xfs_rtrefcountbt_keys_inorder,
.recs_inorder = xfs_rtrefcountbt_recs_inorder,
.keys_contiguous = xfs_rtrefcountbt_keys_contiguous,
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c
index e4ec36943cb7..55f903165769 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c
@@ -185,38 +185,22 @@ static inline uint64_t offset_keymask(uint64_t offset)
return offset & ~XFS_RMAP_OFF_UNWRITTEN;
}
-STATIC int64_t
-xfs_rtrmapbt_key_diff(
+STATIC int
+xfs_rtrmapbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct xfs_rmap_irec *rec = &cur->bc_rec.r;
const struct xfs_rmap_key *kp = &key->rmap;
- __u64 x, y;
- int64_t d;
- d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
- if (d)
- return d;
-
- x = be64_to_cpu(kp->rm_owner);
- y = rec->rm_owner;
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
-
- x = offset_keymask(be64_to_cpu(kp->rm_offset));
- y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
- return 0;
+ return cmp_int(be32_to_cpu(kp->rm_startblock), rec->rm_startblock) ?:
+ cmp_int(be64_to_cpu(kp->rm_owner), rec->rm_owner) ?:
+ cmp_int(offset_keymask(be64_to_cpu(kp->rm_offset)),
+ offset_keymask(xfs_rmap_irec_offset_pack(rec)));
}
-STATIC int64_t
-xfs_rtrmapbt_diff_two_keys(
+STATIC int
+xfs_rtrmapbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -224,36 +208,31 @@ xfs_rtrmapbt_diff_two_keys(
{
const struct xfs_rmap_key *kp1 = &k1->rmap;
const struct xfs_rmap_key *kp2 = &k2->rmap;
- int64_t d;
- __u64 x, y;
+ int d;
/* Doesn't make sense to mask off the physical space part */
ASSERT(!mask || mask->rmap.rm_startblock);
- d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
- be32_to_cpu(kp2->rm_startblock);
+ d = cmp_int(be32_to_cpu(kp1->rm_startblock),
+ be32_to_cpu(kp2->rm_startblock));
if (d)
return d;
if (!mask || mask->rmap.rm_owner) {
- x = be64_to_cpu(kp1->rm_owner);
- y = be64_to_cpu(kp2->rm_owner);
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(be64_to_cpu(kp1->rm_owner),
+ be64_to_cpu(kp2->rm_owner));
+ if (d)
+ return d;
}
if (!mask || mask->rmap.rm_offset) {
/* Doesn't make sense to allow offset but not owner */
ASSERT(!mask || mask->rmap.rm_owner);
- x = offset_keymask(be64_to_cpu(kp1->rm_offset));
- y = offset_keymask(be64_to_cpu(kp2->rm_offset));
- if (x > y)
- return 1;
- else if (y > x)
- return -1;
+ d = cmp_int(offset_keymask(be64_to_cpu(kp1->rm_offset)),
+ offset_keymask(be64_to_cpu(kp2->rm_offset)));
+ if (d)
+ return d;
}
return 0;
@@ -511,9 +490,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_ops = {
.init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_rtrmapbt_init_ptr_from_cur,
- .key_diff = xfs_rtrmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrmapbt_buf_ops,
- .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys,
.keys_inorder = xfs_rtrmapbt_keys_inorder,
.recs_inorder = xfs_rtrmapbt_recs_inorder,
.keys_contiguous = xfs_rtrmapbt_keys_contiguous,
@@ -620,9 +599,9 @@ const struct xfs_btree_ops xfs_rtrmapbt_mem_ops = {
.init_high_key_from_rec = xfs_rtrmapbt_init_high_key_from_rec,
.init_rec_from_cur = xfs_rtrmapbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = xfs_rtrmapbt_key_diff,
+ .cmp_key_with_cur = xfs_rtrmapbt_cmp_key_with_cur,
.buf_ops = &xfs_rtrmapbt_mem_buf_ops,
- .diff_two_keys = xfs_rtrmapbt_diff_two_keys,
+ .cmp_two_keys = xfs_rtrmapbt_cmp_two_keys,
.keys_inorder = xfs_rtrmapbt_keys_inorder,
.recs_inorder = xfs_rtrmapbt_recs_inorder,
.keys_contiguous = xfs_rtrmapbt_keys_contiguous,
@@ -1033,3 +1012,22 @@ xfs_rtrmapbt_init_rtsb(
xfs_btree_del_cursor(cur, error);
return error;
}
+
+/*
+ * Return the highest rgbno currently tracked by the rmap for this rtg.
+ */
+xfs_rgblock_t
+xfs_rtrmap_highest_rgbno(
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot;
+ union xfs_btree_key key = {};
+ struct xfs_btree_cur *cur;
+
+ if (block->bb_numrecs == 0)
+ return NULLRGBLOCK;
+ cur = xfs_rtrmapbt_init_cursor(NULL, rtg);
+ xfs_btree_get_keys(cur, block, &key);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock);
+}
diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h
index 9d0915089891..e328fd62a149 100644
--- a/fs/xfs/libxfs/xfs_rtrmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h
@@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg,
int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
struct xfs_buftarg *btp, xfs_rgnumber_t rgno);
+xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg);
+
#endif /* __XFS_RTRMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 3dc5f5dba162..711e180f9ebb 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -185,6 +185,8 @@ xfs_sb_version_to_features(
features |= XFS_FEAT_PARENT;
if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR)
features |= XFS_FEAT_METADIR;
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)
+ features |= XFS_FEAT_ZONED;
return features;
}
@@ -266,6 +268,9 @@ static uint64_t
xfs_expected_rbmblocks(
struct xfs_sb *sbp)
{
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED))
+ return 0;
return howmany_64(xfs_extents_per_rbm(sbp),
NBBY * xfs_rtbmblock_size(sbp));
}
@@ -275,9 +280,15 @@ bool
xfs_validate_rt_geometry(
struct xfs_sb *sbp)
{
- if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
- sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
- return false;
+ if (xfs_sb_is_v5(sbp) &&
+ (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) {
+ if (sbp->sb_rextsize != 1)
+ return false;
+ } else {
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE)
+ return false;
+ }
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
@@ -435,6 +446,34 @@ xfs_validate_sb_rtgroups(
return 0;
}
+static int
+xfs_validate_sb_zoned(
+ struct xfs_mount *mp,
+ struct xfs_sb *sbp)
+{
+ if (sbp->sb_frextents != 0) {
+ xfs_warn(mp,
+"sb_frextents must be zero for zoned file systems.");
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) {
+ xfs_warn(mp,
+"sb_rtstart (%lld) overlaps sb_dblocks (%lld).",
+ sbp->sb_rtstart, sbp->sb_dblocks);
+ return -EINVAL;
+ }
+
+ if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) {
+ xfs_warn(mp,
+"sb_rtreserved (%lld) larger than sb_rblocks (%lld).",
+ sbp->sb_rtreserved, sbp->sb_rblocks);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* Check the validity of the SB. */
STATIC int
xfs_validate_sb_common(
@@ -523,6 +562,11 @@ xfs_validate_sb_common(
if (error)
return error;
}
+ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ error = xfs_validate_sb_zoned(mp, sbp);
+ if (error)
+ return error;
+ }
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
@@ -835,6 +879,14 @@ __xfs_sb_from_disk(
to->sb_rgcount = 1;
to->sb_rgextents = 0;
}
+
+ if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = be64_to_cpu(from->sb_rtstart);
+ to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved);
+ } else {
+ to->sb_rtstart = 0;
+ to->sb_rtreserved = 0;
+ }
}
void
@@ -1001,6 +1053,11 @@ xfs_sb_to_disk(
to->sb_rbmino = cpu_to_be64(0);
to->sb_rsumino = cpu_to_be64(0);
}
+
+ if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) {
+ to->sb_rtstart = cpu_to_be64(from->sb_rtstart);
+ to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved);
+ }
}
/*
@@ -1146,6 +1203,10 @@ xfs_sb_mount_rextsize(
rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize;
rgs->blklog = mp->m_sb.sb_rgblklog;
rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog);
+ rgs->start_fsb = mp->m_sb.sb_rtstart;
+ if (xfs_sb_has_incompat_feature(sbp,
+ XFS_SB_FEAT_INCOMPAT_ZONE_GAPS))
+ rgs->has_daddr_gaps = true;
} else {
rgs->blocks = 0;
rgs->blklog = 0;
@@ -1265,8 +1326,7 @@ xfs_log_sb(
mp->m_sb.sb_ifree = min_t(uint64_t,
percpu_counter_sum_positive(&mp->m_ifree),
mp->m_sb.sb_icount);
- mp->m_sb.sb_fdblocks =
- percpu_counter_sum_positive(&mp->m_fdblocks);
+ mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
}
/*
@@ -1275,9 +1335,10 @@ xfs_log_sb(
* we handle nearly-lockless reservations, so we must use the _positive
* variant here to avoid writing out nonsense frextents.
*/
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) {
mp->m_sb.sb_frextents =
- percpu_counter_sum_positive(&mp->m_frextents);
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS);
+ }
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
@@ -1510,6 +1571,8 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE;
if (xfs_has_metadir(mp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR;
+ if (xfs_has_zoned(mp))
+ geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED;
geo->rtsectsize = sbp->sb_blocksize;
geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp);
@@ -1530,6 +1593,10 @@ xfs_fs_geometry(
geo->rgcount = sbp->sb_rgcount;
geo->rgextents = sbp->sb_rgextents;
}
+ if (xfs_has_zoned(mp)) {
+ geo->rtstart = sbp->sb_rtstart;
+ geo->rtreserved = sbp->sb_rtreserved;
+ }
}
/* Read a secondary superblock. */
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 13d00c7166e1..86a111d0f2fc 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -22,6 +22,12 @@
#include "xfs_rtbitmap.h"
#include "xfs_attr_item.h"
#include "xfs_log.h"
+#include "xfs_defer.h"
+#include "xfs_bmap_item.h"
+#include "xfs_extfree_item.h"
+#include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_trace.h"
#define _ALLOC true
#define _FREE false
@@ -264,6 +270,42 @@ xfs_rtalloc_block_count(
*/
/*
+ * Finishing a data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Realtime refcount updates (t2);
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_rtreflink(mp))
+ return 0;
+
+ return xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
*
@@ -280,19 +322,10 @@ xfs_calc_refcountbt_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int t1, t2 = 0;
+ unsigned int t1, t2;
- if (!xfs_has_reflink(mp))
- return 0;
-
- t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
-
- if (xfs_has_realtime(mp))
- t2 = xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
- blksz);
+ t1 = xfs_calc_finish_cui_reservation(mp, nr_ops);
+ t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops);
return max(t1, t2);
}
@@ -380,6 +413,96 @@ xfs_calc_write_reservation_minlogsize(
}
/*
+ * Finishing an EFI can free the blocks and bmap blocks (t2):
+ * the agf for each of the ags: nr * sector size
+ * the agfl for each of the ags: nr * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming nr extents:
+ * nr exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Or, if it's a realtime file (t3):
+ * the agf for each of the ags: 2 * sector size
+ * the agfl for each of the ags: 2 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 2 exts * 1 block
+ * worst case split in allocation btrees per extent assuming 2 extents:
+ * 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr),
+ mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+ return xfs_calc_finish_efi_reservation(mp, nr);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rt_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rtrmapbt(mp))
+ return 0;
+ return xfs_calc_finish_rt_efi_reservation(mp, nr);
+}
+
+/*
+ * In finishing a BUI, we can modify:
+ * the inode being truncated: inode size
+ * dquots
+ * the inode's bmap btree: (max depth + 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_bui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+ mp->m_sb.sb_blocksize);
+}
+
+/*
* In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
* the inode's bmap btree: (max depth + 1) * block size
@@ -411,16 +534,8 @@ xfs_calc_itruncate_reservation(
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
- t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
-
- if (xfs_has_realtime(mp)) {
- t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- } else {
- t3 = 0;
- }
+ t2 = xfs_calc_finish_efi_reservation(mp, 4);
+ t3 = xfs_calc_finish_rt_efi_reservation(mp, 2);
/*
* In the early days of reflink, we included enough reservation to log
@@ -501,9 +616,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 3);
if (xfs_has_parent(mp)) {
unsigned int rename_overhead, exchange_overhead;
@@ -611,9 +724,7 @@ xfs_calc_link_reservation(
overhead += xfs_calc_iunlink_remove_reservation(mp);
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 1);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrsetm.tr_logres;
@@ -676,9 +787,7 @@ xfs_calc_remove_reservation(
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 2);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrrm.tr_logres;
@@ -1181,6 +1290,15 @@ xfs_calc_namespace_reservations(
resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
}
+STATIC void
+xfs_calc_default_atomic_ioend_reservation(
+ struct xfs_mount *mp,
+ struct xfs_trans_resv *resp)
+{
+ /* Pick a default that will scale reasonably for the log size. */
+ resp->tr_atomic_ioend = resp->tr_itruncate;
+}
+
void
xfs_trans_resv_calc(
struct xfs_mount *mp,
@@ -1275,4 +1393,167 @@ xfs_trans_resv_calc(
resp->tr_itruncate.tr_logcount += logcount_adj;
resp->tr_write.tr_logcount += logcount_adj;
resp->tr_qm_dqalloc.tr_logcount += logcount_adj;
+
+ /*
+ * Now that we've finished computing the static reservations, we can
+ * compute the dynamic reservation for atomic writes.
+ */
+ xfs_calc_default_atomic_ioend_reservation(mp, resp);
+}
+
+/*
+ * Return the per-extent and fixed transaction reservation sizes needed to
+ * complete an atomic write.
+ */
+STATIC unsigned int
+xfs_calc_atomic_write_ioend_geometry(
+ struct xfs_mount *mp,
+ unsigned int *step_size)
+{
+ const unsigned int efi = xfs_efi_log_space(1);
+ const unsigned int efd = xfs_efd_log_space(1);
+ const unsigned int rui = xfs_rui_log_space(1);
+ const unsigned int rud = xfs_rud_log_space();
+ const unsigned int cui = xfs_cui_log_space(1);
+ const unsigned int cud = xfs_cud_log_space();
+ const unsigned int bui = xfs_bui_log_space(1);
+ const unsigned int bud = xfs_bud_log_space();
+
+ /*
+ * Maximum overhead to complete an atomic write ioend in software:
+ * remove data fork extent + remove cow fork extent + map extent into
+ * data fork.
+ *
+ * tx0: Creates a BUI and a CUI and that's all it needs.
+ *
+ * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
+ * enough space to relog the CUI (== CUI + CUD).
+ *
+ * tx2: Roll again to finish the RUI. Need space for the RUD and space
+ * to relog the CUI.
+ *
+ * tx3: Roll again, need space for the CUD and possibly a new EFI.
+ *
+ * tx4: Roll again, need space for an EFD.
+ *
+ * If the extent referenced by the pair of BUI/CUI items is not the one
+ * being currently processed, then we need to reserve space to relog
+ * both items.
+ */
+ const unsigned int tx0 = bui + cui;
+ const unsigned int tx1 = bud + rui + cui + cud;
+ const unsigned int tx2 = rud + cui + cud;
+ const unsigned int tx3 = cud + efi;
+ const unsigned int tx4 = efd;
+ const unsigned int relog = bui + bud + cui + cud;
+
+ const unsigned int per_intent = max(max3(tx0, tx1, tx2),
+ max3(tx3, tx4, relog));
+
+ /* Overhead to finish one step of each intent item type */
+ const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
+ const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
+ const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
+ const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1);
+
+ /* We only finish one item per transaction in a chain */
+ *step_size = max(f4, max3(f1, f2, f3));
+
+ return per_intent;
+}
+
+/*
+ * Compute the maximum size (in fsblocks) of atomic writes that we can complete
+ * given the existing log reservations.
+ */
+xfs_extlen_t
+xfs_calc_max_atomic_write_fsblocks(
+ struct xfs_mount *mp)
+{
+ const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend;
+ unsigned int per_intent = 0;
+ unsigned int step_size = 0;
+ unsigned int ret = 0;
+
+ if (resv->tr_logres > 0) {
+ per_intent = xfs_calc_atomic_write_ioend_geometry(mp,
+ &step_size);
+
+ if (resv->tr_logres >= step_size)
+ ret = (resv->tr_logres - step_size) / per_intent;
+ }
+
+ trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size,
+ resv->tr_logres, ret);
+
+ return ret;
+}
+
+/*
+ * Compute the log blocks and transaction reservation needed to complete an
+ * atomic write of a given number of blocks. Worst case, each block requires
+ * separate handling. A return value of 0 means something went wrong.
+ */
+xfs_extlen_t
+xfs_calc_atomic_write_log_geometry(
+ struct xfs_mount *mp,
+ xfs_extlen_t blockcount,
+ unsigned int *new_logres)
+{
+ struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend;
+ uint old_logres = curr_res->tr_logres;
+ unsigned int per_intent, step_size;
+ unsigned int logres;
+ xfs_extlen_t min_logblocks;
+
+ ASSERT(blockcount > 0);
+
+ xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+
+ per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size);
+
+ /* Check for overflows */
+ if (check_mul_overflow(blockcount, per_intent, &logres) ||
+ check_add_overflow(logres, step_size, &logres))
+ return 0;
+
+ curr_res->tr_logres = logres;
+ min_logblocks = xfs_log_calc_minimum_size(mp);
+ curr_res->tr_logres = old_logres;
+
+ trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size,
+ blockcount, min_logblocks, logres);
+
+ *new_logres = logres;
+ return min_logblocks;
+}
+
+/*
+ * Compute the transaction reservation needed to complete an out of place
+ * atomic write of a given number of blocks.
+ */
+int
+xfs_calc_atomic_write_reservation(
+ struct xfs_mount *mp,
+ xfs_extlen_t blockcount)
+{
+ unsigned int new_logres;
+ xfs_extlen_t min_logblocks;
+
+ /*
+ * If the caller doesn't ask for a specific atomic write size, then
+ * use the defaults.
+ */
+ if (blockcount == 0) {
+ xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp));
+ return 0;
+ }
+
+ min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount,
+ &new_logres);
+ if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks)
+ return -EINVAL;
+
+ M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres;
+ return 0;
}
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d2..336279e0fc61 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -48,6 +48,7 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */
struct xfs_trans_res tr_sb; /* modify superblock */
struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */
+ struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */
};
/* shorthand way of accessing reservation structure */
@@ -98,8 +99,32 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp);
+xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp,
+ xfs_extlen_t blockcount, unsigned int *new_logres);
+int xfs_calc_atomic_write_reservation(struct xfs_mount *mp,
+ xfs_extlen_t blockcount);
+
#endif /* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index ca2401c1facd..f6f4f2d4b5db 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -233,6 +233,34 @@ enum xfs_group_type {
{ XG_TYPE_AG, "ag" }, \
{ XG_TYPE_RTG, "rtg" }
+enum xfs_free_counter {
+ /*
+ * Number of free blocks on the data device.
+ */
+ XC_FREE_BLOCKS,
+
+ /*
+ * Number of free RT extents on the RT device.
+ */
+ XC_FREE_RTEXTENTS,
+
+ /*
+ * Number of available for use RT extents.
+ *
+ * This counter only exists for zoned RT device and indicates the number
+ * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS
+ * also includes blocks that have been written previously and freed, but
+ * sit in a rtgroup that still needs a zone reset.
+ */
+ XC_FREE_RTAVAILABLE,
+ XC_FREE_NR,
+};
+
+#define XFS_FREECOUNTER_STR \
+ { XC_FREE_BLOCKS, "blocks" }, \
+ { XC_FREE_RTEXTENTS, "rtextents" }, \
+ { XC_FREE_RTAVAILABLE, "rtavailable" }
+
/*
* Type verifier functions
*/
diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c
new file mode 100644
index 000000000000..b0791a71931c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zones.h"
+
+static bool
+xfs_zone_validate_empty(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > 0) {
+ xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ *write_pointer = 0;
+ return true;
+}
+
+static bool
+xfs_zone_validate_wp(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) {
+ xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.",
+ rtg_rgno(rtg), wp_fsb);
+ return false;
+ }
+
+ *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb);
+ if (*write_pointer >= rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
+ rtg_rgno(rtg), *write_pointer);
+ return false;
+ }
+
+ return true;
+}
+
+static bool
+xfs_zone_validate_full(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) {
+ xfs_warn(mp, "zone %u has too large used counter (0x%x).",
+ rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks);
+ return false;
+ }
+
+ *write_pointer = rtg->rtg_extents;
+ return true;
+}
+
+static bool
+xfs_zone_validate_seq(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_EMPTY:
+ return xfs_zone_validate_empty(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+ case BLK_ZONE_COND_CLOSED:
+ return xfs_zone_validate_wp(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_FULL:
+ return xfs_zone_validate_full(zone, rtg, write_pointer);
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_OFFLINE:
+ case BLK_ZONE_COND_READONLY:
+ xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ default:
+ xfs_warn(mp, "zone %u has unknown zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ }
+}
+
+static bool
+xfs_zone_validate_conv(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ return true;
+ default:
+ xfs_warn(mp,
+"conventional zone %u has unsupported zone condition 0x%x.",
+ rtg_rgno(rtg), zone->cond);
+ return false;
+ }
+}
+
+bool
+xfs_zone_validate(
+ struct blk_zone *zone,
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG];
+ uint32_t expected_size;
+
+ /*
+ * Check that the zone capacity matches the rtgroup size stored in the
+ * superblock. Note that all zones including the last one must have a
+ * uniform capacity.
+ */
+ if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) {
+ xfs_warn(mp,
+"zone %u capacity (0x%llx) does not match RT group size (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity),
+ g->blocks);
+ return false;
+ }
+
+ if (g->has_daddr_gaps) {
+ expected_size = 1 << g->blklog;
+ } else {
+ if (zone->len != zone->capacity) {
+ xfs_warn(mp,
+"zone %u has capacity != size ((0x%llx vs 0x%llx)",
+ rtg_rgno(rtg),
+ XFS_BB_TO_FSB(mp, zone->len),
+ XFS_BB_TO_FSB(mp, zone->capacity));
+ return false;
+ }
+ expected_size = g->blocks;
+ }
+
+ if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) {
+ xfs_warn(mp,
+"zone %u length (0x%llx) does match geometry (0x%x).",
+ rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len),
+ expected_size);
+ }
+
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ return xfs_zone_validate_conv(zone, rtg);
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ return xfs_zone_validate_seq(zone, rtg, write_pointer);
+ default:
+ xfs_warn(mp, "zoned %u has unsupported type 0x%x.",
+ rtg_rgno(rtg), zone->type);
+ return false;
+ }
+}
diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h
new file mode 100644
index 000000000000..c4f1367b2cca
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_zones.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LIBXFS_ZONES_H
+#define _LIBXFS_ZONES_H
+
+struct xfs_rtgroup;
+
+/*
+ * In order to guarantee forward progress for GC we need to reserve at least
+ * two zones: one that will be used for moving data into and one spare zone
+ * making sure that we have enough space to relocate a nearly-full zone.
+ * To allow for slightly sloppy accounting for when we need to reserve the
+ * second zone, we actually reserve three as that is easier than doing fully
+ * accurate bookkeeping.
+ */
+#define XFS_GC_ZONES 3U
+
+/*
+ * In addition we need two zones for user writes, one open zone for writing
+ * and one to still have available blocks without resetting the open zone
+ * when data in the open zone has been freed.
+ */
+#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1)
+#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1)
+
+/*
+ * Always keep one zone out of the general open zone pool to allow for GC to
+ * happen while other writers are waiting for free space.
+ */
+#define XFS_OPEN_GC_ZONES 1U
+#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U)
+
+bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg,
+ xfs_rgblock_t *write_pointer);
+
+#endif /* _LIBXFS_ZONES_H */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 9f8c312dfd3c..303374df44bd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -69,6 +69,8 @@ STATIC size_t
xchk_superblock_ondisk_size(
struct xfs_mount *mp)
{
+ if (xfs_has_zoned(mp))
+ return offsetofend(struct xfs_dsb, sb_rtreserved);
if (xfs_has_metadir(mp))
return offsetofend(struct xfs_dsb, sb_pad);
if (xfs_has_metauuid(mp))
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 66da7d4d56ba..4f1e2574660d 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -1038,8 +1038,8 @@ xchk_bmap(
switch (whichfork) {
case XFS_COW_FORK:
- /* No CoW forks on non-reflink filesystems. */
- if (!xfs_has_reflink(mp)) {
+ /* No CoW forks filesystem doesn't support out of place writes */
+ if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
return 0;
}
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index fe678a0438bc..cd6f0ff382a7 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -306,7 +306,7 @@ xchk_btree_block_check_sibling(
if (pbp)
xchk_buffer_recheck(bs->sc, pbp);
- if (xfs_btree_diff_two_ptrs(cur, pp, sibling))
+ if (xfs_btree_cmp_two_ptrs(cur, pp, sibling))
xchk_btree_set_corrupt(bs->sc, cur, level);
out:
xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 28ad341df8ee..2ef7742be7d3 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -866,11 +866,11 @@ xchk_trans_cancel(
sc->tp = NULL;
}
-int
+void
xchk_trans_alloc_empty(
struct xfs_scrub *sc)
{
- return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+ sc->tp = xfs_trans_alloc_empty(sc->mp);
}
/*
@@ -892,7 +892,8 @@ xchk_trans_alloc(
return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
resblks, 0, 0, &sc->tp);
- return xchk_trans_alloc_empty(sc);
+ xchk_trans_alloc_empty(sc);
+ return 0;
}
/* Set us up with a transaction and an empty context. */
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index bdcd40f0ec74..ddbc065c798c 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -7,7 +7,7 @@
#define __XFS_SCRUB_COMMON_H__
int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
-int xchk_trans_alloc_empty(struct xfs_scrub *sc);
+void xchk_trans_alloc_empty(struct xfs_scrub *sc);
void xchk_trans_cancel(struct xfs_scrub *sc);
bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -224,7 +224,6 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
bool xchk_dir_looks_zapped(struct xfs_inode *dp);
bool xchk_pptr_looks_zapped(struct xfs_inode *ip);
-#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
{
@@ -244,10 +243,6 @@ static inline bool xchk_could_repair(const struct xfs_scrub *sc)
return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
!(sc->flags & XREP_ALREADY_FIXED);
}
-#else
-# define xchk_needs_repair(sc) (false)
-# define xchk_could_repair(sc) (false)
-#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/dir_repair.c b/fs/xfs/scrub/dir_repair.c
index 249313882108..8d3b550990b5 100644
--- a/fs/xfs/scrub/dir_repair.c
+++ b/fs/xfs/scrub/dir_repair.c
@@ -1289,9 +1289,7 @@ xrep_dir_scan_dirtree(
if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
XFS_ILOCK_EXCL));
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
bool flush;
@@ -1317,9 +1315,7 @@ xrep_dir_scan_dirtree(
if (error)
break;
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
if (xchk_should_terminate(sc, &error))
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index ca23cf4db6c5..cebd0d526926 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -123,7 +123,7 @@ xchk_fsfreeze(
{
int error;
- error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsfreeze(sc, error);
return error;
}
@@ -135,7 +135,7 @@ xchk_fsthaw(
int error;
/* This should always succeed, we have a kernel freeze */
- error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL);
trace_xchk_fsthaw(sc, error);
return error;
}
@@ -237,7 +237,8 @@ xchk_setup_fscounters(
return error;
}
- return xchk_trans_alloc_empty(sc);
+ xchk_trans_alloc_empty(sc);
+ return 0;
}
/*
@@ -350,7 +351,7 @@ retry:
* The global incore space reservation is taken from the incore
* counters, so leave that out of the computation.
*/
- fsc->fdblocks -= mp->m_resblks_avail;
+ fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail;
/*
* Delayed allocation reservations are taken out of the incore counters
@@ -413,7 +414,13 @@ xchk_fscount_count_frextents(
fsc->frextents = 0;
fsc->frextents_delayed = 0;
- if (!xfs_has_realtime(mp))
+
+ /*
+ * Don't bother verifying and repairing the fs counters for zoned file
+ * systems as they don't track an on-disk frextents count, and the
+ * in-memory percpu counter also includes reservations.
+ */
+ if (!xfs_has_realtime(mp) || xfs_has_zoned(mp))
return 0;
while ((rtg = xfs_rtgroup_next(mp, rtg))) {
@@ -513,8 +520,8 @@ xchk_fscounters(
/* Snapshot the percpu counters. */
icount = percpu_counter_sum(&mp->m_icount);
ifree = percpu_counter_sum(&mp->m_ifree);
- fdblocks = percpu_counter_sum(&mp->m_fdblocks);
- frextents = percpu_counter_sum(&mp->m_frextents);
+ fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS);
/* No negative values, please! */
if (icount < 0 || ifree < 0)
@@ -589,15 +596,17 @@ xchk_fscounters(
try_again = true;
}
- if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
- fsc->fdblocks)) {
+ if (!xchk_fscount_within_range(sc, fdblocks,
+ &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
else
try_again = true;
}
- if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
+ if (!xfs_has_zoned(mp) &&
+ !xchk_fscount_within_range(sc, frextents,
+ &mp->m_free[XC_FREE_RTEXTENTS].count,
fsc->frextents - fsc->frextents_delayed)) {
if (fsc->frozen)
xchk_set_corrupt(sc);
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
index cda13447a373..f0d2b04644e4 100644
--- a/fs/xfs/scrub/fscounters_repair.c
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -64,7 +64,7 @@ xrep_fscounters(
percpu_counter_set(&mp->m_icount, fsc->icount);
percpu_counter_set(&mp->m_ifree, fsc->ifree);
- percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks);
/*
* Online repair is only supported on v5 file systems, which require
@@ -74,10 +74,12 @@ xrep_fscounters(
* track of the delalloc reservations separately, as they are are
* subtracted from m_frextents, but not included in sb_frextents.
*/
- percpu_counter_set(&mp->m_frextents,
- fsc->frextents - fsc->frextents_delayed);
- if (!xfs_has_rtgroups(mp))
- mp->m_sb.sb_frextents = fsc->frextents;
+ if (!xfs_has_zoned(mp)) {
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ fsc->frextents - fsc->frextents_delayed);
+ if (!xfs_has_rtgroups(mp))
+ mp->m_sb.sb_frextents = fsc->frextents;
+ }
return 0;
}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index db6edd5a5fe5..bb3f475b6353 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -273,6 +273,13 @@ xchk_inode_cowextsize(
xfs_failaddr_t fa;
uint32_t value = be32_to_cpu(dip->di_cowextsize);
+ /*
+ * The used block counter for rtrmap is checked and repaired elsewhere.
+ */
+ if (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))
+ return;
+
fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2);
if (fa)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 2f641b6d663e..a90a011c7e5f 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -710,7 +710,9 @@ xrep_dinode_extsize_hints(
XFS_DIFLAG_EXTSZINHERIT);
}
- if (dip->di_version < 3)
+ if (dip->di_version < 3 ||
+ (xfs_has_zoned(sc->mp) &&
+ dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)))
return;
fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
@@ -1055,9 +1057,17 @@ xrep_dinode_check_dfork(
return true;
break;
case S_IFREG:
- if (fmt == XFS_DINODE_FMT_LOCAL)
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
return true;
- fallthrough;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ case XFS_DINODE_FMT_META_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
case S_IFLNK:
case S_IFDIR:
switch (fmt) {
@@ -1550,8 +1560,7 @@ xrep_dinode_core(
/* Read the inode cluster buffer. */
error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
- ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
- NULL);
+ ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL);
if (error)
return error;
diff --git a/fs/xfs/scrub/metapath.c b/fs/xfs/scrub/metapath.c
index e21c16fbd15d..14939d7de349 100644
--- a/fs/xfs/scrub/metapath.c
+++ b/fs/xfs/scrub/metapath.c
@@ -318,9 +318,7 @@ xchk_metapath(
return 0;
}
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
error = xchk_metapath_ilock_both(mpath);
if (error)
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index ac38f5843090..1588ce971cb8 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -62,7 +62,7 @@ xrep_newbt_estimate_slack(
free = sc->sa.pag->pagf_freeblks;
sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag));
} else {
- free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS);
sz = sc->mp->m_sb.sb_dblocks;
}
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
index 4a47d0aabf73..26721fab5cab 100644
--- a/fs/xfs/scrub/nlinks.c
+++ b/fs/xfs/scrub/nlinks.c
@@ -555,9 +555,7 @@ xchk_nlinks_collect(
* do not take sb_internal.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
if (S_ISDIR(VFS_I(ip)->i_mode))
@@ -880,9 +878,7 @@ xchk_nlinks_compare(
* inactivation workqueue.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
/*
* Use the inobt to walk all allocated inodes to compare the link
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
index 4ebdee095428..6ef2ee9c3814 100644
--- a/fs/xfs/scrub/nlinks_repair.c
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -340,9 +340,7 @@ xrep_nlinks(
* We can only push the inactivation workqueues with an empty
* transaction.
*/
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
xchk_iscan_iter_finish(&xnc->compare_iscan);
xchk_iscan_teardown(&xnc->compare_iscan);
diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c
index c287c755f2c5..9c12cb844231 100644
--- a/fs/xfs/scrub/orphanage.c
+++ b/fs/xfs/scrub/orphanage.c
@@ -153,8 +153,7 @@ xrep_orphanage_create(
/* Try to find the orphanage directory. */
inode_lock_nested(root_inode, I_MUTEX_PARENT);
- orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry,
- strlen(ORPHANAGE));
+ orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry);
if (IS_ERR(orphanage_dentry)) {
error = PTR_ERR(orphanage_dentry);
goto out_unlock_root;
@@ -167,10 +166,11 @@ xrep_orphanage_create(
* directory to control access to a file we put in here.
*/
if (d_really_is_negative(orphanage_dentry)) {
- error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry,
- 0750);
- if (error)
- goto out_dput_orphanage;
+ orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode,
+ orphanage_dentry, 0750);
+ error = PTR_ERR(orphanage_dentry);
+ if (IS_ERR(orphanage_dentry))
+ goto out_unlock_root;
}
/* Not a directory? Bail out. */
@@ -444,7 +444,7 @@ xrep_adoption_check_dcache(
if (!d_orphanage)
return 0;
- d_child = d_hash_and_lookup(d_orphanage, &qname);
+ d_child = try_lookup_noperm(&qname, d_orphanage);
if (d_child) {
trace_xrep_adoption_check_child(sc->mp, d_child);
@@ -481,7 +481,7 @@ xrep_adoption_zap_dcache(
if (!d_orphanage)
return;
- d_child = d_hash_and_lookup(d_orphanage, &qname);
+ d_child = try_lookup_noperm(&qname, d_orphanage);
while (d_child != NULL) {
trace_xrep_adoption_invalidate_child(sc->mp, d_child);
diff --git a/fs/xfs/scrub/parent_repair.c b/fs/xfs/scrub/parent_repair.c
index 31bfe10be22a..2949feda6271 100644
--- a/fs/xfs/scrub/parent_repair.c
+++ b/fs/xfs/scrub/parent_repair.c
@@ -569,9 +569,7 @@ xrep_parent_scan_dirtree(
if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
XFS_ILOCK_EXCL));
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rp->pscan.iscan, &ip)) == 1) {
bool flush;
@@ -597,9 +595,7 @@ xrep_parent_scan_dirtree(
if (error)
break;
- error = xchk_trans_alloc_empty(sc);
- if (error)
- break;
+ xchk_trans_alloc_empty(sc);
}
if (xchk_should_terminate(sc, &error))
@@ -1099,9 +1095,7 @@ xrep_parent_flush_xattrs(
xrep_tempfile_iounlock(rp->sc);
/* Recreate the empty transaction and relock the inode. */
- error = xchk_trans_alloc_empty(rp->sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(rp->sc);
xchk_ilock(rp->sc, XFS_ILOCK_EXCL);
return 0;
}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
index dc4033b91e44..e4105aaafe84 100644
--- a/fs/xfs/scrub/quotacheck.c
+++ b/fs/xfs/scrub/quotacheck.c
@@ -505,9 +505,7 @@ xqcheck_collect_counts(
* transactions do not take sb_internal.
*/
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
error = xqcheck_collect_inode(xqc, ip);
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
index 709356dc6256..9a4ef823c5a7 100644
--- a/fs/xfs/scrub/rcbag_btree.c
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -47,29 +47,20 @@ rcbagbt_init_rec_from_cur(
bag_rec->rbg_refcount = bag_irec->rbg_refcount;
}
-STATIC int64_t
-rcbagbt_key_diff(
+STATIC int
+rcbagbt_cmp_key_with_cur(
struct xfs_btree_cur *cur,
const union xfs_btree_key *key)
{
struct rcbag_rec *rec = (struct rcbag_rec *)&cur->bc_rec;
const struct rcbag_key *kp = (const struct rcbag_key *)key;
- if (kp->rbg_startblock > rec->rbg_startblock)
- return 1;
- if (kp->rbg_startblock < rec->rbg_startblock)
- return -1;
-
- if (kp->rbg_blockcount > rec->rbg_blockcount)
- return 1;
- if (kp->rbg_blockcount < rec->rbg_blockcount)
- return -1;
-
- return 0;
+ return cmp_int(kp->rbg_startblock, rec->rbg_startblock) ?:
+ cmp_int(kp->rbg_blockcount, rec->rbg_blockcount);
}
-STATIC int64_t
-rcbagbt_diff_two_keys(
+STATIC int
+rcbagbt_cmp_two_keys(
struct xfs_btree_cur *cur,
const union xfs_btree_key *k1,
const union xfs_btree_key *k2,
@@ -80,17 +71,8 @@ rcbagbt_diff_two_keys(
ASSERT(mask == NULL);
- if (kp1->rbg_startblock > kp2->rbg_startblock)
- return 1;
- if (kp1->rbg_startblock < kp2->rbg_startblock)
- return -1;
-
- if (kp1->rbg_blockcount > kp2->rbg_blockcount)
- return 1;
- if (kp1->rbg_blockcount < kp2->rbg_blockcount)
- return -1;
-
- return 0;
+ return cmp_int(kp1->rbg_startblock, kp2->rbg_startblock) ?:
+ cmp_int(kp1->rbg_blockcount, kp2->rbg_blockcount);
}
STATIC int
@@ -201,9 +183,9 @@ static const struct xfs_btree_ops rcbagbt_mem_ops = {
.init_key_from_rec = rcbagbt_init_key_from_rec,
.init_rec_from_cur = rcbagbt_init_rec_from_cur,
.init_ptr_from_cur = xfbtree_init_ptr_from_cur,
- .key_diff = rcbagbt_key_diff,
+ .cmp_key_with_cur = rcbagbt_cmp_key_with_cur,
.buf_ops = &rcbagbt_mem_buf_ops,
- .diff_two_keys = rcbagbt_diff_two_keys,
+ .cmp_two_keys = rcbagbt_cmp_two_keys,
.keys_inorder = rcbagbt_keys_inorder,
.recs_inorder = rcbagbt_recs_inorder,
};
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index b32fb233cf84..8703897c0a9c 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks(
if (error)
return error;
- if (xreap_dirty(&rs))
- return xrep_defer_finish(sc);
+ if (xreap_dirty(&rs)) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
- return 0;
+ return xrep_reset_metafile_resv(sc);
}
/*
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 3b5288d3ef4e..d00c18954a26 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -43,6 +43,7 @@
#include "xfs_rtalloc.h"
#include "xfs_metafile.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse(
xfs_rtxnum_t startrtx;
xfs_rtxnum_t endrtx;
bool is_free = false;
- int error;
+ int error = 0;
+
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1))
+ return -EFSCORRUPTED;
+ return 0;
+ }
startrtx = xfs_rgbno_to_rtx(mp, rgbno);
endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1);
@@ -1262,42 +1269,6 @@ xrep_setup_xfbtree(
}
/*
- * Create a dummy transaction for use in a live update hook function. This
- * function MUST NOT be called from regular repair code because the current
- * process' transaction is saved via the cookie.
- */
-int
-xrep_trans_alloc_hook_dummy(
- struct xfs_mount *mp,
- void **cookiep,
- struct xfs_trans **tpp)
-{
- int error;
-
- *cookiep = current->journal_info;
- current->journal_info = NULL;
-
- error = xfs_trans_alloc_empty(mp, tpp);
- if (!error)
- return 0;
-
- current->journal_info = *cookiep;
- *cookiep = NULL;
- return error;
-}
-
-/* Cancel a dummy transaction used by a live update hook function. */
-void
-xrep_trans_cancel_hook_dummy(
- void **cookiep,
- struct xfs_trans *tp)
-{
- xfs_trans_cancel(tp);
- current->journal_info = *cookiep;
- *cookiep = NULL;
-}
-
-/*
* See if this buffer can pass the given ->verify_struct() function.
*
* If the buffer already has ops attached and they're not the ones that were
@@ -1386,11 +1357,12 @@ int
xrep_reset_metafile_resv(
struct xfs_scrub *sc)
{
- struct xfs_inode *ip = sc->ip;
+ struct xfs_mount *mp = sc->mp;
int64_t delta;
int error;
- delta = ip->i_nblocks + ip->i_delayed_blks - ip->i_meta_resv_asked;
+ delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail -
+ mp->m_metafile_resv_target;
if (delta == 0)
return 0;
@@ -1401,11 +1373,11 @@ xrep_reset_metafile_resv(
if (delta > 0) {
int64_t give_back;
- give_back = min_t(uint64_t, delta, ip->i_delayed_blks);
+ give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail);
if (give_back > 0) {
- xfs_mod_delalloc(ip, 0, -give_back);
- xfs_add_fdblocks(ip->i_mount, give_back);
- ip->i_delayed_blks -= give_back;
+ xfs_mod_sb_delalloc(mp, -give_back);
+ xfs_add_fdblocks(mp, give_back);
+ mp->m_metafile_resv_avail -= give_back;
}
return 0;
@@ -1413,24 +1385,23 @@ xrep_reset_metafile_resv(
/*
* Not enough reservation; try to take some blocks from the filesystem
- * to the metadata inode. @delta is negative here, so invert the sign.
+ * to the metabtree reservation.
*/
- delta = -delta;
- error = xfs_dec_fdblocks(sc->mp, delta, true);
+ delta = -delta; /* delta is negative here, so invert the sign. */
+ error = xfs_dec_fdblocks(mp, delta, true);
while (error == -ENOSPC) {
delta--;
if (delta == 0) {
xfs_warn(sc->mp,
-"Insufficient free space to reset space reservation for inode 0x%llx after repair.",
- ip->i_ino);
+"Insufficient free space to reset metabtree reservation after repair.");
return 0;
}
- error = xfs_dec_fdblocks(sc->mp, delta, true);
+ error = xfs_dec_fdblocks(mp, delta, true);
}
if (error)
return error;
- xfs_mod_delalloc(ip, 0, delta);
- ip->i_delayed_blks += delta;
+ xfs_mod_sb_delalloc(mp, delta);
+ mp->m_metafile_resv_avail += delta;
return 0;
}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 823c00d1a502..9c04295742c8 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -180,10 +180,6 @@ int xrep_quotacheck(struct xfs_scrub *sc);
int xrep_reinit_pagf(struct xfs_scrub *sc);
int xrep_reinit_pagi(struct xfs_scrub *sc);
-int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
- struct xfs_trans **tpp);
-void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
-
bool xrep_buf_verify_struct(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
void xrep_inode_set_nblocks(struct xfs_scrub *sc, int64_t new_blocks);
int xrep_reset_metafile_resv(struct xfs_scrub *sc);
@@ -191,7 +187,16 @@ int xrep_reset_metafile_resv(struct xfs_scrub *sc);
#else
#define xrep_ino_dqattach(sc) (0)
-#define xrep_will_attempt(sc) (false)
+
+/*
+ * When online repair is not built into the kernel, we still want to attempt
+ * the repair so that the stub xrep_attempt below will return EOPNOTSUPP.
+ */
+static inline bool xrep_will_attempt(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ xchk_needs_repair(sc->sm);
+}
static inline int
xrep_attempt(
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
index f5f73078ffe2..17d4a38d735c 100644
--- a/fs/xfs/scrub/rmap_repair.c
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -951,9 +951,7 @@ end_agscan:
sa->agf_bp = NULL;
sa->agi_bp = NULL;
xchk_trans_cancel(sc);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
/* Iterate all AGs for inodes rmaps. */
while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
@@ -1612,7 +1610,6 @@ xrep_rmapbt_live_update(
struct xfs_mount *mp;
struct xfs_btree_cur *mcur;
struct xfs_trans *tp;
- void *txcookie;
int error;
rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
@@ -1623,9 +1620,7 @@ xrep_rmapbt_live_update(
trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
- error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
- if (error)
- goto out_abort;
+ tp = xfs_trans_alloc_empty(mp);
mutex_lock(&rr->lock);
mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
@@ -1639,14 +1634,13 @@ xrep_rmapbt_live_update(
if (error)
goto out_cancel;
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
return NOTIFY_DONE;
out_cancel:
xfbtree_trans_cancel(&rr->rmap_btree, tp);
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
-out_abort:
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
xchk_iscan_abort(&rr->iscan);
out_unlock:
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index e8c776a34c1d..d5ff8609dbfb 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -21,6 +21,7 @@
#include "xfs_rmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_exchmaps.h"
+#include "xfs_zone_alloc.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/repair.h"
@@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space(
xfs_extlen_t len)
{
struct xfs_rtgroup *rtg = sc->sr.rtg;
- struct xfs_inode *rbmip = rtg_bitmap(rtg);
xfs_rtxnum_t startext;
xfs_rtxnum_t endext;
bool is_free;
@@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space(
if (xchk_skip_xref(sc->sm))
return;
+ if (xfs_has_zoned(sc->mp)) {
+ if (!xfs_zone_rgbno_is_valid(rtg,
+ xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1))
+ xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino);
+ return;
+ }
+
startext = xfs_rtb_to_rtx(sc->mp, rtbno);
endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1);
error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext,
@@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space(
if (!xchk_should_check_xref(sc, &error, NULL))
return;
if (is_free)
- xchk_ino_xref_set_corrupt(sc, rbmip->i_ino);
+ xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino);
}
diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c
index 257cfb24beb4..983362447826 100644
--- a/fs/xfs/scrub/rtrefcount_repair.c
+++ b/fs/xfs/scrub/rtrefcount_repair.c
@@ -697,32 +697,6 @@ err_cur:
return error;
}
-/*
- * Now that we've logged the roots of the new btrees, invalidate all of the
- * old blocks and free them.
- */
-STATIC int
-xrep_rtrefc_remove_old_tree(
- struct xrep_rtrefc *rr)
-{
- int error;
-
- /*
- * Free all the extents that were allocated to the former rtrefcountbt
- * and aren't cross-linked with something else.
- */
- error = xrep_reap_metadir_fsblocks(rr->sc,
- &rr->old_rtrefcountbt_blocks);
- if (error)
- return error;
-
- /*
- * Ensure the proper reservation for the rtrefcount inode so that we
- * don't fail to expand the btree.
- */
- return xrep_reset_metafile_resv(rr->sc);
-}
-
/* Rebuild the rt refcount btree. */
int
xrep_rtrefcountbt(
@@ -769,8 +743,12 @@ xrep_rtrefcountbt(
if (error)
goto out_bitmap;
- /* Kill the old tree. */
- error = xrep_rtrefc_remove_old_tree(rr);
+ /*
+ * Free all the extents that were allocated to the former rtrefcountbt
+ * and aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc,
+ &rr->old_rtrefcountbt_blocks);
if (error)
goto out_bitmap;
diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c
index f2fdd7a9fc24..7561941a337a 100644
--- a/fs/xfs/scrub/rtrmap_repair.c
+++ b/fs/xfs/scrub/rtrmap_repair.c
@@ -580,9 +580,7 @@ xrep_rtrmap_find_rmaps(
*/
xchk_trans_cancel(sc);
xchk_rtgroup_unlock(&sc->sr);
- error = xchk_trans_alloc_empty(sc);
- if (error)
- return error;
+ xchk_trans_alloc_empty(sc);
while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
error = xrep_rtrmap_scan_inode(rr, ip);
@@ -810,28 +808,6 @@ err_cur:
/* Reaping the old btree. */
-/* Reap the old rtrmapbt blocks. */
-STATIC int
-xrep_rtrmap_remove_old_tree(
- struct xrep_rtrmap *rr)
-{
- int error;
-
- /*
- * Free all the extents that were allocated to the former rtrmapbt and
- * aren't cross-linked with something else.
- */
- error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
- if (error)
- return error;
-
- /*
- * Ensure the proper reservation for the rtrmap inode so that we don't
- * fail to expand the new btree.
- */
- return xrep_reset_metafile_resv(rr->sc);
-}
-
static inline bool
xrep_rtrmapbt_want_live_update(
struct xchk_iscan *iscan,
@@ -868,7 +844,6 @@ xrep_rtrmapbt_live_update(
struct xfs_mount *mp;
struct xfs_btree_cur *mcur;
struct xfs_trans *tp;
- void *txcookie;
int error;
rr = container_of(nb, struct xrep_rtrmap, rhook.rmap_hook.nb);
@@ -879,9 +854,7 @@ xrep_rtrmapbt_live_update(
trace_xrep_rmap_live_update(rtg_group(rr->sc->sr.rtg), action, p);
- error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
- if (error)
- goto out_abort;
+ tp = xfs_trans_alloc_empty(mp);
mutex_lock(&rr->lock);
mcur = xfs_rtrmapbt_mem_cursor(rr->sc->sr.rtg, tp, &rr->rtrmap_btree);
@@ -895,14 +868,13 @@ xrep_rtrmapbt_live_update(
if (error)
goto out_cancel;
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
+ xfs_trans_cancel(tp);
mutex_unlock(&rr->lock);
return NOTIFY_DONE;
out_cancel:
xfbtree_trans_cancel(&rr->rtrmap_btree, tp);
- xrep_trans_cancel_hook_dummy(&txcookie, tp);
-out_abort:
+ xfs_trans_cancel(tp);
xchk_iscan_abort(&rr->iscan);
mutex_unlock(&rr->lock);
out_unlock:
@@ -995,8 +967,11 @@ xrep_rtrmapbt(
if (error)
goto out_records;
- /* Kill the old tree. */
- error = xrep_rtrmap_remove_old_tree(rr);
+ /*
+ * Free all the extents that were allocated to the former rtrmapbt and
+ * aren't cross-linked with something else.
+ */
+ error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks);
if (error)
goto out_records;
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 7567dd5cad14..3c3b0d25006f 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -149,6 +149,18 @@ xchk_probe(
if (xchk_should_terminate(sc, &error))
return error;
+ /*
+ * If the caller is probing to see if repair works but repair isn't
+ * built into the kernel, return EOPNOTSUPP because that's the signal
+ * that userspace expects. If online repair is built in, set the
+ * CORRUPT flag (without any of the usual tracing/logging) to force us
+ * into xrep_probe.
+ */
+ if (xchk_could_repair(sc)) {
+ if (!IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR))
+ return -EOPNOTSUPP;
+ sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+ }
return 0;
}
@@ -387,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
},
[XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
.repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_RTGROUP,
+ .has = xfs_has_nonzoned,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
.repair = xrep_rtsummary,
@@ -666,8 +680,6 @@ xfs_scrub_metadata(
if (error)
goto out;
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB);
-
sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS);
if (!sc) {
error = -ENOMEM;
@@ -864,10 +876,7 @@ xchk_scrubv_open_by_handle(
struct xfs_inode *ip;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return NULL;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_iget(mp, tp, head->svh_ino, XCHK_IGET_FLAGS, 0, &ip);
xfs_trans_cancel(tp);
if (error)
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index d7c4ced47c15..a8187281eb96 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class,
__field(xfs_exntst_t, state)
),
TP_fast_assign(
- __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dev = cursor->sc->mp->m_super->s_dev;
__entry->dqtype = cursor->dqtype;
__entry->ino = cursor->quota_ip->i_ino;
__entry->cur_id = cursor->id;
@@ -2996,7 +2996,7 @@ DEFINE_EVENT(xrep_pptr_salvage_class, name, \
DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_salvage_pptr);
DEFINE_XREP_PPTR_SALVAGE_EVENT(xrep_xattr_insert_pptr);
-TRACE_EVENT(xrep_xattr_class,
+DECLARE_EVENT_CLASS(xrep_xattr_class,
TP_PROTO(struct xfs_inode *ip, struct xfs_inode *arg_ip),
TP_ARGS(ip, arg_ip),
TP_STRUCT__entry(
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 67877c36ed11..1ee4f835ac3c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * Copyright (c) 2016-2018 Christoph Hellwig.
+ * Copyright (c) 2016-2025 Christoph Hellwig.
* All Rights Reserved.
*/
#include "xfs.h"
@@ -19,6 +19,9 @@
#include "xfs_reflink.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_rtgroup.h"
struct xfs_writepage_ctx {
struct iomap_writepage_ctx ctx;
@@ -76,6 +79,26 @@ xfs_setfilesize(
return xfs_trans_commit(tp);
}
+static void
+xfs_ioend_put_open_zones(
+ struct iomap_ioend *ioend)
+{
+ struct iomap_ioend *tmp;
+
+ /*
+ * Put the open zone for all ioends merged into this one (if any).
+ */
+ list_for_each_entry(tmp, &ioend->io_list, io_list)
+ xfs_open_zone_put(tmp->io_private);
+
+ /*
+ * The main ioend might not have an open zone if the submission failed
+ * before xfs_zone_alloc_and_submit got called.
+ */
+ if (ioend->io_private)
+ xfs_open_zone_put(ioend->io_private);
+}
+
/*
* IO write completion.
*/
@@ -85,6 +108,7 @@ xfs_end_ioend(
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
+ bool is_zoned = xfs_is_zoned_inode(ip);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -114,10 +138,11 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
+ ASSERT(!is_zoned);
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
- offset + size);
+ offset + size, NULL);
}
goto done;
}
@@ -125,14 +150,21 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (is_zoned)
+ error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector,
+ ioend->io_private, NULLFSBLOCK);
+ else if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
- if (!error && xfs_ioend_is_append(ioend))
+ if (!error &&
+ !(ioend->io_flags & IOMAP_IOEND_DIRECT) &&
+ xfs_ioend_is_append(ioend))
error = xfs_setfilesize(ip, offset, size);
done:
+ if (is_zoned)
+ xfs_ioend_put_open_zones(ioend);
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
@@ -175,23 +207,74 @@ xfs_end_io(
}
}
-STATIC void
+void
xfs_end_bio(
struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
+ /*
+ * For Appends record the actually written block number and set the
+ * boundary flag if needed.
+ */
+ if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) {
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ xfs_mark_rtg_boundary(ioend);
+ }
+
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
- WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
+ WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
/*
+ * We cannot cancel the ioend directly on error. We may have already set other
+ * pages under writeback and hence we have to run I/O completion to mark the
+ * error state of the pages under writeback appropriately.
+ *
+ * If the folio has delalloc blocks on it, the caller is asking us to punch them
+ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
+ * page that needs to be dirtied again before the delalloc mapping can be
+ * converted. This stale delalloc mapping can trip up a later direct I/O read
+ * operation on the same region.
+ *
+ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+ * transaction as there is no space left for block reservation (typically why
+ * we see a ENOSPC in writeback).
+ */
+static void
+xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+{
+ struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+
+ if (xfs_is_shutdown(mp))
+ return;
+
+ xfs_alert_ratelimited(mp,
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+ /*
+ * The end of the punch range is always the offset of the first
+ * byte of the next folio. Hence the end offset is only dependent on the
+ * folio itself and not the start offset that is passed in.
+ */
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
+ folio_pos(folio) + folio_size(folio), NULL);
+}
+
+/*
* Fast revalidation of the cached writeback mapping. Return true if the current
* mapping is valid, false otherwise.
*/
@@ -236,13 +319,12 @@ xfs_imap_valid(
static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
- struct inode *inode,
loff_t offset,
unsigned int len)
{
- struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
- ssize_t count = i_blocksize(inode);
+ ssize_t count = i_blocksize(wpc->inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb;
@@ -394,76 +476,194 @@ allocate_blocks:
return 0;
}
+static ssize_t
+xfs_writeback_range(
+ struct iomap_writepage_ctx *wpc,
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
+{
+ ssize_t ret;
+
+ ret = xfs_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
+static bool
+xfs_ioend_needs_wq_completion(
+ struct iomap_ioend *ioend)
+{
+ /* Changing inode size requires a transaction. */
+ if (xfs_ioend_is_append(ioend))
+ return true;
+
+ /* Extent manipulation requires a transaction. */
+ if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))
+ return true;
+
+ /* Page cache invalidation cannot be done in irq context. */
+ if (ioend->io_flags & IOMAP_IOEND_DONTCACHE)
+ return true;
+
+ return false;
+}
+
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
- int status)
+xfs_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
{
- unsigned int nofs_flag;
+ struct iomap_ioend *ioend = wpc->wb_ctx;
/*
- * We can allocate memory here while doing writeback on behalf of
- * memory reclaim. To avoid memory allocation deadlocks set the
- * task-wide nofs context for the following operations.
+ * Convert CoW extents to regular.
+ *
+ * We can allocate memory here while doing writeback on behalf of memory
+ * reclaim. To avoid memory allocation deadlocks, set the task-wide
+ * nofs context.
*/
- nofs_flag = memalloc_nofs_save();
+ if (!error && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
+ unsigned int nofs_flag;
- /* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
- status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+ nofs_flag = memalloc_nofs_save();
+ error = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
+ memalloc_nofs_restore(nofs_flag);
}
- memalloc_nofs_restore(nofs_flag);
-
- /* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ /*
+ * Send ioends that might require a transaction to the completion wq.
+ */
+ if (xfs_ioend_needs_wq_completion(ioend))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ return iomap_ioend_writeback_submit(wpc, error);
}
-/*
- * If the folio has delalloc blocks on it, the caller is asking us to punch them
- * out. If we don't, we can leave a stale delalloc mapping covered by a clean
- * page that needs to be dirtied again before the delalloc mapping can be
- * converted. This stale delalloc mapping can trip up a later direct I/O read
- * operation on the same region.
- *
- * We prevent this by truncating away the delalloc regions on the folio. Because
- * they are delalloc, we can do this without needing a transaction. Indeed - if
- * we get ENOSPC errors, we have to be able to do this truncation without a
- * transaction as there is no space left for block reservation (typically why
- * we see a ENOSPC in writeback).
- */
-static void
-xfs_discard_folio(
- struct folio *folio,
- loff_t pos)
+static const struct iomap_writeback_ops xfs_writeback_ops = {
+ .writeback_range = xfs_writeback_range,
+ .writeback_submit = xfs_writeback_submit,
+};
+
+struct xfs_zoned_writepage_ctx {
+ struct iomap_writepage_ctx ctx;
+ struct xfs_open_zone *open_zone;
+};
+
+static inline struct xfs_zoned_writepage_ctx *
+XFS_ZWPC(struct iomap_writepage_ctx *ctx)
{
- struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx);
+}
+
+static int
+xfs_zoned_map_blocks(
+ struct iomap_writepage_ctx *wpc,
+ loff_t offset,
+ unsigned int len)
+{
+ struct xfs_inode *ip = XFS_I(wpc->inode);
struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len);
+ xfs_filblks_t count_fsb;
+ struct xfs_bmbt_irec imap, del;
+ struct xfs_iext_cursor icur;
if (xfs_is_shutdown(mp))
- return;
+ return -EIO;
- xfs_alert_ratelimited(mp,
- "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
- folio, ip->i_ino, pos);
+ XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS);
/*
- * The end of the punch range is always the offset of the first
- * byte of the next folio. Hence the end offset is only dependent on the
- * folio itself and not the start offset that is passed in.
+ * All dirty data must be covered by delalloc extents. But truncate can
+ * remove delalloc extents underneath us or reduce their size.
+ * Returning a hole tells iomap to not write back any data from this
+ * range, which is the right thing to do in that case.
+ *
+ * Otherwise just tell iomap to treat ranges previously covered by a
+ * delalloc extent as mapped. The actual block allocation will be done
+ * just before submitting the bio.
+ *
+ * This implies we never map outside folios that are locked or marked
+ * as under writeback, and thus there is no need check the fork sequence
+ * count here.
*/
- xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
- folio_pos(folio) + folio_size(folio));
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
+ imap.br_startoff = end_fsb; /* fake a hole past EOF */
+ if (imap.br_startoff > offset_fsb) {
+ imap.br_blockcount = imap.br_startoff - offset_fsb;
+ imap.br_startoff = offset_fsb;
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0);
+ return 0;
+ }
+ end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount);
+ count_fsb = end_fsb - offset_fsb;
+
+ del = imap;
+ xfs_trim_extent(&del, offset_fsb, count_fsb);
+ xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del,
+ XFS_BMAPI_REMAP);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ wpc->iomap.type = IOMAP_MAPPED;
+ wpc->iomap.flags = IOMAP_F_DIRTY;
+ wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev;
+ wpc->iomap.offset = offset;
+ wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb);
+ wpc->iomap.flags = IOMAP_F_ANON_WRITE;
+
+ trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length);
+ return 0;
}
-static const struct iomap_writeback_ops xfs_writeback_ops = {
- .map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
- .discard_folio = xfs_discard_folio,
+static ssize_t
+xfs_zoned_writeback_range(
+ struct iomap_writepage_ctx *wpc,
+ struct folio *folio,
+ u64 offset,
+ unsigned int len,
+ u64 end_pos)
+{
+ ssize_t ret;
+
+ ret = xfs_zoned_map_blocks(wpc, offset, len);
+ if (!ret)
+ ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+ if (ret < 0)
+ xfs_discard_folio(folio, offset);
+ return ret;
+}
+
+static int
+xfs_zoned_writeback_submit(
+ struct iomap_writepage_ctx *wpc,
+ int error)
+{
+ struct iomap_ioend *ioend = wpc->wb_ctx;
+
+ ioend->io_bio.bi_end_io = xfs_end_bio;
+ if (error) {
+ ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&ioend->io_bio);
+ return error;
+ }
+ xfs_zone_alloc_and_submit(ioend, &XFS_ZWPC(wpc)->open_zone);
+ return 0;
+}
+
+static const struct iomap_writeback_ops xfs_zoned_writeback_ops = {
+ .writeback_range = xfs_zoned_writeback_range,
+ .writeback_submit = xfs_zoned_writeback_submit,
};
STATIC int
@@ -471,10 +671,35 @@ xfs_vm_writepages(
struct address_space *mapping,
struct writeback_control *wbc)
{
- struct xfs_writepage_ctx wpc = { };
+ struct xfs_inode *ip = XFS_I(mapping->host);
- xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
- return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
+ xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+ if (xfs_is_zoned_inode(ip)) {
+ struct xfs_zoned_writepage_ctx xc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_zoned_writeback_ops
+ },
+ };
+ int error;
+
+ error = iomap_writepages(&xc.ctx);
+ if (xc.open_zone)
+ xfs_open_zone_put(xc.open_zone);
+ return error;
+ } else {
+ struct xfs_writepage_ctx wpc = {
+ .ctx = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &xfs_writeback_ops
+ },
+ };
+
+ return iomap_writepages(&wpc.ctx);
+ }
}
STATIC int
@@ -528,12 +753,44 @@ xfs_vm_readahead(
}
static int
-xfs_iomap_swapfile_activate(
+xfs_vm_swap_activate(
struct swap_info_struct *sis,
struct file *swap_file,
sector_t *span)
{
- sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
+ struct xfs_inode *ip = XFS_I(file_inode(swap_file));
+
+ /*
+ * Swap file activation can race against concurrent shared extent
+ * removal in files that have been cloned. If this happens,
+ * iomap_swapfile_iter() can fail because it encountered a shared
+ * extent even though an operation is in progress to remove those
+ * shared extents.
+ *
+ * This race becomes problematic when we defer extent removal
+ * operations beyond the end of a syscall (i.e. use async background
+ * processing algorithms). Users think the extents are no longer
+ * shared, but iomap_swapfile_iter() still sees them as shared
+ * because the refcountbt entries for the extents being removed have
+ * not yet been updated. Hence the swapon call fails unexpectedly.
+ *
+ * The race condition is currently most obvious from the unlink()
+ * operation as extent removal is deferred until after the last
+ * reference to the inode goes away. We then process the extent
+ * removal asynchronously, hence triggers the "syscall completed but
+ * work not done" condition mentioned above. To close this race
+ * window, we need to flush any pending inodegc operations to ensure
+ * they have updated the refcountbt records before we try to map the
+ * swapfile.
+ */
+ xfs_inodegc_flush(ip->i_mount);
+
+ /*
+ * Direct the swap code to the correct block device when this file
+ * sits on the RT device.
+ */
+ sis->bdev = xfs_inode_buftarg(ip)->bt_bdev;
+
return iomap_swapfile_activate(sis, swap_file, span,
&xfs_read_iomap_ops);
}
@@ -549,11 +806,11 @@ const struct address_space_operations xfs_address_space_operations = {
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
.error_remove_folio = generic_error_remove_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
const struct address_space_operations xfs_dax_aops = {
.writepages = xfs_dax_writepages,
.dirty_folio = noop_dirty_folio,
- .swap_activate = xfs_iomap_swapfile_activate,
+ .swap_activate = xfs_vm_swap_activate,
};
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index e0bd68419764..5a7a0f1a0b49 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -9,6 +9,7 @@
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
-int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
+void xfs_end_bio(struct bio *bio);
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index f683b7a9323f..5eef3bc30bda 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -91,41 +91,37 @@ xfs_attri_log_nameval_alloc(
name_len + new_name_len + value_len +
new_value_len);
- nv->name.i_addr = nv + 1;
- nv->name.i_len = name_len;
- nv->name.i_type = XLOG_REG_TYPE_ATTR_NAME;
- memcpy(nv->name.i_addr, name, name_len);
+ nv->name.iov_base = nv + 1;
+ nv->name.iov_len = name_len;
+ memcpy(nv->name.iov_base, name, name_len);
if (new_name_len) {
- nv->new_name.i_addr = nv->name.i_addr + name_len;
- nv->new_name.i_len = new_name_len;
- memcpy(nv->new_name.i_addr, new_name, new_name_len);
+ nv->new_name.iov_base = nv->name.iov_base + name_len;
+ nv->new_name.iov_len = new_name_len;
+ memcpy(nv->new_name.iov_base, new_name, new_name_len);
} else {
- nv->new_name.i_addr = NULL;
- nv->new_name.i_len = 0;
+ nv->new_name.iov_base = NULL;
+ nv->new_name.iov_len = 0;
}
- nv->new_name.i_type = XLOG_REG_TYPE_ATTR_NEWNAME;
if (value_len) {
- nv->value.i_addr = nv->name.i_addr + name_len + new_name_len;
- nv->value.i_len = value_len;
- memcpy(nv->value.i_addr, value, value_len);
+ nv->value.iov_base = nv->name.iov_base + name_len + new_name_len;
+ nv->value.iov_len = value_len;
+ memcpy(nv->value.iov_base, value, value_len);
} else {
- nv->value.i_addr = NULL;
- nv->value.i_len = 0;
+ nv->value.iov_base = NULL;
+ nv->value.iov_len = 0;
}
- nv->value.i_type = XLOG_REG_TYPE_ATTR_VALUE;
if (new_value_len) {
- nv->new_value.i_addr = nv->name.i_addr + name_len +
+ nv->new_value.iov_base = nv->name.iov_base + name_len +
new_name_len + value_len;
- nv->new_value.i_len = new_value_len;
- memcpy(nv->new_value.i_addr, new_value, new_value_len);
+ nv->new_value.iov_len = new_value_len;
+ memcpy(nv->new_value.iov_base, new_value, new_value_len);
} else {
- nv->new_value.i_addr = NULL;
- nv->new_value.i_len = 0;
+ nv->new_value.iov_base = NULL;
+ nv->new_value.iov_len = 0;
}
- nv->new_value.i_type = XLOG_REG_TYPE_ATTR_NEWVALUE;
refcount_set(&nv->refcount, 1);
return nv;
@@ -170,21 +166,21 @@ xfs_attri_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_attri_log_format) +
- xlog_calc_iovec_len(nv->name.i_len);
+ xlog_calc_iovec_len(nv->name.iov_len);
- if (nv->new_name.i_len) {
+ if (nv->new_name.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->new_name.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->new_name.iov_len);
}
- if (nv->value.i_len) {
+ if (nv->value.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->value.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->value.iov_len);
}
- if (nv->new_value.i_len) {
+ if (nv->new_value.iov_len) {
*nvecs += 1;
- *nbytes += xlog_calc_iovec_len(nv->new_value.i_len);
+ *nbytes += xlog_calc_iovec_len(nv->new_value.iov_len);
}
}
@@ -212,31 +208,36 @@ xfs_attri_item_format(
* the log recovery.
*/
- ASSERT(nv->name.i_len > 0);
+ ASSERT(nv->name.iov_len > 0);
attrip->attri_format.alfi_size++;
- if (nv->new_name.i_len > 0)
+ if (nv->new_name.iov_len > 0)
attrip->attri_format.alfi_size++;
- if (nv->value.i_len > 0)
+ if (nv->value.iov_len > 0)
attrip->attri_format.alfi_size++;
- if (nv->new_value.i_len > 0)
+ if (nv->new_value.iov_len > 0)
attrip->attri_format.alfi_size++;
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTRI_FORMAT,
&attrip->attri_format,
sizeof(struct xfs_attri_log_format));
- xlog_copy_from_iovec(lv, &vecp, &nv->name);
- if (nv->new_name.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->new_name);
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME, nv->name.iov_base,
+ nv->name.iov_len);
- if (nv->value.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->value);
+ if (nv->new_name.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWNAME,
+ nv->new_name.iov_base, nv->new_name.iov_len);
- if (nv->new_value.i_len > 0)
- xlog_copy_from_iovec(lv, &vecp, &nv->new_value);
+ if (nv->value.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
+ nv->value.iov_base, nv->value.iov_len);
+
+ if (nv->new_value.iov_len > 0)
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NEWVALUE,
+ nv->new_value.iov_base, nv->new_value.iov_len);
}
/*
@@ -383,22 +384,22 @@ xfs_attr_log_item(
attrp->alfi_ino = args->dp->i_ino;
ASSERT(!(attr->xattri_op_flags & ~XFS_ATTRI_OP_FLAGS_TYPE_MASK));
attrp->alfi_op_flags = attr->xattri_op_flags;
- attrp->alfi_value_len = nv->value.i_len;
+ attrp->alfi_value_len = nv->value.iov_len;
switch (xfs_attr_log_item_op(attrp)) {
case XFS_ATTRI_OP_FLAGS_PPTR_REPLACE:
- ASSERT(nv->value.i_len == nv->new_value.i_len);
+ ASSERT(nv->value.iov_len == nv->new_value.iov_len);
attrp->alfi_igen = VFS_I(args->dp)->i_generation;
- attrp->alfi_old_name_len = nv->name.i_len;
- attrp->alfi_new_name_len = nv->new_name.i_len;
+ attrp->alfi_old_name_len = nv->name.iov_len;
+ attrp->alfi_new_name_len = nv->new_name.iov_len;
break;
case XFS_ATTRI_OP_FLAGS_PPTR_REMOVE:
case XFS_ATTRI_OP_FLAGS_PPTR_SET:
attrp->alfi_igen = VFS_I(args->dp)->i_generation;
fallthrough;
default:
- attrp->alfi_name_len = nv->name.i_len;
+ attrp->alfi_name_len = nv->name.iov_len;
break;
}
@@ -616,10 +617,7 @@ xfs_attri_iread_extents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(ip->i_mount);
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_iread_extents(tp, ip, XFS_ATTR_FORK);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -690,14 +688,14 @@ xfs_attri_recover_work(
args->dp = ip;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
- args->name = nv->name.i_addr;
- args->namelen = nv->name.i_len;
- args->new_name = nv->new_name.i_addr;
- args->new_namelen = nv->new_name.i_len;
- args->value = nv->value.i_addr;
- args->valuelen = nv->value.i_len;
- args->new_value = nv->new_value.i_addr;
- args->new_valuelen = nv->new_value.i_len;
+ args->name = nv->name.iov_base;
+ args->namelen = nv->name.iov_len;
+ args->new_name = nv->new_name.iov_base;
+ args->new_namelen = nv->new_name.iov_len;
+ args->value = nv->value.iov_base;
+ args->valuelen = nv->value.iov_len;
+ args->new_value = nv->new_value.iov_base;
+ args->new_valuelen = nv->new_value.iov_len;
args->attr_filter = attrp->alfi_attr_filter & XFS_ATTRI_FILTER_MASK;
args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
XFS_DA_OP_LOGGED;
@@ -754,8 +752,8 @@ xfs_attr_recover_work(
*/
attrp = &attrip->attri_format;
if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr,
- nv->name.i_len))
+ !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.iov_base,
+ nv->name.iov_len))
return -EFSCORRUPTED;
attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
@@ -953,50 +951,50 @@ static inline void *
xfs_attri_validate_name_iovec(
struct xfs_mount *mp,
struct xfs_attri_log_format *attri_formatp,
- const struct xfs_log_iovec *iovec,
+ const struct kvec *iovec,
unsigned int name_len)
{
- if (iovec->i_len != xlog_calc_iovec_len(name_len)) {
+ if (iovec->iov_len != xlog_calc_iovec_len(name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
return NULL;
}
- if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->i_addr,
+ if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, iovec->iov_base,
name_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- iovec->i_addr, iovec->i_len);
+ iovec->iov_base, iovec->iov_len);
return NULL;
}
- return iovec->i_addr;
+ return iovec->iov_base;
}
static inline void *
xfs_attri_validate_value_iovec(
struct xfs_mount *mp,
struct xfs_attri_log_format *attri_formatp,
- const struct xfs_log_iovec *iovec,
+ const struct kvec *iovec,
unsigned int value_len)
{
- if (iovec->i_len != xlog_calc_iovec_len(value_len)) {
+ if (iovec->iov_len != xlog_calc_iovec_len(value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
return NULL;
}
if ((attri_formatp->alfi_attr_filter & XFS_ATTR_PARENT) &&
- !xfs_parent_valuecheck(mp, iovec->i_addr, value_len)) {
+ !xfs_parent_valuecheck(mp, iovec->iov_base, value_len)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, sizeof(*attri_formatp));
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- iovec->i_addr, iovec->i_len);
+ iovec->iov_base, iovec->iov_len);
return NULL;
}
- return iovec->i_addr;
+ return iovec->iov_base;
}
STATIC int
@@ -1023,13 +1021,13 @@ xlog_recover_attri_commit_pass2(
/* Validate xfs_attri_log_format before the large memory allocation */
len = sizeof(struct xfs_attri_log_format);
- if (item->ri_buf[i].i_len != len) {
+ if (item->ri_buf[i].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
- attri_formatp = item->ri_buf[i].i_addr;
+ attri_formatp = item->ri_buf[i].iov_base;
if (!xfs_attri_validate(mp, attri_formatp)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
attri_formatp, len);
@@ -1218,10 +1216,10 @@ xlog_recover_attrd_commit_pass2(
{
struct xfs_attrd_log_format *attrd_formatp;
- attrd_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_attrd_log_format)) {
+ attrd_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_attrd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_attr_item.h b/fs/xfs/xfs_attr_item.h
index e74128cbb722..d108a11b55ae 100644
--- a/fs/xfs/xfs_attr_item.h
+++ b/fs/xfs/xfs_attr_item.h
@@ -12,10 +12,10 @@ struct xfs_mount;
struct kmem_zone;
struct xfs_attri_log_nameval {
- struct xfs_log_iovec name;
- struct xfs_log_iovec new_name; /* PPTR_REPLACE only */
- struct xfs_log_iovec value;
- struct xfs_log_iovec new_value; /* PPTR_REPLACE only */
+ struct kvec name;
+ struct kvec new_name; /* PPTR_REPLACE only */
+ struct kvec value;
+ struct kvec new_value; /* PPTR_REPLACE only */
refcount_t refcount;
/* name and value follow the end of this struct */
diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c
index fe21c76f75b8..2a736d10eafb 100644
--- a/fs/xfs/xfs_bio_io.c
+++ b/fs/xfs/xfs_bio_io.c
@@ -18,42 +18,36 @@ xfs_rw_bdev(
enum req_op op)
{
- unsigned int is_vmalloc = is_vmalloc_addr(data);
- unsigned int left = count;
+ unsigned int done = 0, added;
int error;
struct bio *bio;
- if (is_vmalloc && op == REQ_OP_WRITE)
- flush_kernel_vmap_range(data, count);
+ op |= REQ_META | REQ_SYNC;
+ if (!is_vmalloc_addr(data))
+ return bdev_rw_virt(bdev, sector, data, count, op);
- bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC,
- GFP_KERNEL);
+ bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
do {
- struct page *page = kmem_to_page(data);
- unsigned int off = offset_in_page(data);
- unsigned int len = min_t(unsigned, left, PAGE_SIZE - off);
-
- while (bio_add_page(bio, page, len, off) != len) {
+ added = bio_add_vmalloc_chunk(bio, data + done, count - done);
+ if (!added) {
struct bio *prev = bio;
- bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left),
+ bio = bio_alloc(prev->bi_bdev,
+ bio_max_vecs(count - done),
prev->bi_opf, GFP_KERNEL);
bio->bi_iter.bi_sector = bio_end_sector(prev);
bio_chain(prev, bio);
-
submit_bio(prev);
}
-
- data += len;
- left -= len;
- } while (left > 0);
+ done += added;
+ } while (done < count);
error = submit_bio_wait(bio);
bio_put(bio);
- if (is_vmalloc && op == REQ_OP_READ)
+ if (op == REQ_OP_READ)
invalidate_kernel_vmap_range(data, count);
return error;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 3d52e9d7ad57..80f0c4bcc483 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -77,6 +77,11 @@ xfs_bui_item_size(
*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
}
+unsigned int xfs_bui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
*nbytes += sizeof(struct xfs_bud_log_format);
}
+unsigned int xfs_bud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bud log item. We use only 1 iovec, and we point that
@@ -644,24 +654,24 @@ xlog_recover_bui_commit_pass2(
struct xfs_bui_log_format *bui_formatp;
size_t len;
- bui_formatp = item->ri_buf[0].i_addr;
+ bui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_bui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_bui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_bui_log_format_sizeof(bui_formatp->bui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -695,10 +705,10 @@ xlog_recover_bud_commit_pass2(
{
struct xfs_bud_log_format *bud_formatp;
- bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ bud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_bud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 6fee6a508343..b42fee06899d 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+unsigned int xfs_bui_log_space(unsigned int nr);
+unsigned int xfs_bud_log_space(void);
+
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0836fea2d6d8..06ca11731e43 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -30,6 +30,7 @@
#include "xfs_reflink.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
/* Kernel only BMAP related definitions and functions */
@@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
int whichfork,
xfs_off_t start_byte,
- xfs_off_t end_byte)
+ xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
+ if (xfs_is_zoned_inode(ip) && ac) {
+ /*
+ * In a zoned buffered write context we need to return
+ * the punched delalloc allocations to the allocation
+ * context. This allows reusing them in the following
+ * iomap iterations.
+ */
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, XFS_BMAPI_REMAP);
+ ac->reserved_blocks += del.br_blockcount;
+ } else {
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got,
+ &del, 0);
+ }
+
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
@@ -582,7 +598,7 @@ xfs_free_eofblocks(
if (ip->i_delayed_blks) {
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
- LLONG_MAX);
+ LLONG_MAX, NULL);
}
xfs_inode_clear_eofblocks_tag(ip);
return 0;
@@ -825,7 +841,8 @@ int
xfs_free_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t startoffset_fsb;
@@ -880,7 +897,7 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
- error = xfs_zero_range(ip, offset, len, NULL);
+ error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
@@ -968,7 +985,8 @@ int
xfs_collapse_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len)
+ xfs_off_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -981,7 +999,7 @@ xfs_collapse_file_space(
trace_xfs_collapse_file_space(ip);
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index b29760d36e1a..c477b3361630 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -15,6 +15,7 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_trans;
struct xfs_bmalloca;
+struct xfs_zone_alloc_ctx;
#ifdef CONFIG_XFS_RT
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
@@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
#endif /* CONFIG_XFS_RT */
void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
- xfs_off_t start_byte, xfs_off_t end_byte);
+ xfs_off_t start_byte, xfs_off_t end_byte,
+ struct xfs_zone_alloc_ctx *ac);
struct kgetbmap {
__s64 bmv_offset; /* file offset of segment in blocks */
@@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
- xfs_off_t len);
+ xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index d1d4a0a22e13..f9ef3b2a332a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -29,11 +29,6 @@ struct kmem_cache *xfs_buf_cache;
/*
* Locking orders
*
- * xfs_buf_ioacct_inc:
- * xfs_buf_ioacct_dec:
- * b_sema (caller holds)
- * b_lock
- *
* xfs_buf_stale:
* b_sema (caller holds)
* b_lock
@@ -41,8 +36,7 @@ struct kmem_cache *xfs_buf_cache;
*
* xfs_buf_rele:
* b_lock
- * pag_buf_lock
- * lru_lock
+ * lru_lock
*
* xfs_buftarg_drain_rele
* lru_lock
@@ -61,72 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
}
-static inline int
-xfs_buf_is_vmapped(
- struct xfs_buf *bp)
-{
- /*
- * Return true if the buffer is vmapped.
- *
- * b_addr is null if the buffer is not mapped, but the code is clever
- * enough to know it doesn't have to map a single page, so the check has
- * to be both for b_addr and bp->b_page_count > 1.
- */
- return bp->b_addr && bp->b_page_count > 1;
-}
-
-static inline int
-xfs_buf_vmap_len(
- struct xfs_buf *bp)
-{
- return (bp->b_page_count * PAGE_SIZE);
-}
-
-/*
- * Bump the I/O in flight count on the buftarg if we haven't yet done so for
- * this buffer. The count is incremented once per buffer (per hold cycle)
- * because the corresponding decrement is deferred to buffer release. Buffers
- * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
- * tracking adds unnecessary overhead. This is used for sychronization purposes
- * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
- * in-flight buffers.
- *
- * Buffers that are never released (e.g., superblock, iclog buffers) must set
- * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
- * never reaches zero and unmount hangs indefinitely.
- */
-static inline void
-xfs_buf_ioacct_inc(
- struct xfs_buf *bp)
-{
- if (bp->b_flags & XBF_NO_IOACCT)
- return;
-
- ASSERT(bp->b_flags & XBF_ASYNC);
- spin_lock(&bp->b_lock);
- if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
- bp->b_state |= XFS_BSTATE_IN_FLIGHT;
- percpu_counter_inc(&bp->b_target->bt_io_count);
- }
- spin_unlock(&bp->b_lock);
-}
-
-/*
- * Clear the in-flight state on a buffer about to be released to the LRU or
- * freed and unaccount from the buftarg.
- */
-static inline void
-__xfs_buf_ioacct_dec(
- struct xfs_buf *bp)
-{
- lockdep_assert_held(&bp->b_lock);
-
- if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
- bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
- percpu_counter_dec(&bp->b_target->bt_io_count);
- }
-}
-
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -150,15 +78,7 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
- /*
- * Once the buffer is marked stale and unlocked, a subsequent lookup
- * could reset b_flags. There is no guarantee that the buffer is
- * unaccounted (released to LRU) before that occurs. Drop in-flight
- * status now to preserve accounting consistency.
- */
spin_lock(&bp->b_lock);
- __xfs_buf_ioacct_dec(bp);
-
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
(list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
@@ -168,129 +88,14 @@ xfs_buf_stale(
spin_unlock(&bp->b_lock);
}
-static int
-xfs_buf_get_maps(
- struct xfs_buf *bp,
- int map_count)
-{
- ASSERT(bp->b_maps == NULL);
- bp->b_map_count = map_count;
-
- if (map_count == 1) {
- bp->b_maps = &bp->__b_map;
- return 0;
- }
-
- bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
- GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
- if (!bp->b_maps)
- return -ENOMEM;
- return 0;
-}
-
-static void
-xfs_buf_free_maps(
- struct xfs_buf *bp)
-{
- if (bp->b_maps != &bp->__b_map) {
- kfree(bp->b_maps);
- bp->b_maps = NULL;
- }
-}
-
-static int
-_xfs_buf_alloc(
- struct xfs_buftarg *target,
- struct xfs_buf_map *map,
- int nmaps,
- xfs_buf_flags_t flags,
- struct xfs_buf **bpp)
-{
- struct xfs_buf *bp;
- int error;
- int i;
-
- *bpp = NULL;
- bp = kmem_cache_zalloc(xfs_buf_cache,
- GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
-
- /*
- * We don't want certain flags to appear in b_flags unless they are
- * specifically set by later operations on the buffer.
- */
- flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
-
- spin_lock_init(&bp->b_lock);
- bp->b_hold = 1;
- atomic_set(&bp->b_lru_ref, 1);
- init_completion(&bp->b_iowait);
- INIT_LIST_HEAD(&bp->b_lru);
- INIT_LIST_HEAD(&bp->b_list);
- INIT_LIST_HEAD(&bp->b_li_list);
- sema_init(&bp->b_sema, 0); /* held, no waiters */
- bp->b_target = target;
- bp->b_mount = target->bt_mount;
- bp->b_flags = flags;
-
- /*
- * Set length and io_length to the same value initially.
- * I/O routines should use io_length, which will be the same in
- * most cases but may be reset (e.g. XFS recovery).
- */
- error = xfs_buf_get_maps(bp, nmaps);
- if (error) {
- kmem_cache_free(xfs_buf_cache, bp);
- return error;
- }
-
- bp->b_rhash_key = map[0].bm_bn;
- bp->b_length = 0;
- for (i = 0; i < nmaps; i++) {
- bp->b_maps[i].bm_bn = map[i].bm_bn;
- bp->b_maps[i].bm_len = map[i].bm_len;
- bp->b_length += map[i].bm_len;
- }
-
- atomic_set(&bp->b_pin_count, 0);
- init_waitqueue_head(&bp->b_waiters);
-
- XFS_STATS_INC(bp->b_mount, xb_create);
- trace_xfs_buf_init(bp, _RET_IP_);
-
- *bpp = bp;
- return 0;
-}
-
-static void
-xfs_buf_free_pages(
- struct xfs_buf *bp)
-{
- uint i;
-
- ASSERT(bp->b_flags & _XBF_PAGES);
-
- if (xfs_buf_is_vmapped(bp))
- vm_unmap_ram(bp->b_addr, bp->b_page_count);
-
- for (i = 0; i < bp->b_page_count; i++) {
- if (bp->b_pages[i])
- __free_page(bp->b_pages[i]);
- }
- mm_account_reclaimed_pages(bp->b_page_count);
-
- if (bp->b_pages != bp->b_page_array)
- kfree(bp->b_pages);
- bp->b_pages = NULL;
- bp->b_flags &= ~_XBF_PAGES;
-}
-
static void
xfs_buf_free_callback(
struct callback_head *cb)
{
struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
- xfs_buf_free_maps(bp);
+ if (bp->b_maps != &bp->__b_map)
+ kfree(bp->b_maps);
kmem_cache_free(xfs_buf_cache, bp);
}
@@ -298,154 +103,219 @@ static void
xfs_buf_free(
struct xfs_buf *bp)
{
+ unsigned int size = BBTOB(bp->b_length);
+
+ might_sleep();
trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
- if (xfs_buftarg_is_mem(bp->b_target))
- xmbuf_unmap_page(bp);
- else if (bp->b_flags & _XBF_PAGES)
- xfs_buf_free_pages(bp);
+ if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE)
+ mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT));
+
+ if (is_vmalloc_addr(bp->b_addr))
+ vfree(bp->b_addr);
else if (bp->b_flags & _XBF_KMEM)
kfree(bp->b_addr);
+ else
+ folio_put(virt_to_folio(bp->b_addr));
call_rcu(&bp->b_rcu, xfs_buf_free_callback);
}
static int
xfs_buf_alloc_kmem(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
+ struct xfs_buf *bp,
+ size_t size,
+ gfp_t gfp_mask)
{
- gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
- size_t size = BBTOB(bp->b_length);
+ ASSERT(is_power_of_2(size));
+ ASSERT(size < PAGE_SIZE);
- /* Assure zeroed buffer for non-read cases. */
- if (!(flags & XBF_READ))
- gfp_mask |= __GFP_ZERO;
-
- bp->b_addr = kmalloc(size, gfp_mask);
+ bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL);
if (!bp->b_addr)
return -ENOMEM;
- if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
- ((unsigned long)bp->b_addr & PAGE_MASK)) {
- /* b_addr spans two pages - use alloc_page instead */
+ /*
+ * Slab guarantees that we get back naturally aligned allocations for
+ * power of two sizes. Keep this check as the canary in the coal mine
+ * if anything changes in slab.
+ */
+ if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) {
kfree(bp->b_addr);
bp->b_addr = NULL;
return -ENOMEM;
}
- bp->b_offset = offset_in_page(bp->b_addr);
- bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = kmem_to_page(bp->b_addr);
- bp->b_page_count = 1;
bp->b_flags |= _XBF_KMEM;
+ trace_xfs_buf_backing_kmem(bp, _RET_IP_);
return 0;
}
+/*
+ * Allocate backing memory for a buffer.
+ *
+ * For tmpfs-backed buffers used by in-memory btrees this directly maps the
+ * tmpfs page cache folios.
+ *
+ * For real file system buffers there are three different kinds backing memory:
+ *
+ * The first type backs the buffer by a kmalloc allocation. This is done for
+ * less than PAGE_SIZE allocations to avoid wasting memory.
+ *
+ * The second type is a single folio buffer - this may be a high order folio or
+ * just a single page sized folio, but either way they get treated the same way
+ * by the rest of the code - the buffer memory spans a single contiguous memory
+ * region that we don't have to map and unmap to access the data directly.
+ *
+ * The third type of buffer is the vmalloc()d buffer. This provides the buffer
+ * with the required contiguous memory region but backed by discontiguous
+ * physical pages.
+ */
static int
-xfs_buf_alloc_pages(
+xfs_buf_alloc_backing_mem(
struct xfs_buf *bp,
xfs_buf_flags_t flags)
{
+ size_t size = BBTOB(bp->b_length);
gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
- long filled = 0;
+ struct folio *folio;
- if (flags & XBF_READ_AHEAD)
- gfp_mask |= __GFP_NORETRY;
-
- /* Make sure that we have a page list */
- bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
- if (bp->b_page_count <= XB_PAGES) {
- bp->b_pages = bp->b_page_array;
- } else {
- bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
- gfp_mask);
- if (!bp->b_pages)
- return -ENOMEM;
- }
- bp->b_flags |= _XBF_PAGES;
+ if (xfs_buftarg_is_mem(bp->b_target))
+ return xmbuf_map_backing_mem(bp);
/* Assure zeroed buffer for non-read cases. */
if (!(flags & XBF_READ))
gfp_mask |= __GFP_ZERO;
+ if (flags & XBF_READ_AHEAD)
+ gfp_mask |= __GFP_NORETRY;
+
/*
- * Bulk filling of pages can take multiple calls. Not filling the entire
- * array is not an allocation failure, so don't back off if we get at
- * least one extra page.
+ * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
+ * is properly aligned. The slab allocator now guarantees an aligned
+ * allocation for all power of two sizes, which matches most of the
+ * smaller than PAGE_SIZE buffers used by XFS.
*/
- for (;;) {
- long last = filled;
+ if (size < PAGE_SIZE && is_power_of_2(size))
+ return xfs_buf_alloc_kmem(bp, size, gfp_mask);
- filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
- bp->b_pages);
- if (filled == bp->b_page_count) {
- XFS_STATS_INC(bp->b_mount, xb_page_found);
- break;
- }
-
- if (filled != last)
- continue;
+ /*
+ * Don't bother with the retry loop for single PAGE allocations: vmalloc
+ * won't do any better.
+ */
+ if (size <= PAGE_SIZE)
+ gfp_mask |= __GFP_NOFAIL;
- if (flags & XBF_READ_AHEAD) {
- xfs_buf_free_pages(bp);
+ /*
+ * Optimistically attempt a single high order folio allocation for
+ * larger than PAGE_SIZE buffers.
+ *
+ * Allocating a high order folio makes the assumption that buffers are a
+ * power-of-2 size, matching the power-of-2 folios sizes available.
+ *
+ * The exception here are user xattr data buffers, which can be arbitrarily
+ * sized up to 64kB plus structure metadata, skip straight to the vmalloc
+ * path for them instead of wasting memory here.
+ */
+ if (size > PAGE_SIZE) {
+ if (!is_power_of_2(size))
+ goto fallback;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+ gfp_mask |= __GFP_NORETRY;
+ }
+ folio = folio_alloc(gfp_mask, get_order(size));
+ if (!folio) {
+ if (size <= PAGE_SIZE)
return -ENOMEM;
- }
+ trace_xfs_buf_backing_fallback(bp, _RET_IP_);
+ goto fallback;
+ }
+ bp->b_addr = folio_address(folio);
+ trace_xfs_buf_backing_folio(bp, _RET_IP_);
+ return 0;
+fallback:
+ for (;;) {
+ bp->b_addr = __vmalloc(size, gfp_mask);
+ if (bp->b_addr)
+ break;
+ if (flags & XBF_READ_AHEAD)
+ return -ENOMEM;
XFS_STATS_INC(bp->b_mount, xb_page_retries);
memalloc_retry_wait(gfp_mask);
}
+
+ trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
return 0;
}
-/*
- * Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
+static int
+xfs_buf_alloc(
+ struct xfs_buftarg *target,
+ struct xfs_buf_map *map,
+ int nmaps,
+ xfs_buf_flags_t flags,
+ struct xfs_buf **bpp)
{
- ASSERT(bp->b_flags & _XBF_PAGES);
- if (bp->b_page_count == 1) {
- /* A single page buffer is always mappable */
- bp->b_addr = page_address(bp->b_pages[0]);
- } else if (flags & XBF_UNMAPPED) {
- bp->b_addr = NULL;
- } else {
- int retried = 0;
- unsigned nofs_flag;
+ struct xfs_buf *bp;
+ int error;
+ int i;
- /*
- * vm_map_ram() will allocate auxiliary structures (e.g.
- * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
- * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
- * from the same call site that can be run from both above and
- * below memory reclaim causes lockdep false positives. Hence we
- * always need to force this allocation to nofs context because
- * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
- * prevent false positive lockdep reports.
- *
- * XXX(dgc): I think dquot reclaim is the only place we can get
- * to this function from memory reclaim context now. If we fix
- * that like we've fixed inode reclaim to avoid writeback from
- * reclaim, this nofs wrapping can go away.
- */
- nofs_flag = memalloc_nofs_save();
- do {
- bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1);
- if (bp->b_addr)
- break;
- vm_unmap_aliases();
- } while (retried++ <= 1);
- memalloc_nofs_restore(nofs_flag);
-
- if (!bp->b_addr)
- return -ENOMEM;
+ *bpp = NULL;
+ bp = kmem_cache_zalloc(xfs_buf_cache,
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+
+ /*
+ * We don't want certain flags to appear in b_flags unless they are
+ * specifically set by later operations on the buffer.
+ */
+ flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
+
+ /*
+ * A new buffer is held and locked by the owner. This ensures that the
+ * buffer is owned by the caller and racing RCU lookups right after
+ * inserting into the hash table are safe (and will have to wait for
+ * the unlock to do anything non-trivial).
+ */
+ bp->b_hold = 1;
+ sema_init(&bp->b_sema, 0); /* held, no waiters */
+
+ spin_lock_init(&bp->b_lock);
+ atomic_set(&bp->b_lru_ref, 1);
+ init_completion(&bp->b_iowait);
+ INIT_LIST_HEAD(&bp->b_lru);
+ INIT_LIST_HEAD(&bp->b_list);
+ INIT_LIST_HEAD(&bp->b_li_list);
+ bp->b_target = target;
+ bp->b_mount = target->bt_mount;
+ bp->b_flags = flags;
+ bp->b_rhash_key = map[0].bm_bn;
+ bp->b_length = 0;
+ bp->b_map_count = nmaps;
+ if (nmaps == 1)
+ bp->b_maps = &bp->__b_map;
+ else
+ bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map),
+ GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+ for (i = 0; i < nmaps; i++) {
+ bp->b_maps[i].bm_bn = map[i].bm_bn;
+ bp->b_maps[i].bm_len = map[i].bm_len;
+ bp->b_length += map[i].bm_len;
}
+ atomic_set(&bp->b_pin_count, 0);
+ init_waitqueue_head(&bp->b_waiters);
+
+ XFS_STATS_INC(bp->b_mount, xb_create);
+ trace_xfs_buf_init(bp, _RET_IP_);
+
+ error = xfs_buf_alloc_backing_mem(bp, flags);
+ if (error) {
+ xfs_buf_free(bp);
+ return error;
+ }
+
+ *bpp = bp;
return 0;
}
@@ -502,7 +372,6 @@ int
xfs_buf_cache_init(
struct xfs_buf_cache *bch)
{
- spin_lock_init(&bch->bc_lock);
return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}
@@ -565,7 +434,7 @@ xfs_buf_find_lock(
return -ENOENT;
}
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+ bp->b_flags &= _XBF_KMEM;
bp->b_ops = NULL;
}
return 0;
@@ -633,36 +502,24 @@ xfs_buf_find_insert(
struct xfs_buf *bp;
int error;
- error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
+ error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
if (error)
goto out_drop_pag;
- if (xfs_buftarg_is_mem(new_bp->b_target)) {
- error = xmbuf_map_page(new_bp);
- } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
- xfs_buf_alloc_kmem(new_bp, flags) < 0) {
- /*
- * For buffers that fit entirely within a single page, first
- * attempt to allocate the memory from the heap to minimise
- * memory usage. If we can't get heap memory for these small
- * buffers, we fall back to using the page allocator.
- */
- error = xfs_buf_alloc_pages(new_bp, flags);
- }
- if (error)
- goto out_free_buf;
+ /* The new buffer keeps the perag reference until it is freed. */
+ new_bp->b_pag = pag;
- spin_lock(&bch->bc_lock);
+ rcu_read_lock();
bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
&new_bp->b_rhash_head, xfs_buf_hash_params);
if (IS_ERR(bp)) {
+ rcu_read_unlock();
error = PTR_ERR(bp);
- spin_unlock(&bch->bc_lock);
goto out_free_buf;
}
if (bp && xfs_buf_try_hold(bp)) {
/* found an existing buffer */
- spin_unlock(&bch->bc_lock);
+ rcu_read_unlock();
error = xfs_buf_find_lock(bp, flags);
if (error)
xfs_buf_rele(bp);
@@ -670,10 +527,8 @@ xfs_buf_find_insert(
*bpp = bp;
goto out_free_buf;
}
+ rcu_read_unlock();
- /* The new buffer keeps the perag reference until it is freed. */
- new_bp->b_pag = pag;
- spin_unlock(&bch->bc_lock);
*bpp = new_bp;
return 0;
@@ -761,18 +616,6 @@ xfs_buf_get_map(
xfs_perag_put(pag);
}
- /* We do not hold a perag reference anymore. */
- if (!bp->b_addr) {
- error = _xfs_buf_map_pages(bp, flags);
- if (unlikely(error)) {
- xfs_warn_ratelimited(btp->bt_mount,
- "%s: failed to map %u pages", __func__,
- bp->b_page_count);
- xfs_buf_relse(bp);
- return error;
- }
- }
-
/*
* Clear b_error if this is a lookup from a caller that doesn't expect
* valid data to be found in the buffer.
@@ -793,18 +636,13 @@ out_put_perag:
int
_xfs_buf_read(
- struct xfs_buf *bp,
- xfs_buf_flags_t flags)
+ struct xfs_buf *bp)
{
- ASSERT(!(flags & XBF_WRITE));
ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
- bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
-
+ bp->b_flags |= XBF_READ;
xfs_buf_submit(bp);
- if (flags & XBF_ASYNC)
- return 0;
return xfs_buf_iowait(bp);
}
@@ -856,6 +694,8 @@ xfs_buf_read_map(
struct xfs_buf *bp;
int error;
+ ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD)));
+
flags |= XBF_READ;
*bpp = NULL;
@@ -869,21 +709,11 @@ xfs_buf_read_map(
/* Initiate the buffer read and wait. */
XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
- error = _xfs_buf_read(bp, flags);
-
- /* Readahead iodone already dropped the buffer, so exit. */
- if (flags & XBF_ASYNC)
- return 0;
+ error = _xfs_buf_read(bp);
} else {
/* Buffer already read; all we need to do is check it. */
error = xfs_buf_reverify(bp, ops);
- /* Readahead already finished; drop the buffer and exit. */
- if (flags & XBF_ASYNC) {
- xfs_buf_relse(bp);
- return 0;
- }
-
/* We do not want read in the flags */
bp->b_flags &= ~XBF_READ;
ASSERT(bp->b_ops != NULL || ops == NULL);
@@ -935,6 +765,7 @@ xfs_buf_readahead_map(
int nmaps,
const struct xfs_buf_ops *ops)
{
+ const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD;
struct xfs_buf *bp;
/*
@@ -944,9 +775,21 @@ xfs_buf_readahead_map(
if (xfs_buftarg_is_mem(target))
return;
- xfs_buf_read_map(target, map, nmaps,
- XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
- __this_address);
+ if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp))
+ return;
+ trace_xfs_buf_readahead(bp, 0, _RET_IP_);
+
+ if (bp->b_flags & XBF_DONE) {
+ xfs_buf_reverify(bp, ops);
+ xfs_buf_relse(bp);
+ return;
+ }
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
+ bp->b_flags |= flags;
+ percpu_counter_inc(&target->bt_readahead_count);
+ xfs_buf_submit(bp);
}
/*
@@ -960,7 +803,6 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- xfs_buf_flags_t flags,
struct xfs_buf **bpp,
const struct xfs_buf_ops *ops)
{
@@ -969,7 +811,7 @@ xfs_buf_read_uncached(
*bpp = NULL;
- error = xfs_buf_get_uncached(target, numblks, flags, &bp);
+ error = xfs_buf_get_uncached(target, numblks, &bp);
if (error)
return error;
@@ -995,40 +837,14 @@ int
xfs_buf_get_uncached(
struct xfs_buftarg *target,
size_t numblks,
- xfs_buf_flags_t flags,
struct xfs_buf **bpp)
{
int error;
- struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
- *bpp = NULL;
-
- /* flags might contain irrelevant bits, pass only what we care about */
- error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
- if (error)
- return error;
-
- if (xfs_buftarg_is_mem(bp->b_target))
- error = xmbuf_map_page(bp);
- else
- error = xfs_buf_alloc_pages(bp, flags);
- if (error)
- goto fail_free_buf;
-
- error = _xfs_buf_map_pages(bp, 0);
- if (unlikely(error)) {
- xfs_warn(target->bt_mount,
- "%s: failed to map pages", __func__);
- goto fail_free_buf;
- }
-
- trace_xfs_buf_get_uncached(bp, _RET_IP_);
- *bpp = bp;
- return 0;
-
-fail_free_buf:
- xfs_buf_free(bp);
+ error = xfs_buf_alloc(target, &map, 1, 0, bpp);
+ if (!error)
+ trace_xfs_buf_get_uncached(*bpp, _RET_IP_);
return error;
}
@@ -1059,7 +875,6 @@ xfs_buf_rele_uncached(
spin_unlock(&bp->b_lock);
return;
}
- __xfs_buf_ioacct_dec(bp);
spin_unlock(&bp->b_lock);
xfs_buf_free(bp);
}
@@ -1078,21 +893,12 @@ xfs_buf_rele_cached(
spin_lock(&bp->b_lock);
ASSERT(bp->b_hold >= 1);
if (bp->b_hold > 1) {
- /*
- * Drop the in-flight state if the buffer is already on the LRU
- * and it holds the only reference. This is racy because we
- * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
- * ensures the decrement occurs only once per-buf.
- */
- if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
- __xfs_buf_ioacct_dec(bp);
+ bp->b_hold--;
goto out_unlock;
}
/* we are asked to drop the last reference */
- spin_lock(&bch->bc_lock);
- __xfs_buf_ioacct_dec(bp);
- if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+ if (atomic_read(&bp->b_lru_ref)) {
/*
* If the buffer is added to the LRU, keep the reference to the
* buffer for the LRU and clear the (now stale) dispose list
@@ -1102,7 +908,6 @@ xfs_buf_rele_cached(
bp->b_state &= ~XFS_BSTATE_DISPOSE;
else
bp->b_hold--;
- spin_unlock(&bch->bc_lock);
} else {
bp->b_hold--;
/*
@@ -1120,7 +925,6 @@ xfs_buf_rele_cached(
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
xfs_buf_hash_params);
- spin_unlock(&bch->bc_lock);
if (pag)
xfs_perag_put(pag);
freebuf = true;
@@ -1347,6 +1151,7 @@ xfs_buf_ioend_handle_error(
resubmit:
xfs_buf_ioerror(bp, 0);
bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
+ reinit_completion(&bp->b_iowait);
xfs_buf_submit(bp);
return true;
out_stale:
@@ -1357,20 +1162,23 @@ out_stale:
return false;
}
-static void
-xfs_buf_ioend(
+/* returns false if the caller needs to resubmit the I/O, else true */
+static bool
+__xfs_buf_ioend(
struct xfs_buf *bp)
{
trace_xfs_buf_iodone(bp, _RET_IP_);
if (bp->b_flags & XBF_READ) {
- if (!bp->b_error && xfs_buf_is_vmapped(bp))
+ if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
invalidate_kernel_vmap_range(bp->b_addr,
- xfs_buf_vmap_len(bp));
+ roundup(BBTOB(bp->b_length), PAGE_SIZE));
if (!bp->b_error && bp->b_ops)
bp->b_ops->verify_read(bp);
if (!bp->b_error)
bp->b_flags |= XBF_DONE;
+ if (bp->b_flags & XBF_READ_AHEAD)
+ percpu_counter_dec(&bp->b_target->bt_readahead_count);
} else {
if (!bp->b_error) {
bp->b_flags &= ~XBF_WRITE_FAIL;
@@ -1378,7 +1186,7 @@ xfs_buf_ioend(
}
if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
- return;
+ return false;
/* clear the retry state */
bp->b_last_error = 0;
@@ -1399,7 +1207,15 @@ xfs_buf_ioend(
bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
_XBF_LOGRECOVERY);
+ return true;
+}
+static void
+xfs_buf_ioend(
+ struct xfs_buf *bp)
+{
+ if (!__xfs_buf_ioend(bp))
+ return;
if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
else
@@ -1413,15 +1229,8 @@ xfs_buf_ioend_work(
struct xfs_buf *bp =
container_of(work, struct xfs_buf, b_ioend_work);
- xfs_buf_ioend(bp);
-}
-
-static void
-xfs_buf_ioend_async(
- struct xfs_buf *bp)
-{
- INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
- queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+ if (__xfs_buf_ioend(bp))
+ xfs_buf_relse(bp);
}
void
@@ -1493,7 +1302,13 @@ xfs_buf_bio_end_io(
XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
xfs_buf_ioerror(bp, -EIO);
- xfs_buf_ioend_async(bp);
+ if (bp->b_flags & XBF_ASYNC) {
+ INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
+ queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
+ } else {
+ complete(&bp->b_iowait);
+ }
+
bio_put(bio);
}
@@ -1518,29 +1333,21 @@ static void
xfs_buf_submit_bio(
struct xfs_buf *bp)
{
- unsigned int size = BBTOB(bp->b_length);
- unsigned int map = 0, p;
+ unsigned int len = BBTOB(bp->b_length);
+ unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len);
+ unsigned int map = 0;
struct blk_plug plug;
struct bio *bio;
- bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
- xfs_buf_bio_op(bp), GFP_NOIO);
+ bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp),
+ GFP_NOIO);
+ if (is_vmalloc_addr(bp->b_addr))
+ bio_add_vmalloc(bio, bp->b_addr, len);
+ else
+ bio_add_virt_nofail(bio, bp->b_addr, len);
bio->bi_private = bp;
bio->bi_end_io = xfs_buf_bio_end_io;
- if (bp->b_flags & _XBF_KMEM) {
- __bio_add_page(bio, virt_to_page(bp->b_addr), size,
- bp->b_offset);
- } else {
- for (p = 0; p < bp->b_page_count; p++)
- __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
- bio->bi_iter.bi_size = size; /* limit to the actual size used */
-
- if (xfs_buf_is_vmapped(bp))
- flush_kernel_vmap_range(bp->b_addr,
- xfs_buf_vmap_len(bp));
- }
-
/*
* If there is more than one map segment, split out a new bio for each
* map except of the last one. The last map is handled by the
@@ -1570,9 +1377,11 @@ xfs_buf_iowait(
{
ASSERT(!(bp->b_flags & XBF_ASYNC));
- trace_xfs_buf_iowait(bp, _RET_IP_);
- wait_for_completion(&bp->b_iowait);
- trace_xfs_buf_iowait_done(bp, _RET_IP_);
+ do {
+ trace_xfs_buf_iowait(bp, _RET_IP_);
+ wait_for_completion(&bp->b_iowait);
+ trace_xfs_buf_iowait_done(bp, _RET_IP_);
+ } while (!__xfs_buf_ioend(bp));
return bp->b_error;
}
@@ -1650,9 +1459,6 @@ xfs_buf_submit(
*/
bp->b_error = 0;
- if (bp->b_flags & XBF_ASYNC)
- xfs_buf_ioacct_inc(bp);
-
if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) {
xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE);
xfs_buf_ioend(bp);
@@ -1668,47 +1474,6 @@ xfs_buf_submit(
xfs_buf_submit_bio(bp);
}
-void *
-xfs_buf_offset(
- struct xfs_buf *bp,
- size_t offset)
-{
- struct page *page;
-
- if (bp->b_addr)
- return bp->b_addr + offset;
-
- page = bp->b_pages[offset >> PAGE_SHIFT];
- return page_address(page) + (offset & (PAGE_SIZE-1));
-}
-
-void
-xfs_buf_zero(
- struct xfs_buf *bp,
- size_t boff,
- size_t bsize)
-{
- size_t bend;
-
- bend = boff + bsize;
- while (boff < bend) {
- struct page *page;
- int page_index, page_offset, csize;
-
- page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
- page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
- page = bp->b_pages[page_index];
- csize = min_t(size_t, PAGE_SIZE - page_offset,
- BBTOB(bp->b_length) - boff);
-
- ASSERT((csize + page_offset) <= PAGE_SIZE);
-
- memset(page_address(page) + page_offset, 0, csize);
-
- boff += csize;
- }
-}
-
/*
* Log a message about and stale a buffer that a caller has decided is corrupt.
*
@@ -1778,9 +1543,8 @@ xfs_buftarg_wait(
struct xfs_buftarg *btp)
{
/*
- * First wait on the buftarg I/O count for all in-flight buffers to be
- * released. This is critical as new buffers do not make the LRU until
- * they are released.
+ * First wait for all in-flight readahead buffers to be released. This is
+ * critical as new buffers do not make the LRU until they are released.
*
* Next, flush the buffer workqueue to ensure all completion processing
* has finished. Just waiting on buffer locks is not sufficient for
@@ -1789,7 +1553,7 @@ xfs_buftarg_wait(
* all reference counts have been dropped before we start walking the
* LRU list.
*/
- while (percpu_counter_sum(&btp->bt_io_count))
+ while (percpu_counter_sum(&btp->bt_readahead_count))
delay(100);
flush_workqueue(btp->bt_mount->m_buf_workqueue);
}
@@ -1906,8 +1670,8 @@ xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
- ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
- percpu_counter_destroy(&btp->bt_io_count);
+ ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
+ percpu_counter_destroy(&btp->bt_readahead_count);
list_lru_destroy(&btp->bt_lru);
}
@@ -1919,26 +1683,63 @@ xfs_free_buftarg(
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
- bdev_fput(btp->bt_bdev_file);
+ bdev_fput(btp->bt_file);
kfree(btp);
}
+/*
+ * Configure this buffer target for hardware-assisted atomic writes if the
+ * underlying block device supports is congruent with the filesystem geometry.
+ */
+static inline void
+xfs_configure_buftarg_atomic_writes(
+ struct xfs_buftarg *btp)
+{
+ struct xfs_mount *mp = btp->bt_mount;
+ unsigned int min_bytes, max_bytes;
+
+ min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev);
+ max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev);
+
+ /*
+ * Ignore atomic write geometry that is nonsense or doesn't even cover
+ * a single fsblock.
+ */
+ if (min_bytes > max_bytes ||
+ min_bytes > mp->m_sb.sb_blocksize ||
+ max_bytes < mp->m_sb.sb_blocksize) {
+ min_bytes = 0;
+ max_bytes = 0;
+ }
+
+ btp->bt_awu_min = min_bytes;
+ btp->bt_awu_max = max_bytes;
+}
+
+/* Configure a buffer target that abstracts a block device. */
int
-xfs_setsize_buftarg(
+xfs_configure_buftarg(
struct xfs_buftarg *btp,
unsigned int sectorsize)
{
+ int error;
+
+ ASSERT(btp->bt_bdev != NULL);
+
/* Set up metadata sector size info */
btp->bt_meta_sectorsize = sectorsize;
btp->bt_meta_sectormask = sectorsize - 1;
- if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
+ error = bdev_validate_blocksize(btp->bt_bdev, sectorsize);
+ if (error) {
xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %pg",
- sectorsize, btp->bt_bdev);
+ "Cannot use blocksize %u on device %pg, err %d",
+ sectorsize, btp->bt_bdev, error);
return -EINVAL;
}
+ if (bdev_can_atomic_write(btp->bt_bdev))
+ xfs_configure_buftarg_atomic_writes(btp);
return 0;
}
@@ -1961,7 +1762,7 @@ xfs_init_buftarg(
if (list_lru_init(&btp->bt_lru))
return -ENOMEM;
- if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+ if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
goto out_destroy_lru;
btp->bt_shrinker =
@@ -1975,7 +1776,7 @@ xfs_init_buftarg(
return 0;
out_destroy_io_count:
- percpu_counter_destroy(&btp->bt_io_count);
+ percpu_counter_destroy(&btp->bt_readahead_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
return -ENOMEM;
@@ -1988,6 +1789,8 @@ xfs_alloc_buftarg(
{
struct xfs_buftarg *btp;
const struct dax_holder_operations *ops = NULL;
+ int error;
+
#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
ops = &xfs_dax_holder_operations;
@@ -1995,34 +1798,37 @@ xfs_alloc_buftarg(
btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
btp->bt_mount = mp;
- btp->bt_bdev_file = bdev_file;
+ btp->bt_file = bdev_file;
btp->bt_bdev = file_bdev(bdev_file);
btp->bt_dev = btp->bt_bdev->bd_dev;
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
- if (bdev_can_atomic_write(btp->bt_bdev)) {
- btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
- btp->bt_bdev);
- btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
- btp->bt_bdev);
- }
+ /*
+ * Flush and invalidate all devices' pagecaches before reading any
+ * metadata because XFS doesn't use the bdev pagecache.
+ */
+ error = sync_blockdev(btp->bt_bdev);
+ if (error)
+ goto error_free;
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
*/
- if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
- goto error_free;
- if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
- mp->m_super->s_id))
+ btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1;
+
+ error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize,
+ mp->m_super->s_id);
+ if (error)
goto error_free;
return btp;
error_free:
kfree(btp);
- return NULL;
+ return ERR_PTR(error);
}
static inline void
@@ -2271,44 +2077,6 @@ xfs_buf_delwri_submit(
return error;
}
-/*
- * Push a single buffer on a delwri queue.
- *
- * The purpose of this function is to submit a single buffer of a delwri queue
- * and return with the buffer still on the original queue.
- *
- * The buffer locking and queue management logic between _delwri_pushbuf() and
- * _delwri_queue() guarantee that the buffer cannot be queued to another list
- * before returning.
- */
-int
-xfs_buf_delwri_pushbuf(
- struct xfs_buf *bp,
- struct list_head *buffer_list)
-{
- int error;
-
- ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-
- trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
-
- xfs_buf_lock(bp);
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
- bp->b_flags |= XBF_WRITE;
- xfs_buf_submit(bp);
-
- /*
- * The buffer is now locked, under I/O but still on the original delwri
- * queue. Wait for I/O completion, restore the DELWRI_Q flag and
- * return with the buffer unlocked and still on the original queue.
- */
- error = xfs_buf_iowait(bp);
- bp->b_flags |= _XBF_DELWRI_Q;
- xfs_buf_unlock(bp);
-
- return error;
-}
-
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
/*
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7e73663c5d4a..b269e115d9ac 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -27,7 +27,6 @@ struct xfs_buf;
#define XBF_READ (1u << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1u << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */
-#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */
#define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */
#define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */
@@ -37,7 +36,6 @@ struct xfs_buf;
#define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */
/* flags used only internally */
-#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */
#define _XBF_KMEM (1u << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */
@@ -49,7 +47,6 @@ struct xfs_buf;
#define XBF_LIVESCAN (1u << 28)
#define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */
#define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */
-#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */
typedef unsigned int xfs_buf_flags_t;
@@ -58,29 +55,24 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_READ, "READ" }, \
{ XBF_WRITE, "WRITE" }, \
{ XBF_READ_AHEAD, "READ_AHEAD" }, \
- { XBF_NO_IOACCT, "NO_IOACCT" }, \
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
{ XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \
- { _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
/* The following interface flags should never be set */ \
{ XBF_LIVESCAN, "LIVESCAN" }, \
{ XBF_INCORE, "INCORE" }, \
- { XBF_TRYLOCK, "TRYLOCK" }, \
- { XBF_UNMAPPED, "UNMAPPED" }
+ { XBF_TRYLOCK, "TRYLOCK" }
/*
* Internal state flags.
*/
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
-#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */
struct xfs_buf_cache {
- spinlock_t bc_lock;
struct rhashtable bc_hash;
};
@@ -102,7 +94,6 @@ void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
*/
struct xfs_buftarg {
dev_t bt_dev;
- struct file *bt_bdev_file;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
struct file *bt_file;
@@ -117,19 +108,17 @@ struct xfs_buftarg {
struct shrinker *bt_shrinker;
struct list_lru bt_lru;
- struct percpu_counter bt_io_count;
+ struct percpu_counter bt_readahead_count;
struct ratelimit_state bt_ioerror_rl;
- /* Atomic write unit values */
- unsigned int bt_bdev_awu_min;
- unsigned int bt_bdev_awu_max;
+ /* Hardware atomic write unit values, bytes */
+ unsigned int bt_awu_min;
+ unsigned int bt_awu_max;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};
-#define XB_PAGES 2
-
struct xfs_buf_map {
xfs_daddr_t bm_bn; /* block number for I/O */
int bm_len; /* size of I/O */
@@ -191,15 +180,10 @@ struct xfs_buf {
struct xfs_buf_log_item *b_log_item;
struct list_head b_li_list; /* Log items list head */
struct xfs_trans *b_transp;
- struct page **b_pages; /* array of page pointers */
- struct page *b_page_array[XB_PAGES]; /* inline pages */
struct xfs_buf_map *b_maps; /* compound buffer map */
struct xfs_buf_map __b_map; /* inline compound buffer map */
int b_map_count;
atomic_t b_pin_count; /* pin count */
- unsigned int b_page_count; /* size of page array */
- unsigned int b_offset; /* page offset of b_addr,
- only for _XBF_KMEM buffers */
int b_error; /* error code on I/O */
void (*b_iodone)(struct xfs_buf *bp);
@@ -288,11 +272,11 @@ xfs_buf_readahead(
}
int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
- xfs_buf_flags_t flags, struct xfs_buf **bpp);
+ struct xfs_buf **bpp);
int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
- size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp,
+ size_t numblks, struct xfs_buf **bpp,
const struct xfs_buf_ops *ops);
-int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags);
+int _xfs_buf_read(struct xfs_buf *bp);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
@@ -319,12 +303,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error,
#define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address)
extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa);
void xfs_buf_ioend_fail(struct xfs_buf *);
-void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
-extern void *xfs_buf_offset(struct xfs_buf *, size_t);
+static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset)
+{
+ return bp->b_addr + offset;
+}
+
+static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize)
+{
+ memset(bp->b_addr + boff, 0, bsize);
+}
+
extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
@@ -333,7 +325,6 @@ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
-extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
@@ -381,9 +372,8 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
extern void xfs_free_buftarg(struct xfs_buftarg *);
extern void xfs_buftarg_wait(struct xfs_buftarg *);
extern void xfs_buftarg_drain(struct xfs_buftarg *);
-extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int);
+int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize);
-#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 47549cfa61cd..8d85b5eee444 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -32,19 +32,74 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_buf_log_item, bli_item);
}
+static void
+xfs_buf_item_get_format(
+ struct xfs_buf_log_item *bip,
+ int count)
+{
+ ASSERT(bip->bli_formats == NULL);
+ bip->bli_format_count = count;
+
+ if (count == 1) {
+ bip->bli_formats = &bip->__bli_format;
+ return;
+ }
+
+ bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
+ GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static void
+xfs_buf_item_free_format(
+ struct xfs_buf_log_item *bip)
+{
+ if (bip->bli_formats != &bip->__bli_format) {
+ kfree(bip->bli_formats);
+ bip->bli_formats = NULL;
+ }
+}
+
+static void
+xfs_buf_item_free(
+ struct xfs_buf_log_item *bip)
+{
+ xfs_buf_item_free_format(bip);
+ kvfree(bip->bli_item.li_lv_shadow);
+ kmem_cache_free(xfs_buf_item_cache, bip);
+}
+
+/*
+ * xfs_buf_item_relse() is called when the buf log item is no longer needed.
+ */
+static void
+xfs_buf_item_relse(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+ ASSERT(atomic_read(&bip->bli_refcount) == 0);
+
+ bp->b_log_item = NULL;
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
+}
+
/* Is this log iovec plausibly large enough to contain the buffer log format? */
bool
xfs_buf_log_check_iovec(
- struct xfs_log_iovec *iovec)
+ struct kvec *iovec)
{
- struct xfs_buf_log_format *blfp = iovec->i_addr;
+ struct xfs_buf_log_format *blfp = iovec->iov_base;
char *bmp_end;
char *item_end;
- if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->i_len)
+ if (offsetof(struct xfs_buf_log_format, blf_data_map) > iovec->iov_len)
return false;
- item_end = (char *)iovec->i_addr + iovec->i_len;
+ item_end = (char *)iovec->iov_base + iovec->iov_len;
bmp_end = (char *)&blfp->blf_data_map[blfp->blf_map_size];
return bmp_end <= item_end;
}
@@ -57,24 +112,6 @@ xfs_buf_log_format_size(
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0]));
}
-static inline bool
-xfs_buf_item_straddle(
- struct xfs_buf *bp,
- uint offset,
- int first_bit,
- int nbits)
-{
- void *first, *last;
-
- first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT));
- last = xfs_buf_offset(bp,
- offset + ((first_bit + nbits) << XFS_BLF_SHIFT));
-
- if (last - first != nbits * XFS_BLF_CHUNK)
- return true;
- return false;
-}
-
/*
* Return the number of log iovecs and space needed to log the given buf log
* item segment.
@@ -91,11 +128,8 @@ xfs_buf_item_size_segment(
int *nvecs,
int *nbytes)
{
- struct xfs_buf *bp = bip->bli_buf;
int first_bit;
int nbits;
- int next_bit;
- int last_bit;
first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (first_bit == -1)
@@ -108,15 +142,6 @@ xfs_buf_item_size_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
-
- /*
- * Straddling a page is rare because we don't log contiguous
- * chunks of unmapped buffers anywhere.
- */
- if (nbits > 1 &&
- xfs_buf_item_straddle(bp, offset, first_bit, nbits))
- goto slow_scan;
-
(*nvecs)++;
*nbytes += nbits * XFS_BLF_CHUNK;
@@ -131,40 +156,25 @@ xfs_buf_item_size_segment(
} while (first_bit != -1);
return;
+}
-slow_scan:
- /* Count the first bit we jumped out of the above loop from */
- (*nvecs)++;
- *nbytes += XFS_BLF_CHUNK;
- last_bit = first_bit;
- while (last_bit != -1) {
- /*
- * This takes the bit number to start looking from and
- * returns the next set bit from there. It returns -1
- * if there are no more bits set or the start bit is
- * beyond the end of the bitmap.
- */
- next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
- last_bit + 1);
- /*
- * If we run out of bits, leave the loop,
- * else if we find a new set of bits bump the number of vecs,
- * else keep scanning the current set of bits.
- */
- if (next_bit == -1) {
- break;
- } else if (next_bit != last_bit + 1 ||
- xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
- last_bit = next_bit;
- first_bit = next_bit;
- (*nvecs)++;
- nbits = 1;
- } else {
- last_bit++;
- nbits++;
- }
- *nbytes += XFS_BLF_CHUNK;
- }
+/*
+ * Compute the worst case log item overhead for an invalidated buffer with the
+ * given map count and block size.
+ */
+unsigned int
+xfs_buf_inval_log_space(
+ unsigned int map_count,
+ unsigned int blocksize)
+{
+ unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
+ unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
+ unsigned int ret =
+ offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (bitmap_size * sizeof_field(struct xfs_buf_log_format,
+ blf_data_map[0]));
+
+ return ret * map_count;
}
/*
@@ -277,8 +287,6 @@ xfs_buf_item_format_segment(
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
int first_bit;
- int last_bit;
- int next_bit;
uint nbits;
/* copy the flags across from the base format item */
@@ -323,15 +331,6 @@ xfs_buf_item_format_segment(
nbits = xfs_contig_bits(blfp->blf_data_map,
blfp->blf_map_size, first_bit);
ASSERT(nbits > 0);
-
- /*
- * Straddling a page is rare because we don't log contiguous
- * chunks of unmapped buffers anywhere.
- */
- if (nbits > 1 &&
- xfs_buf_item_straddle(bp, offset, first_bit, nbits))
- goto slow_scan;
-
xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
first_bit, nbits);
blfp->blf_size++;
@@ -347,45 +346,6 @@ xfs_buf_item_format_segment(
} while (first_bit != -1);
return;
-
-slow_scan:
- ASSERT(bp->b_addr == NULL);
- last_bit = first_bit;
- nbits = 1;
- for (;;) {
- /*
- * This takes the bit number to start looking from and
- * returns the next set bit from there. It returns -1
- * if there are no more bits set or the start bit is
- * beyond the end of the bitmap.
- */
- next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
- (uint)last_bit + 1);
- /*
- * If we run out of bits fill in the last iovec and get out of
- * the loop. Else if we start a new set of bits then fill in
- * the iovec for the series we were looking at and start
- * counting the bits in the new one. Else we're still in the
- * same set of bits so just keep counting and scanning.
- */
- if (next_bit == -1) {
- xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
- first_bit, nbits);
- blfp->blf_size++;
- break;
- } else if (next_bit != last_bit + 1 ||
- xfs_buf_item_straddle(bp, offset, first_bit, nbits)) {
- xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
- first_bit, nbits);
- blfp->blf_size++;
- first_bit = next_bit;
- last_bit = next_bit;
- nbits = 1;
- } else {
- last_bit++;
- nbits++;
- }
- }
}
/*
@@ -485,6 +445,42 @@ xfs_buf_item_pin(
}
/*
+ * For a stale BLI, process all the necessary completions that must be
+ * performed when the final BLI reference goes away. The buffer will be
+ * referenced and locked here - we return to the caller with the buffer still
+ * referenced and locked for them to finalise processing of the buffer.
+ */
+static void
+xfs_buf_item_finish_stale(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_item *lip = &bip->bli_item;
+
+ ASSERT(bip->bli_flags & XFS_BLI_STALE);
+ ASSERT(xfs_buf_islocked(bp));
+ ASSERT(bp->b_flags & XBF_STALE);
+ ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
+
+ if (bip->bli_flags & XFS_BLI_STALE_INODE) {
+ xfs_buf_item_done(bp);
+ xfs_buf_inode_iodone(bp);
+ ASSERT(list_empty(&bp->b_li_list));
+ return;
+ }
+
+ /*
+ * We may or may not be on the AIL here, xfs_trans_ail_delete() will do
+ * the right thing regardless of the situation in which we are called.
+ */
+ xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bip);
+ ASSERT(bp->b_log_item == NULL);
+}
+
+/*
* This is called to unpin the buffer associated with the buf log item which was
* previously pinned with a call to xfs_buf_item_pin(). We enter this function
* with a buffer pin count, a buffer reference and a BLI reference.
@@ -533,13 +529,6 @@ xfs_buf_item_unpin(
}
if (stale) {
- ASSERT(bip->bli_flags & XFS_BLI_STALE);
- ASSERT(xfs_buf_islocked(bp));
- ASSERT(bp->b_flags & XBF_STALE);
- ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- ASSERT(list_empty(&lip->li_trans));
- ASSERT(!bp->b_transp);
-
trace_xfs_buf_item_unpin_stale(bip);
/*
@@ -550,22 +539,7 @@ xfs_buf_item_unpin(
* processing is complete.
*/
xfs_buf_rele(bp);
-
- /*
- * If we get called here because of an IO error, we may or may
- * not have the item on the AIL. xfs_trans_ail_delete() will
- * take care of that situation. xfs_trans_ail_delete() drops
- * the AIL lock.
- */
- if (bip->bli_flags & XFS_BLI_STALE_INODE) {
- xfs_buf_item_done(bp);
- xfs_buf_inode_iodone(bp);
- ASSERT(list_empty(&bp->b_li_list));
- } else {
- xfs_trans_ail_delete(lip, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- ASSERT(bp->b_log_item == NULL);
- }
+ xfs_buf_item_finish_stale(bip);
xfs_buf_relse(bp);
return;
}
@@ -638,43 +612,42 @@ xfs_buf_item_push(
* Drop the buffer log item refcount and take appropriate action. This helper
* determines whether the bli must be freed or not, since a decrement to zero
* does not necessarily mean the bli is unused.
- *
- * Return true if the bli is freed, false otherwise.
*/
-bool
+void
xfs_buf_item_put(
struct xfs_buf_log_item *bip)
{
- struct xfs_log_item *lip = &bip->bli_item;
- bool aborted;
- bool dirty;
+
+ ASSERT(xfs_buf_islocked(bip->bli_buf));
/* drop the bli ref and return if it wasn't the last one */
if (!atomic_dec_and_test(&bip->bli_refcount))
- return false;
+ return;
- /*
- * We dropped the last ref and must free the item if clean or aborted.
- * If the bli is dirty and non-aborted, the buffer was clean in the
- * transaction but still awaiting writeback from previous changes. In
- * that case, the bli is freed on buffer writeback completion.
- */
- aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
- xlog_is_shutdown(lip->li_log);
- dirty = bip->bli_flags & XFS_BLI_DIRTY;
- if (dirty && !aborted)
- return false;
+ /* If the BLI is in the AIL, then it is still dirty and in use */
+ if (test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)) {
+ ASSERT(bip->bli_flags & XFS_BLI_DIRTY);
+ return;
+ }
/*
- * The bli is aborted or clean. An aborted item may be in the AIL
- * regardless of dirty state. For example, consider an aborted
- * transaction that invalidated a dirty bli and cleared the dirty
- * state.
+ * In shutdown conditions, we can be asked to free a dirty BLI that
+ * isn't in the AIL. This can occur due to a checkpoint aborting a BLI
+ * instead of inserting it into the AIL at checkpoint IO completion. If
+ * there's another bli reference (e.g. a btree cursor holds a clean
+ * reference) and it is released via xfs_trans_brelse(), we can get here
+ * with that aborted, dirty BLI. In this case, it is safe to free the
+ * dirty BLI immediately, as it is not in the AIL and there are no
+ * other references to it.
+ *
+ * We should never get here with a stale BLI via that path as
+ * xfs_trans_brelse() specifically holds onto stale buffers rather than
+ * releasing them.
*/
- if (aborted)
- xfs_trans_ail_delete(lip, 0);
- xfs_buf_item_relse(bip->bli_buf);
- return true;
+ ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY) ||
+ test_bit(XFS_LI_ABORTED, &bip->bli_item.li_flags));
+ ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+ xfs_buf_item_relse(bip);
}
/*
@@ -695,6 +668,15 @@ xfs_buf_item_put(
* if necessary but do not unlock the buffer. This is for support of
* xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
* free the item.
+ *
+ * If the XFS_BLI_STALE flag is set, the last reference to the BLI *must*
+ * perform a completion abort of any objects attached to the buffer for IO
+ * tracking purposes. This generally only happens in shutdown situations,
+ * normally xfs_buf_item_unpin() will drop the last BLI reference and perform
+ * completion processing. However, because transaction completion can race with
+ * checkpoint completion during a shutdown, this release context may end up
+ * being the last active reference to the BLI and so needs to perform this
+ * cleanup.
*/
STATIC void
xfs_buf_item_release(
@@ -702,18 +684,19 @@ xfs_buf_item_release(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool released;
bool hold = bip->bli_flags & XFS_BLI_HOLD;
bool stale = bip->bli_flags & XFS_BLI_STALE;
-#if defined(DEBUG) || defined(XFS_WARN)
- bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
- bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
bool aborted = test_bit(XFS_LI_ABORTED,
&lip->li_flags);
+ bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
+#if defined(DEBUG) || defined(XFS_WARN)
+ bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
#endif
trace_xfs_buf_item_release(bip);
+ ASSERT(xfs_buf_islocked(bp));
+
/*
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
@@ -729,16 +712,56 @@ xfs_buf_item_release(
bp->b_transp = NULL;
bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+ /* If there are other references, then we have nothing to do. */
+ if (!atomic_dec_and_test(&bip->bli_refcount))
+ goto out_release;
+
+ /*
+ * Stale buffer completion frees the BLI, unlocks and releases the
+ * buffer. Neither the BLI or buffer are safe to reference after this
+ * call, so there's nothing more we need to do here.
+ *
+ * If we get here with a stale buffer and references to the BLI remain,
+ * we must not unlock the buffer as the last BLI reference owns lock
+ * context, not us.
+ */
+ if (stale) {
+ xfs_buf_item_finish_stale(bip);
+ xfs_buf_relse(bp);
+ ASSERT(!hold);
+ return;
+ }
+
+ /*
+ * Dirty or clean, aborted items are done and need to be removed from
+ * the AIL and released. This frees the BLI, but leaves the buffer
+ * locked and referenced.
+ */
+ if (aborted || xlog_is_shutdown(lip->li_log)) {
+ ASSERT(list_empty(&bip->bli_buf->b_li_list));
+ xfs_buf_item_done(bp);
+ goto out_release;
+ }
+
+ /*
+ * Clean, unreferenced BLIs can be immediately freed, leaving the buffer
+ * locked and referenced.
+ *
+ * Dirty, unreferenced BLIs *must* be in the AIL awaiting writeback.
+ */
+ if (!dirty)
+ xfs_buf_item_relse(bip);
+ else
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
+
+ /* Not safe to reference the BLI from here */
+out_release:
/*
- * Unref the item and unlock the buffer unless held or stale. Stale
- * buffers remain locked until final unpin unless the bli is freed by
- * the unref call. The latter implies shutdown because buffer
- * invalidation dirties the bli and transaction.
+ * If we get here with a stale buffer, we must not unlock the
+ * buffer as the last BLI reference owns lock context, not us.
*/
- released = xfs_buf_item_put(bip);
- if (hold || (stale && !released))
+ if (stale || hold)
return;
- ASSERT(!stale || aborted);
xfs_buf_relse(bp);
}
@@ -824,33 +847,6 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
.iop_push = xfs_buf_item_push,
};
-STATIC void
-xfs_buf_item_get_format(
- struct xfs_buf_log_item *bip,
- int count)
-{
- ASSERT(bip->bli_formats == NULL);
- bip->bli_format_count = count;
-
- if (count == 1) {
- bip->bli_formats = &bip->__bli_format;
- return;
- }
-
- bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
- GFP_KERNEL | __GFP_NOFAIL);
-}
-
-STATIC void
-xfs_buf_item_free_format(
- struct xfs_buf_log_item *bip)
-{
- if (bip->bli_formats != &bip->__bli_format) {
- kfree(bip->bli_formats);
- bip->bli_formats = NULL;
- }
-}
-
/*
* Allocate a new buf log item to go with the given buffer.
* Set the buffer's b_log_item field to point to the new
@@ -1071,34 +1067,6 @@ xfs_buf_item_dirty_format(
return false;
}
-STATIC void
-xfs_buf_item_free(
- struct xfs_buf_log_item *bip)
-{
- xfs_buf_item_free_format(bip);
- kvfree(bip->bli_item.li_lv_shadow);
- kmem_cache_free(xfs_buf_item_cache, bip);
-}
-
-/*
- * xfs_buf_item_relse() is called when the buf log item is no longer needed.
- */
-void
-xfs_buf_item_relse(
- struct xfs_buf *bp)
-{
- struct xfs_buf_log_item *bip = bp->b_log_item;
-
- trace_xfs_buf_item_relse(bp, _RET_IP_);
- ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
-
- if (atomic_read(&bip->bli_refcount))
- return;
- bp->b_log_item = NULL;
- xfs_buf_rele(bp);
- xfs_buf_item_free(bip);
-}
-
void
xfs_buf_item_done(
struct xfs_buf *bp)
@@ -1118,5 +1086,5 @@ xfs_buf_item_done(
xfs_trans_ail_delete(&bp->b_log_item->bli_item,
(bp->b_flags & _XBF_LOGRECOVERY) ? 0 :
SHUTDOWN_CORRUPT_INCORE);
- xfs_buf_item_relse(bp);
+ xfs_buf_item_relse(bp->b_log_item);
}
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 8cde85259a58..3159325dd17b 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -49,8 +49,7 @@ struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_done(struct xfs_buf *bp);
-void xfs_buf_item_relse(struct xfs_buf *);
-bool xfs_buf_item_put(struct xfs_buf_log_item *);
+void xfs_buf_item_put(struct xfs_buf_log_item *bip);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_inode_iodone(struct xfs_buf *);
@@ -62,7 +61,10 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
}
#endif /* CONFIG_XFS_QUOTA */
void xfs_buf_iodone(struct xfs_buf *);
-bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+bool xfs_buf_log_check_iovec(struct kvec *iovec);
+
+unsigned int xfs_buf_inval_log_space(unsigned int map_count,
+ unsigned int blocksize);
extern struct kmem_cache *xfs_buf_item_cache;
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 05a2f6927c12..5d58e2ae4972 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -159,7 +159,7 @@ STATIC enum xlog_recover_reorder
xlog_recover_buf_reorder(
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
if (buf_f->blf_flags & XFS_BLF_CANCEL)
return XLOG_REORDER_CANCEL_LIST;
@@ -173,7 +173,7 @@ xlog_recover_buf_ra_pass2(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, buf_f->blf_blkno, buf_f->blf_len, NULL);
}
@@ -187,11 +187,11 @@ xlog_recover_buf_commit_pass1(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_buf_log_format *bf = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *bf = item->ri_buf[0].iov_base;
if (!xfs_buf_log_check_iovec(&item->ri_buf[0])) {
- xfs_err(log->l_mp, "bad buffer log item size (%d)",
- item->ri_buf[0].i_len);
+ xfs_err(log->l_mp, "bad buffer log item size (%zd)",
+ item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -487,8 +487,8 @@ xlog_recover_do_reg_buffer(
nbits = xfs_contig_bits(buf_f->blf_data_map,
buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
- ASSERT(item->ri_buf[i].i_addr != NULL);
- ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
+ ASSERT(item->ri_buf[i].iov_base != NULL);
+ ASSERT(item->ri_buf[i].iov_len % XFS_BLF_CHUNK == 0);
ASSERT(BBTOB(bp->b_length) >=
((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
@@ -500,8 +500,8 @@ xlog_recover_do_reg_buffer(
* the log. Hence we need to trim nbits back to the length of
* the current region being copied out of the log.
*/
- if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
- nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
+ if (item->ri_buf[i].iov_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].iov_len >> XFS_BLF_SHIFT;
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -511,18 +511,18 @@ xlog_recover_do_reg_buffer(
fa = NULL;
if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
- if (item->ri_buf[i].i_addr == NULL) {
+ if (item->ri_buf[i].iov_base == NULL) {
xfs_alert(mp,
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < size_disk_dquot) {
+ if (item->ri_buf[i].iov_len < size_disk_dquot) {
xfs_alert(mp,
- "XFS: dquot too small (%d) in %s.",
- item->ri_buf[i].i_len, __func__);
+ "XFS: dquot too small (%zd) in %s.",
+ item->ri_buf[i].iov_len, __func__);
goto next;
}
- fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, -1);
+ fa = xfs_dquot_verify(mp, item->ri_buf[i].iov_base, -1);
if (fa) {
xfs_alert(mp,
"dquot corrupt at %pS trying to replay into block 0x%llx",
@@ -533,7 +533,7 @@ xlog_recover_do_reg_buffer(
memcpy(xfs_buf_offset(bp,
(uint)bit << XFS_BLF_SHIFT), /* dest */
- item->ri_buf[i].i_addr, /* source */
+ item->ri_buf[i].iov_base, /* source */
nbits<<XFS_BLF_SHIFT); /* length */
next:
i++;
@@ -669,8 +669,8 @@ xlog_recover_do_inode_buffer(
if (next_unlinked_offset < reg_buf_offset)
continue;
- ASSERT(item->ri_buf[item_index].i_addr != NULL);
- ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
+ ASSERT(item->ri_buf[item_index].iov_base != NULL);
+ ASSERT((item->ri_buf[item_index].iov_len % XFS_BLF_CHUNK) == 0);
ASSERT((reg_buf_offset + reg_buf_bytes) <= BBTOB(bp->b_length));
/*
@@ -678,7 +678,7 @@ xlog_recover_do_inode_buffer(
* current di_next_unlinked field. Extract its value
* and copy it to the buffer copy.
*/
- logged_nextp = item->ri_buf[item_index].i_addr +
+ logged_nextp = item->ri_buf[item_index].iov_base +
next_unlinked_offset - reg_buf_offset;
if (XFS_IS_CORRUPT(mp, *logged_nextp == 0)) {
xfs_alert(mp,
@@ -1002,11 +1002,10 @@ xlog_recover_buf_commit_pass2(
struct xlog_recover_item *item,
xfs_lsn_t current_lsn)
{
- struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].iov_base;
struct xfs_mount *mp = log->l_mp;
struct xfs_buf *bp;
int error;
- uint buf_flags;
xfs_lsn_t lsn;
/*
@@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2(
}
trace_xfs_log_recover_buf_recover(log, buf_f);
-
- buf_flags = 0;
- if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
- buf_flags |= XBF_UNMAPPED;
-
error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags, &bp, NULL);
+ 0, &bp, NULL);
if (error)
return error;
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
index 07bebbfb16ee..dcbfa274e06d 100644
--- a/fs/xfs/xfs_buf_mem.c
+++ b/fs/xfs/xfs_buf_mem.c
@@ -74,7 +74,7 @@ xmbuf_alloc(
/*
* We don't want to bother with kmapping data during repair, so don't
- * allow highmem pages to back this mapping.
+ * allow highmem folios to back this mapping.
*/
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
@@ -117,7 +117,7 @@ xmbuf_free(
struct xfs_buftarg *btp)
{
ASSERT(xfs_buftarg_is_mem(btp));
- ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+ ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
trace_xmbuf_free(btp);
@@ -127,14 +127,13 @@ xmbuf_free(
kfree(btp);
}
-/* Directly map a shmem page into the buffer cache. */
+/* Directly map a shmem folio into the buffer cache. */
int
-xmbuf_map_page(
+xmbuf_map_backing_mem(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
struct folio *folio = NULL;
- struct page *page;
loff_t pos = BBTOB(xfs_buf_daddr(bp));
int error;
@@ -159,39 +158,17 @@ xmbuf_map_page(
return -EIO;
}
- page = folio_file_page(folio, pos >> PAGE_SHIFT);
-
/*
- * Mark the page dirty so that it won't be reclaimed once we drop the
- * (potentially last) reference in xmbuf_unmap_page.
+ * Mark the folio dirty so that it won't be reclaimed once we drop the
+ * (potentially last) reference in xfs_buf_free.
*/
- set_page_dirty(page);
- unlock_page(page);
+ folio_set_dirty(folio);
+ folio_unlock(folio);
- bp->b_addr = page_address(page);
- bp->b_pages = bp->b_page_array;
- bp->b_pages[0] = page;
- bp->b_page_count = 1;
+ bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos);
return 0;
}
-/* Unmap a shmem page that was mapped into the buffer cache. */
-void
-xmbuf_unmap_page(
- struct xfs_buf *bp)
-{
- struct page *page = bp->b_pages[0];
-
- ASSERT(xfs_buftarg_is_mem(bp->b_target));
-
- put_page(page);
-
- bp->b_addr = NULL;
- bp->b_pages[0] = NULL;
- bp->b_pages = NULL;
- bp->b_page_count = 0;
-}
-
/* Is this a valid daddr within the buftarg? */
bool
xmbuf_verify_daddr(
@@ -205,7 +182,7 @@ xmbuf_verify_daddr(
return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
}
-/* Discard the page backing this buffer. */
+/* Discard the folio backing this buffer. */
static void
xmbuf_stale(
struct xfs_buf *bp)
@@ -220,7 +197,7 @@ xmbuf_stale(
}
/*
- * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * Finalize a buffer -- discard the backing folio if it's stale, or run the
* write verifier to detect problems.
*/
int
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
index eed4a7b63232..67d525cc1513 100644
--- a/fs/xfs/xfs_buf_mem.h
+++ b/fs/xfs/xfs_buf_mem.h
@@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
struct xfs_buftarg **btpp);
void xmbuf_free(struct xfs_buftarg *btp);
-int xmbuf_map_page(struct xfs_buf *bp);
-void xmbuf_unmap_page(struct xfs_buf *bp);
bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
int xmbuf_finalize(struct xfs_buf *bp);
#else
# define xfs_buftarg_is_mem(...) (false)
-# define xmbuf_map_page(...) (-ENOMEM)
-# define xmbuf_unmap_page(...) ((void)0)
# define xmbuf_verify_daddr(...) (false)
#endif /* CONFIG_XFS_MEMORY_BUFS */
+int xmbuf_map_backing_mem(struct xfs_buf *bp);
+
#endif /* __XFS_BUF_MEM_H__ */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 3f2403a7b49c..ee49f20875af 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -103,24 +103,6 @@ xfs_discard_endio(
bio_put(bio);
}
-static inline struct block_device *
-xfs_group_bdev(
- const struct xfs_group *xg)
-{
- struct xfs_mount *mp = xg->xg_mount;
-
- switch (xg->xg_type) {
- case XG_TYPE_AG:
- return mp->m_ddev_targp->bt_bdev;
- case XG_TYPE_RTG:
- return mp->m_rtdev_targp->bt_bdev;
- default:
- ASSERT(0);
- break;
- }
- return NULL;
-}
-
/*
* Walk the discard list and issue discards on all the busy extents in the
* list. We plug and chain the bios so that we only need a single completion
@@ -138,11 +120,14 @@ xfs_discard_extents(
blk_start_plug(&plug);
list_for_each_entry(busyp, &extents->extent_list, list) {
- trace_xfs_discard_extent(busyp->group, busyp->bno,
- busyp->length);
+ struct xfs_group *xg = busyp->group;
+ struct xfs_buftarg *btp =
+ xfs_group_type_buftarg(xg->xg_mount, xg->xg_type);
- error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
- xfs_gbno_to_daddr(busyp->group, busyp->bno),
+ trace_xfs_discard_extent(xg, busyp->bno, busyp->length);
+
+ error = __blkdev_issue_discard(btp->bt_bdev,
+ xfs_gbno_to_daddr(xg, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_KERNEL, &bio);
if (error && error != -EOPNOTSUPP) {
@@ -167,6 +152,14 @@ xfs_discard_extents(
return error;
}
+/*
+ * Care must be taken setting up the trim cursor as the perags may not have been
+ * initialised when the cursor is initialised. e.g. a clean mount which hasn't
+ * read in AGFs and the first operation run on the mounted fs is a trim. This
+ * can result in perag fields that aren't initialised until
+ * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for
+ * the free space search.
+ */
struct xfs_trim_cur {
xfs_agblock_t start;
xfs_extlen_t count;
@@ -196,14 +189,20 @@ xfs_trim_gather_extents(
*/
xfs_log_force(mp, XFS_LOG_SYNC);
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
if (error)
goto out_trans_cancel;
+ /*
+ * First time through tcur->count will not have been initialised as
+ * pag->pagf_longest is not guaranteed to be valid before we read
+ * the AGF buffer above.
+ */
+ if (!tcur->count)
+ tcur->count = pag->pagf_longest;
+
if (tcur->by_bno) {
/* sub-AG discard request always starts at tcur->start */
cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
@@ -350,7 +349,6 @@ xfs_trim_perag_extents(
{
struct xfs_trim_cur tcur = {
.start = start,
- .count = pag->pagf_longest,
.end = end,
.minlen = minlen,
};
@@ -583,9 +581,7 @@ xfs_trim_rtextents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
/*
* Walk the free ranges between low and high. The query_range function
@@ -701,9 +697,7 @@ xfs_trim_rtgroup_extents(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
/*
* Walk the free ranges between low and high. The query_range function
@@ -844,7 +838,8 @@ xfs_ioc_trim(
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (mp->m_rtdev_targp &&
+
+ if (mp->m_rtdev_targp && !xfs_has_zoned(mp) &&
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
rt_bdev = mp->m_rtdev_targp->bt_bdev;
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index edbc521870a1..0bd8022e47b4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done(
if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) &&
(lip->li_lsn == qlip->qli_flush_lsn ||
test_bit(XFS_LI_FAILED, &lip->li_flags))) {
-
spin_lock(&ailp->ail_lock);
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
if (lip->li_lsn == qlip->qli_flush_lsn) {
/* xfs_ail_update_finish() drops the AIL lock */
tail_lsn = xfs_ail_delete_one(ailp, lip);
@@ -1399,11 +1398,9 @@ xfs_qm_dqflush(
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(!completion_done(&dqp->q_flush));
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
trace_xfs_dqflush(dqp);
-
- xfs_qm_dqunpin_wait(dqp);
-
fa = xfs_qm_dqflush_check(dqp);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c
index 2c2720ce6923..89bc9bcaf51e 100644
--- a/fs/xfs/xfs_dquot_item_recover.c
+++ b/fs/xfs/xfs_dquot_item_recover.c
@@ -34,10 +34,10 @@ xlog_recover_dquot_ra_pass2(
if (mp->m_qflags == 0)
return;
- recddq = item->ri_buf[1].i_addr;
+ recddq = item->ri_buf[1].iov_base;
if (recddq == NULL)
return;
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot))
return;
type = recddq->d_type & XFS_DQTYPE_REC_MASK;
@@ -45,7 +45,7 @@ xlog_recover_dquot_ra_pass2(
if (log->l_quotaoffs_flag & type)
return;
- dq_f = item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].iov_base;
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
@@ -79,14 +79,14 @@ xlog_recover_dquot_commit_pass2(
if (mp->m_qflags == 0)
return 0;
- recddq = item->ri_buf[1].i_addr;
+ recddq = item->ri_buf[1].iov_base;
if (recddq == NULL) {
xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
return -EFSCORRUPTED;
}
- if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
- xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
- item->ri_buf[1].i_len, __func__);
+ if (item->ri_buf[1].iov_len < sizeof(struct xfs_disk_dquot)) {
+ xfs_alert(log->l_mp, "dquot too small (%zd) in %s.",
+ item->ri_buf[1].iov_len, __func__);
return -EFSCORRUPTED;
}
@@ -108,7 +108,7 @@ xlog_recover_dquot_commit_pass2(
* The other possibility, of course, is that the quota subsystem was
* removed since the last mount - ENOSYS.
*/
- dq_f = item->ri_buf[0].i_addr;
+ dq_f = item->ri_buf[0].iov_base;
ASSERT(dq_f);
fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id);
if (fa) {
@@ -147,7 +147,7 @@ xlog_recover_dquot_commit_pass2(
}
}
- memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ memcpy(ddq, recddq, item->ri_buf[1].iov_len);
if (xfs_has_crc(mp)) {
xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk),
XFS_DQUOT_CRC_OFF);
@@ -192,7 +192,7 @@ xlog_recover_quotaoff_commit_pass1(
struct xlog *log,
struct xlog_recover_item *item)
{
- struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].i_addr;
+ struct xfs_qoff_logformat *qoff_f = item->ri_buf[0].iov_base;
ASSERT(qoff_f);
/*
diff --git a/fs/xfs/xfs_exchmaps_item.c b/fs/xfs/xfs_exchmaps_item.c
index 264a121c5e16..229cbe0adf17 100644
--- a/fs/xfs/xfs_exchmaps_item.c
+++ b/fs/xfs/xfs_exchmaps_item.c
@@ -558,12 +558,12 @@ xlog_recover_xmi_commit_pass2(
size_t len;
len = sizeof(struct xfs_xmi_log_format);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
- xmi_formatp = item->ri_buf[0].i_addr;
+ xmi_formatp = item->ri_buf[0].iov_base;
if (xmi_formatp->__pad != 0) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
@@ -598,8 +598,8 @@ xlog_recover_xmd_commit_pass2(
{
struct xfs_xmd_log_format *xmd_formatp;
- xmd_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_xmd_log_format)) {
+ xmd_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_xmd_log_format)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index f340a2015c4c..0b41bdfecdfb 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -329,22 +329,6 @@ out_trans_cancel:
* successfully but before locks are dropped.
*/
-/* Verify that we have security clearance to perform this operation. */
-static int
-xfs_exchange_range_verify_area(
- struct xfs_exchrange *fxr)
-{
- int ret;
-
- ret = remap_verify_area(fxr->file1, fxr->file1_offset, fxr->length,
- true);
- if (ret)
- return ret;
-
- return remap_verify_area(fxr->file2, fxr->file2_offset, fxr->length,
- true);
-}
-
/*
* Performs necessary checks before doing a range exchange, having stabilized
* mutable inode attributes via i_rwsem.
@@ -355,11 +339,13 @@ xfs_exchange_range_checks(
unsigned int alloc_unit)
{
struct inode *inode1 = file_inode(fxr->file1);
+ loff_t size1 = i_size_read(inode1);
struct inode *inode2 = file_inode(fxr->file2);
+ loff_t size2 = i_size_read(inode2);
uint64_t allocmask = alloc_unit - 1;
int64_t test_len;
uint64_t blen;
- loff_t size1, size2, tmp;
+ loff_t tmp;
int error;
/* Don't touch certain kinds of inodes */
@@ -368,24 +354,25 @@ xfs_exchange_range_checks(
if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2))
return -ETXTBSY;
- size1 = i_size_read(inode1);
- size2 = i_size_read(inode2);
-
/* Ranges cannot start after EOF. */
if (fxr->file1_offset > size1 || fxr->file2_offset > size2)
return -EINVAL;
- /*
- * If the caller said to exchange to EOF, we set the length of the
- * request large enough to cover everything to the end of both files.
- */
if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) {
+ /*
+ * If the caller said to exchange to EOF, we set the length of
+ * the request large enough to cover everything to the end of
+ * both files.
+ */
fxr->length = max_t(int64_t, size1 - fxr->file1_offset,
size2 - fxr->file2_offset);
-
- error = xfs_exchange_range_verify_area(fxr);
- if (error)
- return error;
+ } else {
+ /*
+ * Otherwise we require both ranges to end within EOF.
+ */
+ if (fxr->file1_offset + fxr->length > size1 ||
+ fxr->file2_offset + fxr->length > size2)
+ return -EINVAL;
}
/*
@@ -402,15 +389,6 @@ xfs_exchange_range_checks(
return -EINVAL;
/*
- * We require both ranges to end within EOF, unless we're exchanging
- * to EOF.
- */
- if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) &&
- (fxr->file1_offset + fxr->length > size1 ||
- fxr->file2_offset + fxr->length > size2))
- return -EINVAL;
-
- /*
* Make sure we don't hit any file size limits. If we hit any size
* limits such that test_length was adjusted, we abort the whole
* operation.
@@ -747,6 +725,7 @@ xfs_exchange_range(
{
struct inode *inode1 = file_inode(fxr->file1);
struct inode *inode2 = file_inode(fxr->file2);
+ loff_t check_len = fxr->length;
int ret;
BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS &
@@ -779,14 +758,18 @@ xfs_exchange_range(
return -EBADF;
/*
- * If we're not exchanging to EOF, we can check the areas before
- * stabilizing both files' i_size.
+ * If we're exchanging to EOF we can't calculate the length until taking
+ * the iolock. Pass a 0 length to remap_verify_area similar to the
+ * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well.
*/
- if (!(fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)) {
- ret = xfs_exchange_range_verify_area(fxr);
- if (ret)
- return ret;
- }
+ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF)
+ check_len = 0;
+ ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true);
+ if (ret)
+ return ret;
+ ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true);
+ if (ret)
+ return ret;
/* Update cmtime if the fd/inode don't forbid it. */
if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1))
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index ea43c9a6e54c..da3161572735 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -671,7 +671,7 @@ xfs_extent_busy_wait_all(
while ((pag = xfs_perag_next(mp, pag)))
xfs_extent_busy_wait_group(pag_group(pag));
- if (xfs_has_rtgroups(mp))
+ if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp))
while ((rtg = xfs_rtgroup_next(mp, rtg)))
xfs_extent_busy_wait_group(rtg_group(rtg));
}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index f069b04e8ea1..3e6e019b6146 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -68,4 +68,12 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+/*
+ * Zoned RTGs don't need to track busy extents, as the actual block freeing only
+ * happens by a zone reset, which forces out all transactions that touched the
+ * to be reset zone first.
+ */
+#define xfs_group_has_extent_busy(mp, type) \
+ ((type) == XG_TYPE_AG || !xfs_has_zoned((mp)))
+
#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a25c713ff888..47ee598a9827 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_efi_cache;
struct kmem_cache *xfs_efd_cache;
@@ -82,6 +83,11 @@ xfs_efi_item_size(
*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
+unsigned int xfs_efi_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efi log item. We use only 1 iovec, and we point that
@@ -176,15 +182,18 @@ xfs_efi_init(
* It will handle the conversion of formats if necessary.
*/
STATIC int
-xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
+xfs_efi_copy_format(
+ struct kvec *buf,
+ struct xfs_efi_log_format *dst_efi_fmt)
{
- xfs_efi_log_format_t *src_efi_fmt = buf->i_addr;
- uint i;
- uint len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
- uint len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
- uint len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
+ struct xfs_efi_log_format *src_efi_fmt = buf->iov_base;
+ uint len, len32, len64, i;
+
+ len = xfs_efi_log_format_sizeof(src_efi_fmt->efi_nextents);
+ len32 = xfs_efi_log_format32_sizeof(src_efi_fmt->efi_nextents);
+ len64 = xfs_efi_log_format64_sizeof(src_efi_fmt->efi_nextents);
- if (buf->i_len == len) {
+ if (buf->iov_len == len) {
memcpy(dst_efi_fmt, src_efi_fmt,
offsetof(struct xfs_efi_log_format, efi_extents));
for (i = 0; i < src_efi_fmt->efi_nextents; i++)
@@ -192,8 +201,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
&src_efi_fmt->efi_extents[i],
sizeof(struct xfs_extent));
return 0;
- } else if (buf->i_len == len32) {
- xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->i_addr;
+ } else if (buf->iov_len == len32) {
+ xfs_efi_log_format_32_t *src_efi_fmt_32 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_32->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_32->efi_size;
@@ -206,8 +215,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
src_efi_fmt_32->efi_extents[i].ext_len;
}
return 0;
- } else if (buf->i_len == len64) {
- xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->i_addr;
+ } else if (buf->iov_len == len64) {
+ xfs_efi_log_format_64_t *src_efi_fmt_64 = buf->iov_base;
dst_efi_fmt->efi_type = src_efi_fmt_64->efi_type;
dst_efi_fmt->efi_size = src_efi_fmt_64->efi_size;
@@ -221,8 +230,8 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->i_addr,
- buf->i_len);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, NULL, buf->iov_base,
+ buf->iov_len);
return -EFSCORRUPTED;
}
@@ -253,6 +262,11 @@ xfs_efd_item_size(
*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
+unsigned int xfs_efd_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efd log item. We use only 1 iovec, and we point that
@@ -767,21 +781,35 @@ xfs_rtextent_free_finish_item(
trace_xfs_extent_free_deferred(mp, xefi);
- if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) {
- if (*rtgp != to_rtg(xefi->xefi_group)) {
- *rtgp = to_rtg(xefi->xefi_group);
- xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP);
- xfs_rtgroup_trans_join(tp, *rtgp,
- XFS_RTGLOCK_BITMAP);
- }
- error = xfs_rtfree_blocks(tp, *rtgp,
- xefi->xefi_startblock, xefi->xefi_blockcount);
+ if (xefi->xefi_flags & XFS_EFI_CANCELLED)
+ goto done;
+
+ if (*rtgp != to_rtg(xefi->xefi_group)) {
+ unsigned int lock_flags;
+
+ if (xfs_has_zoned(mp))
+ lock_flags = XFS_RTGLOCK_RMAP;
+ else
+ lock_flags = XFS_RTGLOCK_BITMAP;
+
+ *rtgp = to_rtg(xefi->xefi_group);
+ xfs_rtgroup_lock(*rtgp, lock_flags);
+ xfs_rtgroup_trans_join(tp, *rtgp, lock_flags);
+ }
+
+ if (xfs_has_zoned(mp)) {
+ error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock,
+ xefi->xefi_blockcount);
+ } else {
+ error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock,
+ xefi->xefi_blockcount);
}
+
if (error == -EAGAIN) {
xfs_efd_from_efi(efdp);
return error;
}
-
+done:
xfs_efd_add_extent(efdp, xefi);
xfs_extent_free_cancel_item(item);
return error;
@@ -840,11 +868,11 @@ xlog_recover_efi_commit_pass2(
struct xfs_efi_log_format *efi_formatp;
int error;
- efi_formatp = item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -879,11 +907,11 @@ xlog_recover_rtefi_commit_pass2(
struct xfs_efi_log_format *efi_formatp;
int error;
- efi_formatp = item->ri_buf[0].i_addr;
+ efi_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_efi_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_efi_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -908,7 +936,7 @@ xlog_recover_rtefi_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -933,9 +961,9 @@ xlog_recover_efd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
- int buflen = item->ri_buf[0].i_len;
+ int buflen = item->ri_buf[0].iov_len;
- efd_formatp = item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].iov_base;
if (buflen < sizeof(struct xfs_efd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
@@ -943,9 +971,9 @@ xlog_recover_efd_commit_pass2(
return -EFSCORRUPTED;
}
- if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof(
efd_formatp->efd_nextents) &&
- item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof(
efd_formatp->efd_nextents)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
efd_formatp, buflen);
@@ -970,9 +998,9 @@ xlog_recover_rtefd_commit_pass2(
xfs_lsn_t lsn)
{
struct xfs_efd_log_format *efd_formatp;
- int buflen = item->ri_buf[0].i_len;
+ int buflen = item->ri_buf[0].iov_len;
- efd_formatp = item->ri_buf[0].i_addr;
+ efd_formatp = item->ri_buf[0].iov_base;
if (buflen < sizeof(struct xfs_efd_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
@@ -980,9 +1008,9 @@ xlog_recover_rtefd_commit_pass2(
return -EFSCORRUPTED;
}
- if (item->ri_buf[0].i_len != xfs_efd_log_format32_sizeof(
+ if (item->ri_buf[0].iov_len != xfs_efd_log_format32_sizeof(
efd_formatp->efd_nextents) &&
- item->ri_buf[0].i_len != xfs_efd_log_format64_sizeof(
+ item->ri_buf[0].iov_len != xfs_efd_log_format64_sizeof(
efd_formatp->efd_nextents)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
efd_formatp, buflen);
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 41b7c4306079..c8402040410b 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
struct xfs_extent_free_item *xefi,
struct xfs_defer_pending **dfpp);
+unsigned int xfs_efi_log_space(unsigned int nr);
+unsigned int xfs_efd_log_space(unsigned int nr);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f7a7d89c345e..f96fbf5c54c9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -25,6 +25,8 @@
#include "xfs_iomap.h"
#include "xfs_reflink.h"
#include "xfs_file.h"
+#include "xfs_aops.h"
+#include "xfs_zone_alloc.h"
#include <linux/dax.h>
#include <linux/falloc.h>
@@ -150,7 +152,7 @@ xfs_file_fsync(
* ensure newly written file data make it to disk before logging the new
* inode size in case of an extending write.
*/
- if (XFS_IS_REALTIME_INODE(ip))
+ if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
else if (mp->m_logdev_targp != mp->m_ddev_targp)
error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
@@ -360,7 +362,8 @@ xfs_file_write_zero_eof(
struct iov_iter *from,
unsigned int *iolock,
size_t count,
- bool *drained_dio)
+ bool *drained_dio,
+ struct xfs_zone_alloc_ctx *ac)
{
struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
loff_t isize;
@@ -414,7 +417,7 @@ xfs_file_write_zero_eof(
trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL);
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
return error;
@@ -431,7 +434,8 @@ STATIC ssize_t
xfs_file_write_checks(
struct kiocb *iocb,
struct iov_iter *from,
- unsigned int *iolock)
+ unsigned int *iolock,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
@@ -481,7 +485,7 @@ restart:
*/
if (iocb->ki_pos > i_size_read(inode)) {
error = xfs_file_write_zero_eof(iocb, from, iolock, count,
- &drained_dio);
+ &drained_dio, ac);
if (error == 1)
goto restart;
if (error)
@@ -491,6 +495,48 @@ restart:
return kiocb_modified(iocb);
}
+static ssize_t
+xfs_zoned_write_space_reserve(
+ struct xfs_mount *mp,
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ loff_t count = iov_iter_count(from);
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= XFS_ZR_NOWAIT;
+
+ /*
+ * Check the rlimit and LFS boundary first so that we don't over-reserve
+ * by possibly a lot.
+ *
+ * The generic write path will redo this check later, and it might have
+ * changed by then. If it got expanded we'll stick to our earlier
+ * smaller limit, and if it is decreased the new smaller limit will be
+ * used and our extra space reservation will be returned after finishing
+ * the write.
+ */
+ error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count);
+ if (error)
+ return error;
+
+ /*
+ * Sloppily round up count to file system blocks.
+ *
+ * This will often reserve an extra block, but that avoids having to look
+ * at the start offset, which isn't stable for O_APPEND until taking the
+ * iolock. Also we need to reserve a block each for zeroing the old
+ * EOF block and the new start block if they are unaligned.
+ *
+ * Any remaining block will be returned after the write.
+ */
+ return xfs_zoned_space_reserve(mp, XFS_B_TO_FSB(mp, count) + 1 + 2,
+ flags, ac);
+}
+
static int
xfs_dio_write_end_io(
struct kiocb *iocb,
@@ -503,6 +549,9 @@ xfs_dio_write_end_io(
loff_t offset = iocb->ki_pos;
unsigned int nofs_flag;
+ ASSERT(!xfs_is_zoned_inode(ip) ||
+ !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW)));
+
trace_xfs_end_io_direct_write(ip, offset, size);
if (xfs_is_shutdown(ip->i_mount))
@@ -527,7 +576,10 @@ xfs_dio_write_end_io(
nofs_flag = memalloc_nofs_save();
if (flags & IOMAP_DIO_COW) {
- error = xfs_reflink_end_cow(ip, offset, size);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ error = xfs_reflink_end_atomic_cow(ip, offset, size);
+ else
+ error = xfs_reflink_end_cow(ip, offset, size);
if (error)
goto out;
}
@@ -582,14 +634,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
.end_io = xfs_dio_write_end_io,
};
+static void
+xfs_dio_zoned_submit_io(
+ const struct iomap_iter *iter,
+ struct bio *bio,
+ loff_t file_offset)
+{
+ struct xfs_mount *mp = XFS_I(iter->inode)->i_mount;
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ xfs_filblks_t count_fsb;
+ struct iomap_ioend *ioend;
+
+ count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size);
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_err(mp,
+"allocation (%lld) larger than reservation (%lld).",
+ count_fsb, ac->reserved_blocks);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ bio_io_error(bio);
+ return;
+ }
+ ac->reserved_blocks -= count_fsb;
+
+ bio->bi_end_io = xfs_end_bio;
+ ioend = iomap_init_ioend(iter->inode, bio, file_offset,
+ IOMAP_IOEND_DIRECT);
+ xfs_zone_alloc_and_submit(ioend, &ac->open_zone);
+}
+
+static const struct iomap_dio_ops xfs_dio_zoned_write_ops = {
+ .bio_set = &iomap_ioend_bioset,
+ .submit_io = xfs_dio_zoned_submit_io,
+ .end_io = xfs_dio_write_end_io,
+};
+
/*
- * Handle block aligned direct I/O writes
+ * Handle block aligned direct I/O writes.
*/
static noinline ssize_t
xfs_file_dio_write_aligned(
struct xfs_inode *ip,
struct kiocb *iocb,
- struct iov_iter *from)
+ struct iov_iter *from,
+ const struct iomap_ops *ops,
+ const struct iomap_dio_ops *dops,
+ struct xfs_zone_alloc_ctx *ac)
{
unsigned int iolock = XFS_IOLOCK_SHARED;
ssize_t ret;
@@ -597,7 +686,7 @@ xfs_file_dio_write_aligned(
ret = xfs_ilock_iocb_for_write(iocb, &iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, ac);
if (ret)
goto out_unlock;
@@ -611,8 +700,94 @@ xfs_file_dio_write_aligned(
iolock = XFS_IOLOCK_SHARED;
}
trace_xfs_file_direct_write(iocb, from);
- ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
- &xfs_dio_write_ops, 0, NULL, 0);
+ ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0);
+out_unlock:
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
+/*
+ * Handle block aligned direct I/O writes to zoned devices.
+ */
+static noinline ssize_t
+xfs_file_dio_write_zoned(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(ip->i_mount, iocb, from, 0, &ac);
+ if (ret < 0)
+ return ret;
+ ret = xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_zoned_direct_write_iomap_ops,
+ &xfs_dio_zoned_write_ops, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ return ret;
+}
+
+/*
+ * Handle block atomic writes
+ *
+ * Two methods of atomic writes are supported:
+ * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
+ * disk
+ * - COW-based, which uses a COW fork as a staging extent for data updates
+ * before atomically updating extent mappings for the range being written
+ *
+ */
+static noinline ssize_t
+xfs_file_dio_write_atomic(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret, ocount = iov_iter_count(from);
+ const struct iomap_ops *dops;
+
+ /*
+ * HW offload should be faster, so try that first if it is already
+ * known that the write length is not too large.
+ */
+ if (ocount > xfs_inode_buftarg(ip)->bt_awu_max)
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ else
+ dops = &xfs_direct_write_iomap_ops;
+
+retry:
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /* Demote similar to xfs_file_dio_write_aligned() */
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
+ 0, NULL, 0);
+
+ /*
+ * The retry mechanism is based on the ->iomap_begin method returning
+ * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
+ * possible. The REQ_ATOMIC-based method typically not be possible if
+ * the write spans multiple extents or the disk blocks are misaligned.
+ */
+ if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
+ xfs_iunlock(ip, iolock);
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ goto retry;
+ }
+
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@@ -675,7 +850,7 @@ retry_exclusive:
goto out_unlock;
}
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out_unlock;
@@ -721,9 +896,23 @@ xfs_file_dio_write(
/* direct I/O must be aligned to device logical sector size */
if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
return -EINVAL;
- if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
+
+ /*
+ * For always COW inodes we also must check the alignment of each
+ * individual iovec segment, as they could end up with different
+ * I/Os due to the way bio_iov_iter_get_pages works, and we'd
+ * then overwrite an already written block.
+ */
+ if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) ||
+ (xfs_is_always_cow_inode(ip) &&
+ (iov_iter_alignment(from) & ip->i_mount->m_blockmask)))
return xfs_file_dio_write_unaligned(ip, iocb, from);
- return xfs_file_dio_write_aligned(ip, iocb, from);
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_dio_write_zoned(ip, iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ return xfs_file_dio_write_atomic(ip, iocb, from);
+ return xfs_file_dio_write_aligned(ip, iocb, from,
+ &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
static noinline ssize_t
@@ -740,7 +929,7 @@ xfs_file_dax_write(
ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
@@ -784,13 +973,14 @@ write_retry:
if (ret)
return ret;
- ret = xfs_file_write_checks(iocb, from, &iolock);
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
if (ret)
goto out;
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
- &xfs_buffered_write_iomap_ops, NULL);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ NULL);
/*
* If we hit a space limit, try to free up some lingering preallocated
@@ -832,6 +1022,68 @@ out:
}
STATIC ssize_t
+xfs_file_buffered_write_zoned(
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int iolock = XFS_IOLOCK_EXCL;
+ bool cleared_space = false;
+ struct xfs_zone_alloc_ctx ac = { };
+ ssize_t ret;
+
+ ret = xfs_zoned_write_space_reserve(mp, iocb, from, XFS_ZR_GREEDY, &ac);
+ if (ret < 0)
+ return ret;
+
+ ret = xfs_ilock_iocb(iocb, iolock);
+ if (ret)
+ goto out_unreserve;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, &ac);
+ if (ret)
+ goto out_unlock;
+
+ /*
+ * Truncate the iter to the length that we were actually able to
+ * allocate blocks for. This needs to happen after
+ * xfs_file_write_checks, because that assigns ki_pos for O_APPEND
+ * writes.
+ */
+ iov_iter_truncate(from,
+ XFS_FSB_TO_B(mp, ac.reserved_blocks) -
+ (iocb->ki_pos & mp->m_blockmask));
+ if (!iov_iter_count(from))
+ goto out_unlock;
+
+retry:
+ trace_xfs_file_buffered_write(iocb, from);
+ ret = iomap_file_buffered_write(iocb, from,
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ &ac);
+ if (ret == -ENOSPC && !cleared_space) {
+ /*
+ * Kick off writeback to convert delalloc space and release the
+ * usually too pessimistic indirect block reservations.
+ */
+ xfs_flush_inodes(mp);
+ cleared_space = true;
+ goto retry;
+ }
+
+out_unlock:
+ xfs_iunlock(ip, iolock);
+out_unreserve:
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ if (ret > 0) {
+ XFS_STATS_ADD(mp, xs_write_bytes, ret);
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
+STATIC ssize_t
xfs_file_write_iter(
struct kiocb *iocb,
struct iov_iter *from)
@@ -849,23 +1101,21 @@ xfs_file_write_iter(
if (xfs_is_shutdown(ip->i_mount))
return -EIO;
- if (IS_DAX(inode))
- return xfs_file_dax_write(iocb, from);
-
if (iocb->ki_flags & IOCB_ATOMIC) {
- /*
- * Currently only atomic writing of a single FS block is
- * supported. It would be possible to atomic write smaller than
- * a FS block, but there is no requirement to support this.
- * Note that iomap also does not support this yet.
- */
- if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ if (ocount < xfs_get_atomic_write_min(ip))
+ return -EINVAL;
+
+ if (ocount > xfs_get_atomic_write_max(ip))
return -EINVAL;
+
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
}
+ if (IS_DAX(inode))
+ return xfs_file_dax_write(iocb, from);
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -878,6 +1128,8 @@ xfs_file_write_iter(
return ret;
}
+ if (xfs_is_zoned_inode(ip))
+ return xfs_file_buffered_write_zoned(iocb, from);
return xfs_file_buffered_write(iocb, from);
}
@@ -932,7 +1184,8 @@ static int
xfs_falloc_collapse_range(
struct file *file,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
loff_t new_size = i_size_read(inode) - len;
@@ -948,7 +1201,7 @@ xfs_falloc_collapse_range(
if (offset + len >= i_size_read(inode))
return -EINVAL;
- error = xfs_collapse_file_space(XFS_I(inode), offset, len);
+ error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
@@ -1004,7 +1257,8 @@ xfs_falloc_zero_range(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
unsigned int blksize = i_blocksize(inode);
@@ -1017,7 +1271,7 @@ xfs_falloc_zero_range(
if (error)
return error;
- error = xfs_free_file_space(XFS_I(inode), offset, len);
+ error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
if (error)
return error;
@@ -1083,27 +1337,24 @@ xfs_falloc_allocate_range(
}
#define XFS_FALLOC_FL_SUPPORTED \
- (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
- FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
+ (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
+ FALLOC_FL_UNSHARE_RANGE)
STATIC long
-xfs_file_fallocate(
+__xfs_file_fallocate(
struct file *file,
int mode,
loff_t offset,
- loff_t len)
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
long error;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
- if (mode & ~XFS_FALLOC_FL_SUPPORTED)
- return -EOPNOTSUPP;
-
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
@@ -1124,16 +1375,16 @@ xfs_file_fallocate(
switch (mode & FALLOC_FL_MODE_MASK) {
case FALLOC_FL_PUNCH_HOLE:
- error = xfs_free_file_space(ip, offset, len);
+ error = xfs_free_file_space(ip, offset, len, ac);
break;
case FALLOC_FL_COLLAPSE_RANGE:
- error = xfs_falloc_collapse_range(file, offset, len);
+ error = xfs_falloc_collapse_range(file, offset, len, ac);
break;
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
case FALLOC_FL_ZERO_RANGE:
- error = xfs_falloc_zero_range(file, mode, offset, len);
+ error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
case FALLOC_FL_UNSHARE_RANGE:
error = xfs_falloc_unshare_range(file, mode, offset, len);
@@ -1154,6 +1405,54 @@ out_unlock:
return error;
}
+static long
+xfs_file_zoned_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct xfs_zone_alloc_ctx ac = { };
+ struct xfs_inode *ip = XFS_I(file_inode(file));
+ int error;
+
+ error = xfs_zoned_space_reserve(ip->i_mount, 2, XFS_ZR_RESERVED, &ac);
+ if (error)
+ return error;
+ error = __xfs_file_fallocate(file, mode, offset, len, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ return error;
+}
+
+static long
+xfs_file_fallocate(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
+{
+ struct inode *inode = file_inode(file);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~XFS_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ /*
+ * For zoned file systems, zeroing the first and last block of a hole
+ * punch requires allocating a new block to rewrite the remaining data
+ * and new zeroes out of place. Get a reservations for those before
+ * taking the iolock. Dip into the reserved pool because we are
+ * expected to be able to punch a hole even on a completely full
+ * file system.
+ */
+ if (xfs_is_zoned_inode(XFS_I(inode)) &&
+ (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
+ FALLOC_FL_COLLAPSE_RANGE)))
+ return xfs_file_zoned_fallocate(file, mode, offset, len);
+ return __xfs_file_fallocate(file, mode, offset, len, NULL);
+}
+
STATIC int
xfs_file_fadvise(
struct file *file,
@@ -1261,7 +1560,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
- if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
@@ -1347,15 +1646,22 @@ xfs_file_release(
* blocks. This avoids open/read/close workloads from removing EOF
* blocks that other writers depend upon to reduce fragmentation.
*
+ * Inodes on the zoned RT device never have preallocations, so skip
+ * taking the locks below.
+ */
+ if (!inode->i_nlink ||
+ !(file->f_mode & FMODE_WRITE) ||
+ (ip->i_diflags & XFS_DIFLAG_APPEND) ||
+ xfs_is_zoned_inode(ip))
+ return 0;
+
+ /*
* If we can't get the iolock just skip truncating the blocks past EOF
* because we could deadlock with the mmap_lock otherwise. We'll get
* another chance to drop them once the last reference to the inode is
* dropped, so we'll never leak blocks permanently.
*/
- if (inode->i_nlink &&
- (file->f_mode & FMODE_WRITE) &&
- !(ip->i_diflags & XFS_DIFLAG_APPEND) &&
- !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
+ if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) &&
xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
if (xfs_can_free_eofblocks(ip) &&
!xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED))
@@ -1426,7 +1732,7 @@ xfs_dax_fault_locked(
bool write_fault)
{
vm_fault_t ret;
- pfn_t pfn;
+ unsigned long pfn;
if (!IS_ENABLED(CONFIG_FS_DAX)) {
ASSERT(0);
@@ -1451,9 +1757,6 @@ xfs_dax_read_fault(
trace_xfs_read_fault(ip, order);
- ret = filemap_fsnotify_fault(vmf);
- if (unlikely(ret))
- return ret;
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
ret = xfs_dax_fault_locked(vmf, order, false);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
@@ -1472,9 +1775,10 @@ xfs_dax_read_fault(
* i_lock (XFS - extent map serialisation)
*/
static vm_fault_t
-xfs_write_fault(
+__xfs_write_fault(
struct vm_fault *vmf,
- unsigned int order)
+ unsigned int order,
+ struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
@@ -1482,16 +1786,6 @@ xfs_write_fault(
vm_fault_t ret;
trace_xfs_write_fault(ip, order);
- /*
- * Usually we get here from ->page_mkwrite callback but in case of DAX
- * we will get here also for ordinary write fault. Handle HSM
- * notifications for that case.
- */
- if (IS_DAX(inode)) {
- ret = filemap_fsnotify_fault(vmf);
- if (unlikely(ret))
- return ret;
- }
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
@@ -1511,13 +1805,50 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ ac);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
return ret;
}
+static vm_fault_t
+xfs_write_fault_zoned(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file));
+ unsigned int len = folio_size(page_folio(vmf->page));
+ struct xfs_zone_alloc_ctx ac = { };
+ int error;
+ vm_fault_t ret;
+
+ /*
+ * This could over-allocate as it doesn't check for truncation.
+ *
+ * But as the overallocation is limited to less than a folio and will be
+ * release instantly that's just fine.
+ */
+ error = xfs_zoned_space_reserve(ip->i_mount,
+ XFS_B_TO_FSB(ip->i_mount, len), 0, &ac);
+ if (error < 0)
+ return vmf_fs_error(error);
+ ret = __xfs_write_fault(vmf, order, &ac);
+ xfs_zoned_space_unreserve(ip->i_mount, &ac);
+ return ret;
+}
+
+static vm_fault_t
+xfs_write_fault(
+ struct vm_fault *vmf,
+ unsigned int order)
+{
+ if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file))))
+ return xfs_write_fault_zoned(vmf, order);
+ return __xfs_write_fault(vmf, order, NULL);
+}
+
static inline bool
xfs_is_write_fault(
struct vm_fault *vmf)
@@ -1585,10 +1916,10 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
};
STATIC int
-xfs_file_mmap(
- struct file *file,
- struct vm_area_struct *vma)
+xfs_file_mmap_prepare(
+ struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
struct xfs_buftarg *target = xfs_inode_buftarg(XFS_I(inode));
@@ -1596,13 +1927,14 @@ xfs_file_mmap(
* We don't support synchronous mappings for non-DAX files and
* for DAX files if underneath dax_device is not synchronous.
*/
- if (!daxdev_mapping_supported(vma, target->bt_daxdev))
+ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file),
+ target->bt_daxdev))
return -EOPNOTSUPP;
file_accessed(file);
- vma->vm_ops = &xfs_file_vm_ops;
+ desc->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(inode))
- vm_flags_set(vma, VM_HUGEPAGE);
+ desc->vm_flags |= VM_HUGEPAGE;
return 0;
}
@@ -1617,7 +1949,7 @@ const struct file_operations xfs_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
#endif
- .mmap = xfs_file_mmap,
+ .mmap_prepare = xfs_file_mmap_prepare,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
@@ -1626,7 +1958,8 @@ const struct file_operations xfs_file_operations = {
.fadvise = xfs_file_fadvise,
.remap_file_range = xfs_file_remap_range,
.fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC |
- FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE,
+ FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE |
+ FOP_DONTCACHE,
};
const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a961aa420c48..044918fbae06 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -304,11 +304,9 @@ xfs_filestream_create_association(
* for us, so all we need to do here is take another active reference to
* the perag for the cached association.
*
- * If we fail to store the association, we need to drop the fstrms
- * counter as well as drop the perag reference we take here for the
- * item. We do not need to return an error for this failure - as long as
- * we return a referenced AG, the allocation can still go ahead just
- * fine.
+ * If we fail to store the association, we do not need to return an
+ * error for this failure - as long as we return a referenced AG, the
+ * allocation can still go ahead just fine.
*/
item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!item)
@@ -316,14 +314,9 @@ xfs_filestream_create_association(
atomic_inc(&pag_group(args->pag)->xg_active_ref);
item->pag = args->pag;
- error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
- if (error)
- goto out_free_item;
+ xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru);
return 0;
-out_free_item:
- xfs_perag_rele(item->pag);
- kfree(item);
out_put_fstrms:
atomic_dec(&args->pag->pagf_fstrms);
return 0;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 1dbd2d75f7ae..af68c7de8ee8 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -876,21 +876,58 @@ xfs_getfsmap_rtdev_rmapbt(
const struct xfs_fsmap *keys,
struct xfs_getfsmap_info *info)
{
+ struct xfs_fsmap key0 = *keys; /* struct copy */
struct xfs_mount *mp = tp->t_mountp;
struct xfs_rtgroup *rtg = NULL;
struct xfs_btree_cur *bt_cur = NULL;
+ xfs_daddr_t rtstart_daddr;
xfs_rtblock_t start_rtb;
xfs_rtblock_t end_rtb;
xfs_rgnumber_t start_rg, end_rg;
uint64_t eofs;
int error = 0;
- eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- if (keys[0].fmr_physical >= eofs)
+ eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks);
+ if (key0.fmr_physical >= eofs)
return 0;
- start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical);
- end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
+ /*
+ * On zoned filesystems with an internal rt volume, the volume comes
+ * immediately after the end of the data volume. However, the
+ * xfs_rtblock_t address space is relative to the start of the data
+ * device, which means that the first @rtstart fsblocks do not actually
+ * point anywhere. If a fsmap query comes in with the low key starting
+ * below @rtstart, report it as "owned by filesystem".
+ */
+ rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart);
+ if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) {
+ struct xfs_fsmap_irec frec = {
+ .owner = XFS_RMAP_OWN_FS,
+ .len_daddr = rtstart_daddr,
+ };
+
+ /*
+ * Adjust the start of the query range if we're picking up from
+ * a previous round, and only emit the record if we haven't
+ * already gone past.
+ */
+ key0.fmr_physical += key0.fmr_length;
+ if (key0.fmr_physical < rtstart_daddr) {
+ error = xfs_getfsmap_helper(tp, info, &frec);
+ if (error)
+ return error;
+
+ key0.fmr_physical = rtstart_daddr;
+ }
+
+ /* Zero the other fields to avoid further adjustments. */
+ key0.fmr_owner = 0;
+ key0.fmr_offset = 0;
+ key0.fmr_length = 0;
+ }
+
+ start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical);
+ end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical));
info->missing_owner = XFS_FMR_OWN_FREE;
/*
@@ -898,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt(
* low to the fsmap low key and max out the high key to the end
* of the rtgroup.
*/
- info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
- error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+ info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset);
+ error = xfs_fsmap_owner_to_rmap(&info->low, &key0);
if (error)
return error;
- info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length);
- xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+ info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length);
+ xfs_getfsmap_set_irec_flags(&info->low, &key0);
/* Adjust the low key if we are continuing from where we left off. */
if (info->low.rm_blockcount == 0) {
@@ -1004,22 +1041,40 @@ xfs_getfsmap_rtdev_rmapbt(
}
#endif /* CONFIG_XFS_RT */
+static uint32_t
+xfs_getfsmap_device(
+ struct xfs_mount *mp,
+ enum xfs_device dev)
+{
+ if (mp->m_sb.sb_rtstart)
+ return dev;
+
+ switch (dev) {
+ case XFS_DEV_DATA:
+ return new_encode_dev(mp->m_ddev_targp->bt_dev);
+ case XFS_DEV_LOG:
+ return new_encode_dev(mp->m_logdev_targp->bt_dev);
+ case XFS_DEV_RT:
+ if (!mp->m_rtdev_targp)
+ break;
+ return new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ }
+
+ return -1;
+}
+
/* Do we recognize the device? */
STATIC bool
xfs_getfsmap_is_valid_device(
struct xfs_mount *mp,
struct xfs_fsmap *fm)
{
- if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
- fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
- return true;
- if (mp->m_logdev_targp &&
- fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
- return true;
- if (mp->m_rtdev_targp &&
- fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
- return true;
- return false;
+ return fm->fmr_device == 0 ||
+ fm->fmr_device == UINT_MAX ||
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) ||
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) ||
+ (mp->m_rtdev_targp &&
+ fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT));
}
/* Ensure that the low key is less than the high key. */
@@ -1126,7 +1181,7 @@ xfs_getfsmap(
/* Set up our device handlers. */
memset(handlers, 0, sizeof(handlers));
handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
- handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+ handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA);
if (use_rmap)
handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
else
@@ -1134,13 +1189,17 @@ xfs_getfsmap(
if (mp->m_logdev_targp != mp->m_ddev_targp) {
handlers[1].nr_sectors = XFS_FSB_TO_BB(mp,
mp->m_sb.sb_logblocks);
- handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+ handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG);
handlers[1].fn = xfs_getfsmap_logdev;
}
#ifdef CONFIG_XFS_RT
- if (mp->m_rtdev_targp) {
+ /*
+ * For zoned file systems there is no rtbitmap, so only support fsmap
+ * if the callers is privileged enough to use the full rmap version.
+ */
+ if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) {
handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+ handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT);
if (use_rmap)
handlers[2].fn = xfs_getfsmap_rtdev_rmapbt;
else
@@ -1211,9 +1270,7 @@ xfs_getfsmap(
* buffer locking abilities to detect cycles in the rmapbt
* without deadlocking.
*/
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- break;
+ tp = xfs_trans_alloc_empty(mp);
info.dev = handlers[i].dev;
info.last = false;
@@ -1230,7 +1287,13 @@ xfs_getfsmap(
if (tp)
xfs_trans_cancel(tp);
- head->fmh_oflags = FMH_OF_DEV_T;
+
+ /*
+ * For internal RT device we need to report different synthetic devices
+ * for a single physical device, and thus can't report the actual dev_t.
+ */
+ if (!mp->m_sb.sb_rtstart)
+ head->fmh_oflags = FMH_OF_DEV_T;
return error;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 455298503d01..0ada73569394 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -24,6 +24,7 @@
#include "xfs_rtalloc.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
+#include "xfs_metafile.h"
/*
* Write new AG headers to disk. Non-transactional, but need to be
@@ -110,7 +111,7 @@ xfs_growfs_data_private(
if (nb > mp->m_sb.sb_dblocks) {
error = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), &bp, NULL);
if (error)
return error;
xfs_buf_relse(bp);
@@ -300,24 +301,30 @@ xfs_growfs_data(
struct xfs_mount *mp,
struct xfs_growfs_data *in)
{
- int error = 0;
+ int error;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!mutex_trylock(&mp->m_growlock))
return -EWOULDBLOCK;
+ /* we can't grow the data section when an internal RT section exists */
+ if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) {
+ error = -EINVAL;
+ goto out_unlock;
+ }
+
/* update imaxpct separately to the physical grow of the filesystem */
if (in->imaxpct != mp->m_sb.sb_imax_pct) {
error = xfs_growfs_imaxpct(mp, in->imaxpct);
if (error)
- goto out_error;
+ goto out_unlock;
}
if (in->newblocks != mp->m_sb.sb_dblocks) {
error = xfs_growfs_data_private(mp, in);
if (error)
- goto out_error;
+ goto out_unlock;
}
/* Post growfs calculations needed to reflect new state in operations */
@@ -331,13 +338,12 @@ xfs_growfs_data(
/* Update secondary superblocks now the physical grow has completed */
error = xfs_update_secondary_sbs(mp);
-out_error:
/*
- * Increment the generation unconditionally, the error could be from
- * updating the secondary superblocks, in which case the new size
- * is live already.
+ * Increment the generation unconditionally, after trying to update the
+ * secondary superblocks, as the new size is live already at this point.
*/
mp->m_generation++;
+out_unlock:
mutex_unlock(&mp->m_growlock);
return error;
}
@@ -366,6 +372,7 @@ xfs_growfs_log(
int
xfs_reserve_blocks(
struct xfs_mount *mp,
+ enum xfs_free_counter ctr,
uint64_t request)
{
int64_t lcounter, delta;
@@ -373,6 +380,8 @@ xfs_reserve_blocks(
int64_t free;
int error = 0;
+ ASSERT(ctr < XC_FREE_NR);
+
/*
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
@@ -391,16 +400,16 @@ xfs_reserve_blocks(
* counters directly since we shouldn't have any problems unreserving
* space.
*/
- if (mp->m_resblks > request) {
- lcounter = mp->m_resblks_avail - request;
+ if (mp->m_free[ctr].res_total > request) {
+ lcounter = mp->m_free[ctr].res_avail - request;
if (lcounter > 0) { /* release unused blocks */
fdblks_delta = lcounter;
- mp->m_resblks_avail -= lcounter;
+ mp->m_free[ctr].res_avail -= lcounter;
}
- mp->m_resblks = request;
+ mp->m_free[ctr].res_total = request;
if (fdblks_delta) {
spin_unlock(&mp->m_sb_lock);
- xfs_add_fdblocks(mp, fdblks_delta);
+ xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
@@ -409,7 +418,7 @@ xfs_reserve_blocks(
/*
* If the request is larger than the current reservation, reserve the
- * blocks before we update the reserve counters. Sample m_fdblocks and
+ * blocks before we update the reserve counters. Sample m_free and
* perform a partial reservation if the request exceeds free space.
*
* The code below estimates how many blocks it can request from
@@ -419,10 +428,10 @@ xfs_reserve_blocks(
* space to fill it because mod_fdblocks will refill an undersized
* reserve when it can.
*/
- free = percpu_counter_sum(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp);
- delta = request - mp->m_resblks;
- mp->m_resblks = request;
+ free = xfs_sum_freecounter_raw(mp, ctr) -
+ xfs_freecounter_unavailable(mp, ctr);
+ delta = request - mp->m_free[ctr].res_total;
+ mp->m_free[ctr].res_total = request;
if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
@@ -436,9 +445,9 @@ xfs_reserve_blocks(
*/
fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
- error = xfs_dec_fdblocks(mp, fdblks_delta, 0);
+ error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0);
if (!error)
- xfs_add_fdblocks(mp, fdblks_delta);
+ xfs_add_freecounter(mp, ctr, fdblks_delta);
spin_lock(&mp->m_sb_lock);
}
out:
@@ -558,15 +567,13 @@ xfs_fs_reserve_ag_blocks(
return error;
}
- if (xfs_has_realtime(mp)) {
- err2 = xfs_rt_resv_init(mp);
- if (err2 && err2 != -ENOSPC) {
- xfs_warn(mp,
- "Error %d reserving realtime metadata reserve pool.", err2);
- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- }
+ err2 = xfs_metafile_resv_init(mp);
+ if (err2 && err2 != -ENOSPC) {
+ xfs_warn(mp,
+ "Error %d reserving realtime metadata reserve pool.", err2);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- if (err2 && !error)
+ if (!error)
error = err2;
}
@@ -582,9 +589,7 @@ xfs_fs_unreserve_ag_blocks(
{
struct xfs_perag *pag = NULL;
- if (xfs_has_realtime(mp))
- xfs_rt_resv_free(mp);
-
+ xfs_metafile_resv_free(mp);
while ((pag = xfs_perag_next(mp, pag)))
xfs_ag_resv_free(pag);
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 3e2f73bcf831..9d23c361ef56 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -8,7 +8,8 @@
int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
-int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
+int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt,
+ uint64_t request);
int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index f18fec0adf66..f6f628c01feb 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -23,8 +23,6 @@ xfs_param_t xfs_params = {
.inherit_sync = { 0, 1, 1 },
.inherit_nodump = { 0, 1, 1 },
.inherit_noatim = { 0, 1, 1 },
- .xfs_buf_timer = { 100/2, 1*100, 30*100 },
- .xfs_buf_age = { 1*100, 15*100, 7200*100},
.inherit_nosym = { 0, 0, 1 },
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 7b6c026d01a1..4cf7abe50143 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -230,7 +230,7 @@ xfs_blockgc_queue(
rcu_read_lock();
if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work,
- msecs_to_jiffies(xfs_blockgc_secs * 1000));
+ secs_to_jiffies(xfs_blockgc_secs));
rcu_read_unlock();
}
@@ -893,10 +893,7 @@ xfs_metafile_iget(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp);
xfs_trans_cancel(tp);
return error;
@@ -979,7 +976,15 @@ xfs_reclaim_inode(
*/
if (xlog_is_shutdown(ip->i_mount->m_log)) {
xfs_iunpin_wait(ip);
+ /*
+ * Avoid a ABBA deadlock on the inode cluster buffer vs
+ * concurrent xfs_ifree_cluster() trying to mark the inode
+ * stale. We don't need the inode locked to run the flush abort
+ * code, but the flush abort needs to lock the cluster buffer.
+ */
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_iflush_shutdown_abort(ip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
goto reclaim;
}
if (xfs_ipincount(ip))
@@ -2073,10 +2078,10 @@ xfs_inodegc_want_queue_rt_file(
{
struct xfs_mount *mp = ip->i_mount;
- if (!XFS_IS_REALTIME_INODE(ip))
+ if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
return false;
- if (__percpu_counter_compare(&mp->m_frextents,
+ if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
@@ -2104,7 +2109,7 @@ xfs_inodegc_want_queue_work(
if (items > mp->m_ino_geo.inodes_per_cluster)
return true;
- if (__percpu_counter_compare(&mp->m_fdblocks,
+ if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
mp->m_low_space[XFS_LOWSP_5_PCNT],
XFS_FDBLOCKS_BATCH) < 0)
return true;
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 4345db501714..f83ec2bd0583 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -158,7 +158,7 @@ xlog_recover_icreate_commit_pass2(
int nbufs;
int i;
- icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].iov_base;
if (icl->icl_type != XFS_LI_ICREATE) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
return -EINVAL;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c95fe1b1de4e..9c39251961a3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1404,8 +1404,11 @@ xfs_inactive(
goto out;
/* Try to clean out the cow blocks if there are any. */
- if (xfs_inode_has_cow_data(ip))
- xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (xfs_inode_has_cow_data(ip)) {
+ error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);
+ if (error)
+ goto out;
+ }
if (VFS_I(ip)->i_nlink != 0) {
/*
@@ -1632,7 +1635,7 @@ retry:
iip = ip->i_itemp;
if (__xfs_iflags_test(ip, XFS_IFLUSHING)) {
ASSERT(!list_empty(&iip->ili_item.li_bio_list));
- ASSERT(iip->ili_last_fields);
+ ASSERT(iip->ili_last_fields || xlog_is_shutdown(mp->m_log));
goto out_iunlock;
}
@@ -1718,8 +1721,7 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * igeo->blocks_per_cluster,
- XBF_UNMAPPED, &bp);
+ mp->m_bsize * igeo->blocks_per_cluster, 0, &bp);
if (error)
return error;
@@ -2732,21 +2734,16 @@ xfs_mmaplock_two_inodes_and_break_dax_layout(
struct xfs_inode *ip2)
{
int error;
- bool retry;
- struct page *page;
if (ip1->i_ino > ip2->i_ino)
swap(ip1, ip2);
again:
- retry = false;
/* Lock the first inode */
xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
- error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
- if (error || retry) {
+ error = xfs_break_dax_layouts(VFS_I(ip1));
+ if (error) {
xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
- if (error == 0 && retry)
- goto again;
return error;
}
@@ -2760,8 +2757,8 @@ again:
* need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
* for this nested lock case.
*/
- page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
- if (page && page_ref_count(page) != 1) {
+ error = dax_break_layout(VFS_I(ip2), 0, -1, NULL);
+ if (error) {
xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
goto again;
@@ -2935,12 +2932,9 @@ xfs_inode_reload_unlinked(
struct xfs_inode *ip)
{
struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc_empty(ip->i_mount, &tp);
- if (error)
- return error;
+ int error = 0;
+ tp = xfs_trans_alloc_empty(ip->i_mount);
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_inode_unlinked_incomplete(ip))
error = xfs_inode_reload_unlinked_bucket(tp, ip);
@@ -3005,21 +2999,11 @@ xfs_wait_dax_page(
int
xfs_break_dax_layouts(
- struct inode *inode,
- bool *retry)
+ struct inode *inode)
{
- struct page *page;
-
xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
- page = dax_layout_busy_page(inode->i_mapping);
- if (!page)
- return 0;
-
- *retry = true;
- return ___wait_var_event(&page->_refcount,
- atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
- 0, 0, xfs_wait_dax_page(inode));
+ return dax_break_layout_inode(inode, xfs_wait_dax_page);
}
int
@@ -3037,8 +3021,8 @@ xfs_break_layouts(
retry = false;
switch (reason) {
case BREAK_UNMAP:
- error = xfs_break_dax_layouts(inode, &retry);
- if (error || retry)
+ error = xfs_break_dax_layouts(inode);
+ if (error)
break;
fallthrough;
case BREAK_WRITE:
@@ -3071,5 +3055,6 @@ bool
xfs_is_always_cow_inode(
const struct xfs_inode *ip)
{
- return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount);
+ return xfs_is_zoned_inode(ip) ||
+ (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount));
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index c08093a65352..bd6d33557194 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -25,19 +25,9 @@ struct xfs_dquot;
typedef struct xfs_inode {
/* Inode linking and identification information. */
struct xfs_mount *i_mount; /* fs mount struct ptr */
- union {
- struct {
- struct xfs_dquot *i_udquot; /* user dquot */
- struct xfs_dquot *i_gdquot; /* group dquot */
- struct xfs_dquot *i_pdquot; /* project dquot */
- };
-
- /*
- * Space that has been set aside to accomodate expansions of a
- * metadata btree rooted in this file.
- */
- uint64_t i_meta_resv_asked;
- };
+ struct xfs_dquot *i_udquot; /* user dquot */
+ struct xfs_dquot *i_gdquot; /* group dquot */
+ struct xfs_dquot *i_pdquot; /* project dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
@@ -69,8 +59,13 @@ typedef struct xfs_inode {
xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */
prid_t i_projid; /* owner's project id */
xfs_extlen_t i_extsize; /* basic/minimum extent size */
- /* cowextsize is only used for v3 inodes, flushiter for v1/2 */
+ /*
+ * i_used_blocks is used for zoned rtrmap inodes,
+ * i_cowextsize is used for other v3 inodes,
+ * i_flushiter for v1/2 inodes
+ */
union {
+ uint32_t i_used_blocks; /* used blocks in RTG */
xfs_extlen_t i_cowextsize; /* basic cow extent size */
uint16_t i_flushiter; /* incremented on flush */
};
@@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip)
xfs_is_quota_inode(&mp->m_sb, ip->i_ino);
}
+static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip)
+{
+ return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip);
+}
+
bool xfs_is_always_cow_inode(const struct xfs_inode *ip);
static inline bool xfs_is_cow_inode(const struct xfs_inode *ip)
@@ -356,19 +356,20 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
-static inline bool
-xfs_inode_can_atomicwrite(
- struct xfs_inode *ip)
+static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_buftarg *target = xfs_inode_buftarg(ip);
-
- if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ if (IS_DAX(VFS_IC(ip)))
return false;
- if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+
+ return xfs_inode_buftarg(ip)->bt_awu_max > 0;
+}
+
+static inline bool xfs_inode_can_sw_atomic_write(const struct xfs_inode *ip)
+{
+ if (IS_DAX(VFS_IC(ip)))
return false;
- return true;
+ return xfs_can_sw_atomic_write(ip->i_mount);
}
/*
@@ -603,7 +604,7 @@ xfs_itruncate_extents(
return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0);
}
-int xfs_break_dax_layouts(struct inode *inode, bool *retry);
+int xfs_break_dax_layouts(struct inode *inode);
int xfs_break_layouts(struct inode *inode, uint *iolock,
enum layout_break_reason reason);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 35803fcf0beb..829675700fcd 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -596,6 +596,7 @@ xfs_inode_to_log_dinode(
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
to->di_flags2 = ip->i_diflags2;
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = ip->i_cowextsize;
to->di_ino = ip->i_ino;
to->di_lsn = lsn;
@@ -757,11 +758,14 @@ xfs_inode_item_push(
* completed and items removed from the AIL before the next push
* attempt.
*/
+ trace_xfs_inode_push_stale(ip, _RET_IP_);
return XFS_ITEM_PINNED;
}
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
+ if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) {
+ trace_xfs_inode_push_pinned(ip, _RET_IP_);
return XFS_ITEM_PINNED;
+ }
if (xfs_iflags_test(ip, XFS_IFLUSHING))
return XFS_ITEM_FLUSHING;
@@ -1088,13 +1092,7 @@ xfs_iflush_abort(
* state. Whilst the inode is in the AIL, it should have a valid buffer
* pointer for push operations to access - it is only safe to remove the
* inode from the buffer once it has been removed from the AIL.
- *
- * We also clear the failed bit before removing the item from the AIL
- * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
- * references the inode item owns and needs to hold until we've fully
- * aborted the inode log item and detached it from the buffer.
*/
- clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
xfs_trans_ail_delete(&iip->ili_item, 0);
/*
@@ -1184,12 +1182,12 @@ xfs_iflush_shutdown_abort(
*/
int
xfs_inode_item_format_convert(
- struct xfs_log_iovec *buf,
+ struct kvec *buf,
struct xfs_inode_log_format *in_f)
{
- struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
+ struct xfs_inode_log_format_32 *in_f32 = buf->iov_base;
- if (buf->i_len != sizeof(*in_f32)) {
+ if (buf->iov_len != sizeof(*in_f32)) {
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 377e06007804..ba92ce11a011 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -46,8 +46,8 @@ extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_abort(struct xfs_inode *);
extern void xfs_iflush_shutdown_abort(struct xfs_inode *);
-extern int xfs_inode_item_format_convert(xfs_log_iovec_t *,
- struct xfs_inode_log_format *);
+int xfs_inode_item_format_convert(struct kvec *buf,
+ struct xfs_inode_log_format *in_f);
extern struct kmem_cache *xfs_ili_cache;
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index f3bfb814378c..9d1999d41be1 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -30,13 +30,13 @@ xlog_recover_inode_ra_pass2(
struct xlog *log,
struct xlog_recover_item *item)
{
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) {
+ struct xfs_inode_log_format *ilfp = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
&xfs_inode_buf_ra_ops);
} else {
- struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr;
+ struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].iov_base;
xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
&xfs_inode_buf_ra_ops);
@@ -203,6 +203,7 @@ xfs_log_dinode_to_disk(
to->di_crtime = xfs_log_dinode_to_disk_ts(from,
from->di_crtime);
to->di_flags2 = cpu_to_be64(from->di_flags2);
+ /* also covers the di_used_blocks union arm: */
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
to->di_lsn = cpu_to_be64(lsn);
@@ -325,8 +326,8 @@ xlog_recover_inode_commit_pass2(
int need_free = 0;
xfs_failaddr_t fa;
- if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
- in_f = item->ri_buf[0].i_addr;
+ if (item->ri_buf[0].iov_len == sizeof(struct xfs_inode_log_format)) {
+ in_f = item->ri_buf[0].iov_base;
} else {
in_f = kmalloc(sizeof(struct xfs_inode_log_format),
GFP_KERNEL | __GFP_NOFAIL);
@@ -365,7 +366,7 @@ xlog_recover_inode_commit_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- ldip = item->ri_buf[1].i_addr;
+ ldip = item->ri_buf[1].iov_base;
if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
xfs_alert(mp,
"%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld",
@@ -471,12 +472,12 @@ xlog_recover_inode_commit_pass2(
goto out_release;
}
isize = xfs_log_dinode_size(mp);
- if (unlikely(item->ri_buf[1].i_len > isize)) {
+ if (unlikely(item->ri_buf[1].iov_len > isize)) {
XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW,
mp, ldip, sizeof(*ldip));
xfs_alert(mp,
- "Bad inode 0x%llx log dinode size 0x%x",
- in_f->ilf_ino, item->ri_buf[1].i_len);
+ "Bad inode 0x%llx log dinode size 0x%zx",
+ in_f->ilf_ino, item->ri_buf[1].iov_len);
error = -EFSCORRUPTED;
goto out_release;
}
@@ -499,8 +500,8 @@ xlog_recover_inode_commit_pass2(
if (in_f->ilf_size == 2)
goto out_owner_change;
- len = item->ri_buf[2].i_len;
- src = item->ri_buf[2].i_addr;
+ len = item->ri_buf[2].iov_len;
+ src = item->ri_buf[2].iov_base;
ASSERT(in_f->ilf_size <= 4);
ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
ASSERT(!(fields & XFS_ILOG_DFORK) ||
@@ -537,8 +538,8 @@ xlog_recover_inode_commit_pass2(
} else {
attr_index = 2;
}
- len = item->ri_buf[attr_index].i_len;
- src = item->ri_buf[attr_index].i_addr;
+ len = item->ri_buf[attr_index].iov_len;
+ src = item->ri_buf[attr_index].iov_base;
ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize));
switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ed85322507dd..e1051a530a50 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -219,7 +219,7 @@ xfs_bulk_ireq_setup(
else if (XFS_INO_TO_AGNO(mp, breq->startino) < hdr->agno)
return -EINVAL;
- breq->flags |= XFS_IBULK_SAME_AG;
+ breq->iwalk_flags |= XFS_IWALK_SAME_AG;
/* Asking for an inode past the end of the AG? We're done! */
if (XFS_INO_TO_AGNO(mp, breq->startino) > hdr->agno)
@@ -444,7 +444,7 @@ static void
xfs_fill_fsxattr(
struct xfs_inode *ip,
int whichfork,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
@@ -496,7 +496,7 @@ xfs_ioc_fsgetxattra(
xfs_inode_t *ip,
void __user *arg)
{
- struct fileattr fa;
+ struct file_kattr fa;
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_fill_fsxattr(ip, XFS_ATTR_FORK, &fa);
@@ -508,7 +508,7 @@ xfs_ioc_fsgetxattra(
int
xfs_fileattr_get(
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
@@ -526,7 +526,7 @@ static int
xfs_ioctl_setattr_xflags(
struct xfs_trans *tp,
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME);
@@ -582,7 +582,7 @@ xfs_ioctl_setattr_xflags(
static void
xfs_ioctl_setattr_prepare_dax(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
@@ -642,7 +642,7 @@ out_error:
static int
xfs_ioctl_setattr_check_extsize(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
@@ -684,7 +684,7 @@ xfs_ioctl_setattr_check_extsize(
static int
xfs_ioctl_setattr_check_cowextsize(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_mount *mp = ip->i_mount;
xfs_failaddr_t failaddr;
@@ -709,7 +709,7 @@ xfs_ioctl_setattr_check_cowextsize(
static int
xfs_ioctl_setattr_check_projid(
struct xfs_inode *ip,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
if (!fa->fsx_valid)
return 0;
@@ -725,7 +725,7 @@ int
xfs_fileattr_set(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa)
+ struct file_kattr *fa)
{
struct xfs_inode *ip = XFS_I(d_inode(dentry));
struct xfs_mount *mp = ip->i_mount;
@@ -990,9 +990,8 @@ xfs_ioc_getlabel(
BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX);
/* 1 larger than sb_fname, so this ensures a trailing NUL char */
- memset(label, 0, sizeof(label));
spin_lock(&mp->m_sb_lock);
- strncpy(label, sbp->sb_fname, XFSLABEL_MAX);
+ memtostr_pad(label, sbp->sb_fname);
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(user_label, label, sizeof(label)))
@@ -1131,15 +1130,15 @@ xfs_ioctl_getset_resblocks(
error = mnt_want_write_file(filp);
if (error)
return error;
- error = xfs_reserve_blocks(mp, fsop.resblks);
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks);
mnt_drop_write_file(filp);
if (error)
return error;
}
spin_lock(&mp->m_sb_lock);
- fsop.resblks = mp->m_resblks;
- fsop.resblks_avail = mp->m_resblks_avail;
+ fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total;
+ fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail;
spin_unlock(&mp->m_sb_lock);
if (copy_to_user(arg, &fsop, sizeof(fsop)))
@@ -1155,9 +1154,9 @@ xfs_ioctl_fs_counts(
struct xfs_fsop_counts out = {
.allocino = percpu_counter_read_positive(&mp->m_icount),
.freeino = percpu_counter_read_positive(&mp->m_ifree),
- .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp),
- .freertx = percpu_counter_read_positive(&mp->m_frextents),
+ .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) -
+ xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS),
+ .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS),
};
if (copy_to_user(uarg, &out, sizeof(out)))
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 12124946f347..f5ed5cf9d3df 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -17,13 +17,13 @@ xfs_ioc_swapext(
extern int
xfs_fileattr_get(
struct dentry *dentry,
- struct fileattr *fa);
+ struct file_kattr *fa);
extern int
xfs_fileattr_set(
struct mnt_idmap *idmap,
struct dentry *dentry,
- struct fileattr *fa);
+ struct file_kattr *fa);
extern long
xfs_file_ioctl(
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 50fa3ef89f6c..2a74f2957341 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -30,6 +30,8 @@
#include "xfs_reflink.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
+#include "xfs_icache.h"
+#include "xfs_zone_alloc.h"
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -77,6 +79,9 @@ xfs_iomap_valid(
{
struct xfs_inode *ip = XFS_I(inode);
+ if (iomap->type == IOMAP_HOLE)
+ return true;
+
if (iomap->validity_cookie !=
xfs_iomap_inode_sequence(ip, iomap->flags)) {
trace_xfs_iomap_invalid(ip, iomap);
@@ -87,7 +92,7 @@ xfs_iomap_valid(
return true;
}
-static const struct iomap_folio_ops xfs_iomap_folio_ops = {
+const struct iomap_write_ops xfs_iomap_write_ops = {
.iomap_valid = xfs_iomap_valid,
};
@@ -149,7 +154,6 @@ xfs_bmbt_to_iomap(
iomap->flags |= IOMAP_F_DIRTY;
iomap->validity_cookie = sequence_cookie;
- iomap->folio_ops = &xfs_iomap_folio_ops;
return 0;
}
@@ -431,13 +435,14 @@ xfs_quota_calc_throttle(
static int64_t
xfs_iomap_freesp(
- struct percpu_counter *counter,
+ struct xfs_mount *mp,
+ unsigned int idx,
uint64_t low_space[XFS_LOWSP_MAX],
int *shift)
{
int64_t freesp;
- freesp = percpu_counter_read_positive(counter);
+ freesp = xfs_estimate_freecounter(mp, idx);
if (freesp < low_space[XFS_LOWSP_5_PCNT]) {
*shift = 2;
if (freesp < low_space[XFS_LOWSP_4_PCNT])
@@ -536,10 +541,10 @@ xfs_iomap_prealloc_size(
if (unlikely(XFS_IS_REALTIME_INODE(ip)))
freesp = xfs_rtbxlen_to_blen(mp,
- xfs_iomap_freesp(&mp->m_frextents,
+ xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS,
mp->m_low_rtexts, &shift));
else
- freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space,
+ freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space,
&shift);
/*
@@ -795,6 +800,38 @@ imap_spans_range(
return true;
}
+static bool
+xfs_bmap_hw_atomic_write_possible(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb);
+
+ /*
+ * atomic writes are required to be naturally aligned for disk blocks,
+ * which ensures that we adhere to block layer rules that we won't
+ * straddle any boundary or violate write alignment requirement.
+ */
+ if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
+ return false;
+
+ /*
+ * Spanning multiple extents would mean that multiple BIOs would be
+ * issued, and so would lose atomicity required for REQ_ATOMIC-based
+ * atomics.
+ */
+ if (!imap_spans_range(imap, offset_fsb, end_fsb))
+ return false;
+
+ /*
+ * The ->iomap_begin caller should ensure this, but check anyway.
+ */
+ return len <= xfs_inode_buftarg(ip)->bt_awu_max;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -809,9 +846,11 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_fileoff_t orig_end_fsb = end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
+ bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -828,6 +867,10 @@ xfs_direct_write_iomap_begin(
if (offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
+ /* HW-offload atomics are always used in this path */
+ if (flags & IOMAP_ATOMIC)
+ iomap_flags |= IOMAP_F_ATOMIC_BIO;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
@@ -868,13 +911,37 @@ relock:
(flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
- if (shared)
+ if (shared) {
+ if ((flags & IOMAP_ATOMIC) &&
+ !xfs_bmap_hw_atomic_write_possible(ip, &cmap,
+ offset_fsb, end_fsb)) {
+ error = -ENOPROTOOPT;
+ goto out_unlock;
+ }
goto out_found_cow;
+ }
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+ if (flags & IOMAP_ATOMIC) {
+ error = -ENOPROTOOPT;
+ /*
+ * If we allocate less than what is required for the write
+ * then we may end up with multiple extents, which means that
+ * REQ_ATOMIC-based cannot be used, so avoid this possibility.
+ */
+ if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+ goto out_unlock;
+
+ if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb,
+ orig_end_fsb))
+ goto out_unlock;
+ }
+
+ if (needs_alloc)
goto allocate_blocks;
/*
@@ -962,6 +1029,187 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
.iomap_begin = xfs_direct_write_iomap_begin,
};
+#ifdef CONFIG_XFS_RT
+/*
+ * This is really simple. The space has already been reserved before taking the
+ * IOLOCK, the actual block allocation is done just before submitting the bio
+ * and only recorded in the extent map on I/O completion.
+ */
+static int
+xfs_zoned_direct_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int error;
+
+ ASSERT(!(flags & IOMAP_OVERWRITE_ONLY));
+
+ /*
+ * Needs to be pushed down into the allocator so that only writes into
+ * a single zone can be supported.
+ */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ /*
+ * Ensure the extent list is in memory in so that we don't have to do
+ * read it from the I/O completion handler.
+ */
+ if (xfs_need_iread_extents(&ip->i_df)) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+ }
+
+ iomap->type = IOMAP_MAPPED;
+ iomap->flags = IOMAP_F_DIRTY;
+ iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev;
+ iomap->offset = offset;
+ iomap->length = length;
+ iomap->flags = IOMAP_F_ANON_WRITE;
+ return 0;
+}
+
+const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
+ .iomap_begin = xfs_zoned_direct_write_iomap_begin,
+};
+#endif /* CONFIG_XFS_RT */
+
+static int
+xfs_atomic_write_cow_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ int nmaps = 1;
+ xfs_filblks_t resaligned;
+ struct xfs_bmbt_irec cmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_trans *tp;
+ unsigned int dblocks = 0, rblocks = 0;
+ int error;
+ u64 seq;
+
+ ASSERT(flags & IOMAP_WRITE);
+ ASSERT(flags & IOMAP_DIRECT);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (!xfs_can_sw_atomic_write(mp)) {
+ ASSERT(xfs_can_sw_atomic_write(mp));
+ return -EINVAL;
+ }
+
+ /* blocks are always allocated in this path */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ trace_xfs_iomap_atomic_write_cow(ip, offset, length);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ goto found;
+ }
+
+ end_fsb = cmap.br_startoff;
+ count_fsb = end_fsb - offset_fsb;
+
+ resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ xfs_get_cowextsz_hint(ip));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
+ rblocks = resaligned;
+ } else {
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ rblocks = 0;
+ }
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, false, &tp);
+ if (error)
+ return error;
+
+ /* extent layout could have changed since the unlock, so check again */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ xfs_trans_cancel(tp);
+ goto found;
+ }
+
+ /*
+ * Allocate the entire reservation as unwritten blocks.
+ *
+ * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
+ * extszhint, such that there will be a greater chance that future
+ * atomic writes to that same range will be aligned (and don't require
+ * this COW-based method).
+ */
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
+
+found:
+ if (cmap.br_state != XFS_EXT_NORM) {
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
+ count_fsb);
+ if (error)
+ goto out_unlock;
+ cmap.br_state = XFS_EXT_NORM;
+ }
+
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
+ .iomap_begin = xfs_atomic_write_cow_iomap_begin,
+};
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
@@ -976,10 +1224,8 @@ xfs_dax_write_iomap_end(
if (!xfs_is_cow_inode(ip))
return 0;
- if (!written) {
- xfs_reflink_cancel_cow_range(ip, pos, length, true);
- return 0;
- }
+ if (!written)
+ return xfs_reflink_cancel_cow_range(ip, pos, length, true);
return xfs_reflink_end_cow(ip, pos, written);
}
@@ -989,6 +1235,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = {
.iomap_end = xfs_dax_write_iomap_end,
};
+/*
+ * Convert a hole to a delayed allocation.
+ */
+static void
+xfs_bmap_add_extent_hole_delay(
+ struct xfs_inode *ip, /* incore inode pointer */
+ int whichfork,
+ struct xfs_iext_cursor *icur,
+ struct xfs_bmbt_irec *new) /* new data to add to file extents */
+{
+ struct xfs_ifork *ifp; /* inode fork pointer */
+ xfs_bmbt_irec_t left; /* left neighbor extent entry */
+ xfs_filblks_t newlen=0; /* new indirect size */
+ xfs_filblks_t oldlen=0; /* old indirect size */
+ xfs_bmbt_irec_t right; /* right neighbor extent entry */
+ uint32_t state = xfs_bmap_fork_to_state(whichfork);
+ xfs_filblks_t temp; /* temp for indirect calculations */
+
+ ifp = xfs_ifork_ptr(ip, whichfork);
+ ASSERT(isnullstartblock(new->br_startblock));
+
+ /*
+ * Check and set flags if this segment has a left neighbor
+ */
+ if (xfs_iext_peek_prev_extent(ifp, icur, &left)) {
+ state |= BMAP_LEFT_VALID;
+ if (isnullstartblock(left.br_startblock))
+ state |= BMAP_LEFT_DELAY;
+ }
+
+ /*
+ * Check and set flags if the current (right) segment exists.
+ * If it doesn't exist, we're converting the hole at end-of-file.
+ */
+ if (xfs_iext_get_extent(ifp, icur, &right)) {
+ state |= BMAP_RIGHT_VALID;
+ if (isnullstartblock(right.br_startblock))
+ state |= BMAP_RIGHT_DELAY;
+ }
+
+ /*
+ * Set contiguity flags on the left and right neighbors.
+ * Don't let extents get too large, even if the pieces are contiguous.
+ */
+ if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
+ left.br_startoff + left.br_blockcount == new->br_startoff &&
+ left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
+ state |= BMAP_LEFT_CONTIG;
+
+ if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
+ new->br_startoff + new->br_blockcount == right.br_startoff &&
+ new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
+ (!(state & BMAP_LEFT_CONTIG) ||
+ (left.br_blockcount + new->br_blockcount +
+ right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
+ state |= BMAP_RIGHT_CONTIG;
+
+ /*
+ * Switch out based on the contiguity flags.
+ */
+ switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) {
+ case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with delayed allocations
+ * on the left and on the right.
+ * Merge all three into a single extent record.
+ */
+ temp = left.br_blockcount + new->br_blockcount +
+ right.br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_startblock = nullstartblock(newlen);
+ left.br_blockcount = temp;
+
+ xfs_iext_remove(ip, icur, state);
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_LEFT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the left.
+ * Merge the new allocation with the left neighbor.
+ */
+ temp = left.br_blockcount + new->br_blockcount;
+
+ oldlen = startblockval(left.br_startblock) +
+ startblockval(new->br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ left.br_blockcount = temp;
+ left.br_startblock = nullstartblock(newlen);
+
+ xfs_iext_prev(ifp, icur);
+ xfs_iext_update_extent(ip, state, icur, &left);
+ break;
+
+ case BMAP_RIGHT_CONTIG:
+ /*
+ * New allocation is contiguous with a delayed allocation
+ * on the right.
+ * Merge the new allocation with the right neighbor.
+ */
+ temp = new->br_blockcount + right.br_blockcount;
+ oldlen = startblockval(new->br_startblock) +
+ startblockval(right.br_startblock);
+ newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+ oldlen);
+ right.br_startoff = new->br_startoff;
+ right.br_startblock = nullstartblock(newlen);
+ right.br_blockcount = temp;
+ xfs_iext_update_extent(ip, state, icur, &right);
+ break;
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iext_insert(ip, icur, new, state);
+ break;
+ }
+ if (oldlen != newlen) {
+ ASSERT(oldlen > newlen);
+ xfs_add_fdblocks(ip->i_mount, oldlen - newlen);
+
+ /*
+ * Nothing to do for disk quota accounting here.
+ */
+ xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen);
+ }
+}
+
+/*
+ * Add a delayed allocation extent to an inode. Blocks are reserved from the
+ * global pool and the extent inserted into the inode in-core extent tree.
+ *
+ * On entry, got refers to the first extent beyond the offset of the extent to
+ * allocate or eof is specified if no such extent exists. On return, got refers
+ * to the extent record that was inserted to the inode fork.
+ *
+ * Note that the allocated extent may have been merged with contiguous extents
+ * during insertion into the inode fork. Thus, got does not reflect the current
+ * state of the inode fork on return. If necessary, the caller can use lastx to
+ * look up the updated record in the inode fork.
+ */
+static int
+xfs_bmapi_reserve_delalloc(
+ struct xfs_inode *ip,
+ int whichfork,
+ xfs_fileoff_t off,
+ xfs_filblks_t len,
+ xfs_filblks_t prealloc,
+ struct xfs_bmbt_irec *got,
+ struct xfs_iext_cursor *icur,
+ int eof)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ xfs_extlen_t alen;
+ xfs_extlen_t indlen;
+ uint64_t fdblocks;
+ int error;
+ xfs_fileoff_t aoff;
+ bool use_cowextszhint =
+ whichfork == XFS_COW_FORK && !prealloc;
+
+retry:
+ /*
+ * Cap the alloc length. Keep track of prealloc so we know whether to
+ * tag the inode before we return.
+ */
+ aoff = off;
+ alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
+ if (!eof)
+ alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
+ if (prealloc && alen >= len)
+ prealloc = alen - len;
+
+ /*
+ * If we're targetting the COW fork but aren't creating a speculative
+ * posteof preallocation, try to expand the reservation to align with
+ * the COW extent size hint if there's sufficient free space.
+ *
+ * Unlike the data fork, the CoW cancellation functions will free all
+ * the reservations at inactivation, so we don't require that every
+ * delalloc reservation have a dirty pagecache.
+ */
+ if (use_cowextszhint) {
+ struct xfs_bmbt_irec prev;
+ xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip);
+
+ if (!xfs_iext_peek_prev_extent(ifp, icur, &prev))
+ prev.br_startoff = NULLFILEOFF;
+
+ error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof,
+ 1, 0, &aoff, &alen);
+ ASSERT(!error);
+ }
+
+ /*
+ * Make a transaction-less quota reservation for delayed allocation
+ * blocks. This number gets adjusted later. We return if we haven't
+ * allocated blocks already inside this loop.
+ */
+ error = xfs_quota_reserve_blkres(ip, alen);
+ if (error)
+ goto out;
+
+ /*
+ * Split changing sb for alen and indlen since they could be coming
+ * from different places.
+ */
+ indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen);
+ ASSERT(indlen > 0);
+
+ fdblocks = indlen;
+ if (XFS_IS_REALTIME_INODE(ip)) {
+ ASSERT(!xfs_is_zoned_inode(ip));
+ error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
+ if (error)
+ goto out_unreserve_quota;
+ } else {
+ fdblocks += alen;
+ }
+
+ error = xfs_dec_fdblocks(mp, fdblocks, false);
+ if (error)
+ goto out_unreserve_frextents;
+
+ ip->i_delayed_blks += alen;
+ xfs_mod_delalloc(ip, alen, indlen);
+
+ got->br_startoff = aoff;
+ got->br_startblock = nullstartblock(indlen);
+ got->br_blockcount = alen;
+ got->br_state = XFS_EXT_NORM;
+
+ xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got);
+
+ /*
+ * Tag the inode if blocks were preallocated. Note that COW fork
+ * preallocation can occur at the start or end of the extent, even when
+ * prealloc == 0, so we must also check the aligned offset and length.
+ */
+ if (whichfork == XFS_DATA_FORK && prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+ if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len))
+ xfs_inode_set_cowblocks_tag(ip);
+
+ return 0;
+
+out_unreserve_frextents:
+ if (XFS_IS_REALTIME_INODE(ip))
+ xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen));
+out_unreserve_quota:
+ if (XFS_IS_QUOTA_ON(mp))
+ xfs_quota_unreserve_blkres(ip, alen);
+out:
+ if (error == -ENOSPC || error == -EDQUOT) {
+ trace_xfs_delalloc_enospc(ip, off, len);
+
+ if (prealloc || use_cowextszhint) {
+ /* retry without any preallocation */
+ use_cowextszhint = false;
+ prealloc = 0;
+ goto retry;
+ }
+ }
+ return error;
+}
+
+static int
+xfs_zoned_buffered_write_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t count,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+ struct xfs_zone_alloc_ctx *ac = iter->private;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count);
+ u16 iomap_flags = IOMAP_F_SHARED;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+ xfs_filblks_t count_fsb;
+ xfs_extlen_t indlen;
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+
+ ASSERT(!xfs_get_extsz_hint(ip));
+ ASSERT(!(flags & IOMAP_UNSHARE));
+ ASSERT(ac);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
+ XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+ error = -EFSCORRUPTED;
+ goto out_unlock;
+ }
+
+ XFS_STATS_INC(mp, xs_blk_mapw);
+
+ error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * For zeroing operations check if there is any data to zero first.
+ *
+ * For regular writes we always need to allocate new blocks, but need to
+ * provide the source mapping when the range is unaligned to support
+ * read-modify-write of the whole block in the page cache.
+ *
+ * In either case we need to limit the reported range to the boundaries
+ * of the source map in the data fork.
+ */
+ if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) ||
+ !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) ||
+ (flags & IOMAP_ZERO)) {
+ struct xfs_bmbt_irec smap;
+ struct xfs_iext_cursor scur;
+
+ if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur,
+ &smap))
+ smap.br_startoff = end_fsb; /* fake hole until EOF */
+ if (smap.br_startoff > offset_fsb) {
+ /*
+ * We never need to allocate blocks for zeroing a hole.
+ */
+ if (flags & IOMAP_ZERO) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb,
+ smap.br_startoff);
+ goto out_unlock;
+ }
+ end_fsb = min(end_fsb, smap.br_startoff);
+ } else {
+ end_fsb = min(end_fsb,
+ smap.br_startoff + smap.br_blockcount);
+ xfs_trim_extent(&smap, offset_fsb,
+ end_fsb - offset_fsb);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0,
+ xfs_iomap_inode_sequence(ip, 0));
+ if (error)
+ goto out_unlock;
+ }
+ }
+
+ if (!ip->i_cowfp)
+ xfs_ifork_init_cow(ip);
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = end_fsb;
+ if (got.br_startoff <= offset_fsb) {
+ trace_xfs_reflink_cow_found(ip, &got);
+ goto done;
+ }
+
+ /*
+ * Cap the maximum length to keep the chunks of work done here somewhat
+ * symmetric with the work writeback does.
+ */
+ end_fsb = min(end_fsb, got.br_startoff);
+ count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN,
+ XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE));
+
+ /*
+ * The block reservation is supposed to cover all blocks that the
+ * operation could possible write, but there is a nasty corner case
+ * where blocks could be stolen from underneath us:
+ *
+ * 1) while this thread iterates over a larger buffered write,
+ * 2) another thread is causing a write fault that calls into
+ * ->page_mkwrite in range this thread writes to, using up the
+ * delalloc reservation created by a previous call to this function.
+ * 3) another thread does direct I/O on the range that the write fault
+ * happened on, which causes writeback of the dirty data.
+ * 4) this then set the stale flag, which cuts the current iomap
+ * iteration short, causing the new call to ->iomap_begin that gets
+ * us here again, but now without a sufficient reservation.
+ *
+ * This is a very unusual I/O pattern, and nothing but generic/095 is
+ * known to hit it. There's not really much we can do here, so turn this
+ * into a short write.
+ */
+ if (count_fsb > ac->reserved_blocks) {
+ xfs_warn_ratelimited(mp,
+"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O",
+ ip->i_ino, current->comm);
+ count_fsb = ac->reserved_blocks;
+ if (!count_fsb) {
+ error = -EIO;
+ goto out_unlock;
+ }
+ }
+
+ error = xfs_quota_reserve_blkres(ip, count_fsb);
+ if (error)
+ goto out_unlock;
+
+ indlen = xfs_bmap_worst_indlen(ip, count_fsb);
+ error = xfs_dec_fdblocks(mp, indlen, false);
+ if (error)
+ goto out_unlock;
+ ip->i_delayed_blks += count_fsb;
+ xfs_mod_delalloc(ip, count_fsb, indlen);
+
+ got.br_startoff = offset_fsb;
+ got.br_startblock = nullstartblock(indlen);
+ got.br_blockcount = count_fsb;
+ got.br_state = XFS_EXT_NORM;
+ xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got);
+ ac->reserved_blocks -= count_fsb;
+ iomap_flags |= IOMAP_F_NEW;
+
+ trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb),
+ XFS_COW_FORK, &got);
+done:
+ error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags,
+ xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED));
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
static int
xfs_buffered_write_iomap_begin(
struct inode *inode,
@@ -1015,6 +1710,10 @@ xfs_buffered_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_is_zoned_inode(ip))
+ return xfs_zoned_buffered_write_iomap_begin(inode, offset,
+ count, flags, iomap, srcmap);
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1247,10 +1946,13 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
+ struct iomap_iter *iter =
+ container_of(iomap, struct iomap_iter, iomap);
+
xfs_bmap_punch_delalloc_range(XFS_I(inode),
(iomap->flags & IOMAP_F_SHARED) ?
XFS_COW_FORK : XFS_DATA_FORK,
- offset, offset + length);
+ offset, offset + length, iter->private);
}
static int
@@ -1487,6 +2189,7 @@ xfs_zero_range(
struct xfs_inode *ip,
loff_t pos,
loff_t len,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1497,13 +2200,15 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ ac);
}
int
xfs_truncate_page(
struct xfs_inode *ip,
loff_t pos,
+ struct xfs_zone_alloc_ctx *ac,
bool *did_zero)
{
struct inode *inode = VFS_I(ip);
@@ -1512,5 +2217,6 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, &xfs_iomap_write_ops,
+ ac);
}
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8347268af727..ebcce7d49446 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -10,6 +10,7 @@
struct xfs_inode;
struct xfs_bmbt_irec;
+struct xfs_zone_alloc_ctx;
int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
@@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
u16 iomap_flags, u64 sequence_cookie);
int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
- bool *did_zero);
-int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero);
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
+int xfs_truncate_page(struct xfs_inode *ip, loff_t pos,
+ struct xfs_zone_alloc_ctx *ac, bool *did_zero);
static inline xfs_filblks_t
xfs_aligned_fsb_count(
@@ -49,9 +51,12 @@ xfs_aligned_fsb_count(
extern const struct iomap_ops xfs_buffered_write_iomap_ops;
extern const struct iomap_ops xfs_direct_write_iomap_ops;
+extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops;
extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
+extern const struct iomap_write_ops xfs_iomap_write_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 40289fe6f5b2..603effabe1ee 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -29,6 +29,7 @@
#include "xfs_xattr.h"
#include "xfs_file.h"
#include "xfs_bmap.h"
+#include "xfs_zone_alloc.h"
#include <linux/posix_acl.h>
#include <linux/security.h>
@@ -298,14 +299,14 @@ xfs_vn_create(
return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL);
}
-STATIC int
+STATIC struct dentry *
xfs_vn_mkdir(
struct mnt_idmap *idmap,
struct inode *dir,
struct dentry *dentry,
umode_t mode)
{
- return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL);
+ return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL));
}
STATIC struct dentry *
@@ -600,16 +601,83 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
+unsigned int
+xfs_get_atomic_write_min(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If we can complete an atomic write via atomic out of place writes,
+ * then advertise a minimum size of one fsblock. Without this
+ * mechanism, we can only guarantee atomic writes up to a single LBA.
+ *
+ * If out of place writes are not available, we can guarantee an atomic
+ * write of exactly one single fsblock if the bdev will make that
+ * guarantee for us.
+ */
+ if (xfs_inode_can_hw_atomic_write(ip) ||
+ xfs_inode_can_sw_atomic_write(ip))
+ return mp->m_sb.sb_blocksize;
+
+ return 0;
+}
+
+unsigned int
+xfs_get_atomic_write_max(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If out of place writes are not available, we can guarantee an atomic
+ * write of exactly one single fsblock if the bdev will make that
+ * guarantee for us.
+ */
+ if (!xfs_inode_can_sw_atomic_write(ip)) {
+ if (xfs_inode_can_hw_atomic_write(ip))
+ return mp->m_sb.sb_blocksize;
+ return 0;
+ }
+
+ /*
+ * If we can complete an atomic write via atomic out of place writes,
+ * then advertise a maximum size of whatever we can complete through
+ * that means. Hardware support is reported via max_opt, not here.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max);
+ return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max);
+}
+
+unsigned int
+xfs_get_atomic_write_max_opt(
+ struct xfs_inode *ip)
+{
+ unsigned int awu_max = xfs_get_atomic_write_max(ip);
+
+ /* if the max is 1x block, then just keep behaviour that opt is 0 */
+ if (awu_max <= ip->i_mount->m_sb.sb_blocksize)
+ return 0;
+
+ /*
+ * Advertise the maximum size of an atomic write that we can tell the
+ * block device to perform for us. In general the bdev limit will be
+ * less than our out of place write limit, but we don't want to exceed
+ * the awu_max.
+ */
+ return min(awu_max, xfs_inode_buftarg(ip)->bt_awu_max);
+}
+
static void
xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
- unsigned int unit_min = 0, unit_max = 0;
-
- if (xfs_inode_can_atomicwrite(ip))
- unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
- generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ xfs_get_atomic_write_min(ip),
+ xfs_get_atomic_write_max(ip),
+ xfs_get_atomic_write_max_opt(ip));
}
STATIC int
@@ -854,6 +922,7 @@ xfs_setattr_size(
uint lock_flags = 0;
uint resblks = 0;
bool did_zeroing = false;
+ struct xfs_zone_alloc_ctx ac = { };
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
ASSERT(S_ISREG(inode->i_mode));
@@ -890,6 +959,28 @@ xfs_setattr_size(
inode_dio_wait(inode);
/*
+ * Normally xfs_zoned_space_reserve is supposed to be called outside the
+ * IOLOCK. For truncate we can't do that since ->setattr is called with
+ * it already held by the VFS. So for now chicken out and try to
+ * allocate space under it.
+ *
+ * To avoid deadlocks this means we can't block waiting for space, which
+ * can lead to spurious -ENOSPC if there are no directly available
+ * blocks. We mitigate this a bit by allowing zeroing to dip into the
+ * reserved pool, but eventually the VFS calling convention needs to
+ * change.
+ */
+ if (xfs_is_zoned_inode(ip)) {
+ error = xfs_zoned_space_reserve(mp, 1,
+ XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac);
+ if (error) {
+ if (error == -EAGAIN)
+ return -ENOSPC;
+ return error;
+ }
+ }
+
+ /*
* File data changes must be complete before we start the transaction to
* modify the inode. This needs to be done before joining the inode to
* the transaction because the inode cannot be unlocked once it is a
@@ -902,11 +993,14 @@ xfs_setattr_size(
if (newsize > oldsize) {
trace_xfs_zero_eof(ip, oldsize, newsize - oldsize);
error = xfs_zero_range(ip, oldsize, newsize - oldsize,
- &did_zeroing);
+ &ac, &did_zeroing);
} else {
- error = xfs_truncate_page(ip, newsize, &did_zeroing);
+ error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing);
}
+ if (xfs_is_zoned_inode(ip))
+ xfs_zoned_space_unreserve(mp, &ac);
+
if (error)
return error;
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 3c1a2605ffd2..0896f6b8b3b8 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir,
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 1fa1c0564b0c..2aa37a4d2706 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -239,14 +239,10 @@ xfs_bulkstat_one(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(breq->mp);
error = xfs_bulkstat_one_int(breq->mp, breq->idmap, tp,
breq->startino, &bc);
xfs_trans_cancel(tp);
-out:
kfree(bc.buf);
/*
@@ -311,7 +307,6 @@ xfs_bulkstat(
.breq = breq,
};
struct xfs_trans *tp;
- unsigned int iwalk_flags = 0;
int error;
if (breq->idmap != &nop_mnt_idmap) {
@@ -331,17 +326,10 @@ xfs_bulkstat(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
- if (breq->flags & XFS_IBULK_SAME_AG)
- iwalk_flags |= XFS_IWALK_SAME_AG;
-
- error = xfs_iwalk(breq->mp, tp, breq->startino, iwalk_flags,
+ tp = xfs_trans_alloc_empty(breq->mp);
+ error = xfs_iwalk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_bulkstat_iwalk, breq->icount, &bc);
xfs_trans_cancel(tp);
-out:
kfree(bc.buf);
/*
@@ -464,14 +452,10 @@ xfs_inumbers(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(breq->mp, &tp);
- if (error)
- goto out;
-
- error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags,
+ tp = xfs_trans_alloc_empty(breq->mp);
+ error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->iwalk_flags,
xfs_inumbers_walk, breq->icount, &ic);
xfs_trans_cancel(tp);
-out:
/*
* We found some inode groups, so clear the error status and return
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index f10e8f8f2335..2d0612f14d6e 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -13,17 +13,15 @@ struct xfs_ibulk {
xfs_ino_t startino; /* start with this inode */
unsigned int icount; /* number of elements in ubuffer */
unsigned int ocount; /* number of records returned */
- unsigned int flags; /* see XFS_IBULK_FLAG_* */
+ unsigned int flags; /* XFS_IBULK_FLAG_* */
+ unsigned int iwalk_flags; /* XFS_IWALK_FLAG_* */
};
-/* Only iterate within the same AG as startino */
-#define XFS_IBULK_SAME_AG (1U << 0)
-
/* Fill out the bs_extents64 field if set. */
-#define XFS_IBULK_NREXT64 (1U << 1)
+#define XFS_IBULK_NREXT64 (1U << 0)
/* Signal that we can return metadata directories. */
-#define XFS_IBULK_METADIR (1U << 2)
+#define XFS_IBULK_METADIR (1U << 1)
/*
* Advance the user buffer pointer by one record of the given size. If the
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 7db3ece370b1..c1c31d1a8e21 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -377,11 +377,8 @@ xfs_iwalk_run_callbacks(
if (!has_more)
return 0;
- if (iwag->drop_trans) {
- error = xfs_trans_alloc_empty(mp, &iwag->tp);
- if (error)
- return error;
- }
+ if (iwag->drop_trans)
+ iwag->tp = xfs_trans_alloc_empty(mp);
/* ...and recreate the cursor just past where we left off. */
error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, 0, agi_bpp);
@@ -617,9 +614,7 @@ xfs_iwalk_ag_work(
* Grab an empty transaction so that we can use its recursive buffer
* locking abilities to detect cycles in the inobt without deadlocking.
*/
- error = xfs_trans_alloc_empty(mp, &iwag->tp);
- if (error)
- goto out;
+ iwag->tp = xfs_trans_alloc_empty(mp);
iwag->drop_trans = 1;
error = xfs_iwalk_ag(iwag);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f8851ff835de..c8a57e21a1d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -20,6 +20,7 @@
#include "xfs_sysfs.h"
#include "xfs_sb.h"
#include "xfs_health.h"
+#include "xfs_zone_alloc.h"
struct kmem_cache *xfs_log_ticket_cache;
@@ -108,14 +109,14 @@ xlog_prepare_iovec(
vec = &lv->lv_iovecp[0];
}
- len = lv->lv_buf_len + sizeof(struct xlog_op_header);
+ len = lv->lv_buf_used + sizeof(struct xlog_op_header);
if (!IS_ALIGNED(len, sizeof(uint64_t))) {
- lv->lv_buf_len = round_up(len, sizeof(uint64_t)) -
+ lv->lv_buf_used = round_up(len, sizeof(uint64_t)) -
sizeof(struct xlog_op_header);
}
vec->i_type = type;
- vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_used;
oph = vec->i_addr;
oph->oh_clientid = XFS_TRANSACTION;
@@ -1606,27 +1607,6 @@ xlog_bio_end_io(
&iclog->ic_end_io_work);
}
-static int
-xlog_map_iclog_data(
- struct bio *bio,
- void *data,
- size_t count)
-{
- do {
- struct page *page = kmem_to_page(data);
- unsigned int off = offset_in_page(data);
- size_t len = min_t(size_t, count, PAGE_SIZE - off);
-
- if (bio_add_page(bio, page, len, off) != len)
- return -EIO;
-
- data += len;
- count -= len;
- } while (count);
-
- return 0;
-}
-
STATIC void
xlog_write_iclog(
struct xlog *log,
@@ -1692,11 +1672,12 @@ xlog_write_iclog(
iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA);
- if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count))
- goto shutdown;
-
- if (is_vmalloc_addr(iclog->ic_data))
- flush_kernel_vmap_range(iclog->ic_data, count);
+ if (is_vmalloc_addr(iclog->ic_data)) {
+ if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count))
+ goto shutdown;
+ } else {
+ bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count);
+ }
/*
* If this log buffer would straddle the end of the log we will have
@@ -1950,9 +1931,9 @@ xlog_print_trans(
if (!lv)
continue;
xfs_warn(mp, " niovecs = %d", lv->lv_niovecs);
- xfs_warn(mp, " size = %d", lv->lv_size);
+ xfs_warn(mp, " alloc_size = %d", lv->lv_alloc_size);
xfs_warn(mp, " bytes = %d", lv->lv_bytes);
- xfs_warn(mp, " buf len = %d", lv->lv_buf_len);
+ xfs_warn(mp, " buf used= %d", lv->lv_buf_used);
/* dump each iovec for the log item */
vec = lv->lv_iovecp;
@@ -2887,7 +2868,7 @@ xlog_force_and_check_iclog(
*
* 1. the current iclog is active and has no data; the previous iclog
* is in the active or dirty state.
- * 2. the current iclog is drity, and the previous iclog is in the
+ * 2. the current iclog is dirty, and the previous iclog is in the
* active or dirty state.
*
* We may sleep if:
@@ -3111,16 +3092,16 @@ xfs_log_force_seq(
*/
void
xfs_log_ticket_put(
- xlog_ticket_t *ticket)
+ struct xlog_ticket *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
if (atomic_dec_and_test(&ticket->t_ref))
kmem_cache_free(xfs_log_ticket_cache, ticket);
}
-xlog_ticket_t *
+struct xlog_ticket *
xfs_log_ticket_get(
- xlog_ticket_t *ticket)
+ struct xlog_ticket *ticket)
{
ASSERT(atomic_read(&ticket->t_ref) > 0);
atomic_inc(&ticket->t_ref);
@@ -3540,6 +3521,9 @@ xlog_force_shutdown(
spin_unlock(&log->l_icloglock);
wake_up_var(&log->l_opstate);
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp))
+ xfs_zoned_wake_all(log->l_mp);
+
return log_error;
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 13455854365f..af6daf4f6792 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -16,8 +16,8 @@ struct xfs_log_vec {
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
int lv_bytes; /* accounted space in buffer */
- int lv_buf_len; /* aligned size of buffer */
- int lv_size; /* size of allocated lv */
+ int lv_buf_used; /* buffer space used so far */
+ int lv_alloc_size; /* size of allocated lv */
};
#define XFS_LOG_VEC_ORDERED (-1)
@@ -64,12 +64,13 @@ xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec,
oph->oh_len = cpu_to_be32(len);
len += sizeof(struct xlog_op_header);
- lv->lv_buf_len += len;
+ lv->lv_buf_used += len;
lv->lv_bytes += len;
vec->i_len = len;
/* Catch buffer overruns */
- ASSERT((void *)lv->lv_buf + lv->lv_bytes <= (void *)lv + lv->lv_size);
+ ASSERT((void *)lv->lv_buf + lv->lv_bytes <=
+ (void *)lv + lv->lv_alloc_size);
}
/*
@@ -87,13 +88,6 @@ xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return buf;
}
-static inline void *
-xlog_copy_from_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
- const struct xfs_log_iovec *src)
-{
- return xlog_copy_iovec(lv, vecp, src->i_type, src->i_addr, src->i_len);
-}
-
/*
* By comparing each component, we don't have to worry about extra
* endian issues in treating two 32 bit numbers as one 64 bit number
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1ca406ec1b40..f443757e93c2 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -275,7 +275,7 @@ xlog_cil_alloc_shadow_bufs(
struct xfs_log_vec *lv;
int niovecs = 0;
int nbytes = 0;
- int buf_size;
+ int alloc_size;
bool ordered = false;
/* Skip items which aren't dirty in this transaction. */
@@ -309,23 +309,21 @@ xlog_cil_alloc_shadow_bufs(
* Then round nbytes up to 64-bit alignment so that the initial
* buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs *
- (sizeof(uint64_t) + sizeof(struct xlog_op_header));
- nbytes = round_up(nbytes, sizeof(uint64_t));
+ nbytes = xlog_item_space(niovecs, nbytes);
/*
* The data buffer needs to start 64-bit aligned, so round up
* that space to ensure we can align it appropriately and not
* overrun the buffer.
*/
- buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+ alloc_size = nbytes + xlog_cil_iovec_space(niovecs);
/*
* if we have no shadow buffer, or it is too small, we need to
* reallocate it.
*/
if (!lip->li_lv_shadow ||
- buf_size > lip->li_lv_shadow->lv_size) {
+ alloc_size > lip->li_lv_shadow->lv_alloc_size) {
/*
* We free and allocate here as a realloc would copy
* unnecessary data. We don't use kvzalloc() for the
@@ -334,15 +332,15 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kvfree(lip->li_lv_shadow);
- lv = xlog_kvmalloc(buf_size);
+ lv = xlog_kvmalloc(alloc_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
INIT_LIST_HEAD(&lv->lv_list);
lv->lv_item = lip;
- lv->lv_size = buf_size;
+ lv->lv_alloc_size = alloc_size;
if (ordered)
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ lv->lv_buf_used = XFS_LOG_VEC_ORDERED;
else
lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
lip->li_lv_shadow = lv;
@@ -350,9 +348,9 @@ xlog_cil_alloc_shadow_bufs(
/* same or smaller, optimise common overwrite case */
lv = lip->li_lv_shadow;
if (ordered)
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ lv->lv_buf_used = XFS_LOG_VEC_ORDERED;
else
- lv->lv_buf_len = 0;
+ lv->lv_buf_used = 0;
lv->lv_bytes = 0;
}
@@ -372,30 +370,30 @@ xlog_cil_alloc_shadow_bufs(
STATIC void
xfs_cil_prepare_item(
struct xlog *log,
+ struct xfs_log_item *lip,
struct xfs_log_vec *lv,
- struct xfs_log_vec *old_lv,
int *diff_len)
{
/* Account for the new LV being passed in */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED)
*diff_len += lv->lv_bytes;
/*
* If there is no old LV, this is the first time we've seen the item in
* this CIL context and so we need to pin it. If we are replacing the
- * old_lv, then remove the space it accounts for and make it the shadow
+ * old lv, then remove the space it accounts for and make it the shadow
* buffer for later freeing. In both cases we are now switching to the
* shadow buffer, so update the pointer to it appropriately.
*/
- if (!old_lv) {
+ if (!lip->li_lv) {
if (lv->lv_item->li_ops->iop_pin)
lv->lv_item->li_ops->iop_pin(lv->lv_item);
lv->lv_item->li_lv_shadow = NULL;
- } else if (old_lv != lv) {
- ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
+ } else if (lip->li_lv != lv) {
+ ASSERT(lv->lv_buf_used != XFS_LOG_VEC_ORDERED);
- *diff_len -= old_lv->lv_bytes;
- lv->lv_item->li_lv_shadow = old_lv;
+ *diff_len -= lip->li_lv->lv_bytes;
+ lv->lv_item->li_lv_shadow = lip->li_lv;
}
/* attach new log vector to log item */
@@ -454,10 +452,8 @@ xlog_cil_insert_format_items(
}
list_for_each_entry(lip, &tp->t_items, li_trans) {
- struct xfs_log_vec *lv;
- struct xfs_log_vec *old_lv = NULL;
- struct xfs_log_vec *shadow;
- bool ordered = false;
+ struct xfs_log_vec *lv = lip->li_lv;
+ struct xfs_log_vec *shadow = lip->li_lv_shadow;
/* Skip items which aren't dirty in this transaction. */
if (!test_bit(XFS_LI_DIRTY, &lip->li_flags))
@@ -467,22 +463,23 @@ xlog_cil_insert_format_items(
* The formatting size information is already attached to
* the shadow lv on the log item.
*/
- shadow = lip->li_lv_shadow;
- if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
- ordered = true;
+ if (shadow->lv_buf_used == XFS_LOG_VEC_ORDERED) {
+ if (!lv) {
+ lv = shadow;
+ lv->lv_item = lip;
+ }
+ ASSERT(shadow->lv_alloc_size == lv->lv_alloc_size);
+ xfs_cil_prepare_item(log, lip, lv, diff_len);
+ continue;
+ }
/* Skip items that do not have any vectors for writing */
- if (!shadow->lv_niovecs && !ordered)
+ if (!shadow->lv_niovecs)
continue;
/* compare to existing item size */
- old_lv = lip->li_lv;
- if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
+ if (lv && shadow->lv_alloc_size <= lv->lv_alloc_size) {
/* same or smaller, optimise common overwrite case */
- lv = lip->li_lv;
-
- if (ordered)
- goto insert;
/*
* set the item up as though it is a new insertion so
@@ -494,7 +491,7 @@ xlog_cil_insert_format_items(
lv->lv_niovecs = shadow->lv_niovecs;
/* reset the lv buffer information for new formatting */
- lv->lv_buf_len = 0;
+ lv->lv_buf_used = 0;
lv->lv_bytes = 0;
lv->lv_buf = (char *)lv +
xlog_cil_iovec_space(lv->lv_niovecs);
@@ -502,17 +499,11 @@ xlog_cil_insert_format_items(
/* switch to shadow buffer! */
lv = shadow;
lv->lv_item = lip;
- if (ordered) {
- /* track as an ordered logvec */
- ASSERT(lip->li_lv == NULL);
- goto insert;
- }
}
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
lip->li_ops->iop_format(lip, lv);
-insert:
- xfs_cil_prepare_item(log, lv, old_lv, diff_len);
+ xfs_cil_prepare_item(log, lip, lv, diff_len);
}
}
@@ -795,8 +786,10 @@ xlog_cil_ail_insert(
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
- if (aborted)
+ if (aborted) {
+ trace_xlog_ail_insert_abort(lip);
set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ }
if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) {
lip->li_ops->iop_release(lip);
@@ -1245,7 +1238,7 @@ xlog_cil_build_lv_chain(
lv->lv_order_id = item->li_order_id;
/* we don't write ordered log vectors */
- if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED)
+ if (lv->lv_buf_used != XFS_LOG_VEC_ORDERED)
*num_bytes += lv->lv_bytes;
*num_iovecs += lv->lv_niovecs;
list_add_tail(&lv->lv_list, &ctx->lv_chain);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d78869e5e5..a9a7a271c15b 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -144,7 +144,7 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5
-typedef struct xlog_ticket {
+struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
xlog_tid_t t_tid; /* transaction identifier */
@@ -155,7 +155,7 @@ typedef struct xlog_ticket {
char t_cnt; /* current unit count */
uint8_t t_flags; /* properties of reservation */
int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
-} xlog_ticket_t;
+};
/*
* - A log record header is 512 bytes. There is plenty of room to grow the
@@ -698,4 +698,17 @@ xlog_kvmalloc(
return p;
}
+/*
+ * Given a count of iovecs and space for a log item, compute the space we need
+ * in the log to store that data plus the log headers.
+ */
+static inline unsigned int
+xlog_item_space(
+ unsigned int niovecs,
+ unsigned int nbytes)
+{
+ nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+ return round_up(nbytes, sizeof(uint64_t));
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b3c27dbccce8..e6ed9e09c027 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2131,15 +2131,15 @@ xlog_recover_add_to_cont_trans(
item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
ri_list);
- old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
- old_len = item->ri_buf[item->ri_cnt-1].i_len;
+ old_ptr = item->ri_buf[item->ri_cnt-1].iov_base;
+ old_len = item->ri_buf[item->ri_cnt-1].iov_len;
ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL);
if (!ptr)
return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
- item->ri_buf[item->ri_cnt-1].i_len += len;
- item->ri_buf[item->ri_cnt-1].i_addr = ptr;
+ item->ri_buf[item->ri_cnt-1].iov_len += len;
+ item->ri_buf[item->ri_cnt-1].iov_base = ptr;
trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
return 0;
}
@@ -2223,7 +2223,7 @@ xlog_recover_add_to_trans(
}
item->ri_total = in_f->ilf_size;
- item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+ item->ri_buf = kcalloc(item->ri_total, sizeof(*item->ri_buf),
GFP_KERNEL | __GFP_NOFAIL);
}
@@ -2237,8 +2237,8 @@ xlog_recover_add_to_trans(
}
/* Description region is ri_buf[0] */
- item->ri_buf[item->ri_cnt].i_addr = ptr;
- item->ri_buf[item->ri_cnt].i_len = len;
+ item->ri_buf[item->ri_cnt].iov_base = ptr;
+ item->ri_buf[item->ri_cnt].iov_len = len;
item->ri_cnt++;
trace_xfs_log_recover_item_add(log, trans, item, 0);
return 0;
@@ -2262,7 +2262,7 @@ xlog_recover_free_trans(
/* Free the regions in the item. */
list_del(&item->ri_list);
for (i = 0; i < item->ri_cnt; i++)
- kvfree(item->ri_buf[i].i_addr);
+ kvfree(item->ri_buf[i].iov_base);
/* Free the item itself */
kfree(item->ri_buf);
kfree(item);
@@ -3380,7 +3380,7 @@ xlog_do_recover(
*/
xfs_buf_lock(bp);
xfs_buf_hold(bp);
- error = _xfs_buf_read(bp, XBF_READ);
+ error = _xfs_buf_read(bp);
if (error) {
if (!xlog_is_shutdown(log)) {
xfs_buf_ioerror_alert(bp, __this_address);
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 6ed485ff2756..19aba2c3d525 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -141,14 +141,6 @@ xfs_warn_experimental(
const char *name;
long opstate;
} features[] = {
- [XFS_EXPERIMENTAL_PNFS] = {
- .opstate = XFS_OPSTATE_WARNED_PNFS,
- .name = "pNFS",
- },
- [XFS_EXPERIMENTAL_SCRUB] = {
- .opstate = XFS_OPSTATE_WARNED_SCRUB,
- .name = "online scrub",
- },
[XFS_EXPERIMENTAL_SHRINK] = {
.opstate = XFS_OPSTATE_WARNED_SHRINK,
.name = "online shrink",
@@ -161,18 +153,14 @@ xfs_warn_experimental(
.opstate = XFS_OPSTATE_WARNED_LBS,
.name = "large block size",
},
- [XFS_EXPERIMENTAL_EXCHRANGE] = {
- .opstate = XFS_OPSTATE_WARNED_EXCHRANGE,
- .name = "exchange range",
- },
- [XFS_EXPERIMENTAL_PPTR] = {
- .opstate = XFS_OPSTATE_WARNED_PPTR,
- .name = "parent pointer",
- },
[XFS_EXPERIMENTAL_METADIR] = {
.opstate = XFS_OPSTATE_WARNED_METADIR,
.name = "metadata directory tree",
},
+ [XFS_EXPERIMENTAL_ZONED] = {
+ .opstate = XFS_OPSTATE_WARNED_ZONED,
+ .name = "zoned RT device",
+ },
};
ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX);
BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX);
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 7fb36ced9df7..d68e72379f9d 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -91,14 +91,11 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg,
const char *fmt, ...);
enum xfs_experimental_feat {
- XFS_EXPERIMENTAL_PNFS,
- XFS_EXPERIMENTAL_SCRUB,
XFS_EXPERIMENTAL_SHRINK,
XFS_EXPERIMENTAL_LARP,
XFS_EXPERIMENTAL_LBS,
- XFS_EXPERIMENTAL_EXCHRANGE,
- XFS_EXPERIMENTAL_PPTR,
XFS_EXPERIMENTAL_METADIR,
+ XFS_EXPERIMENTAL_ZONED,
XFS_EXPERIMENTAL_MAX,
};
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 477c5262cf91..dc32c5e34d81 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -40,6 +40,7 @@
#include "xfs_rtrmap_btree.h"
#include "xfs_rtrefcount_btree.h"
#include "scrub/stats.h"
+#include "xfs_zone_alloc.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -170,25 +171,19 @@ xfs_readsb(
ASSERT(mp->m_ddev_targp != NULL);
/*
- * For the initial read, we must guess at the sector
- * size based on the block device. It's enough to
- * get the sb_sectsize out of the superblock and
- * then reread with the proper length.
- * We don't verify it yet, because it may not be complete.
+ * In the first pass, use the device sector size to just read enough
+ * of the superblock to extract the XFS sector size.
+ *
+ * The device sector size must be smaller than or equal to the XFS
+ * sector size and thus we can always read the superblock. Once we know
+ * the XFS sector size, re-read it and run the buffer verifier.
*/
- sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+ sector_size = mp->m_ddev_targp->bt_logical_sectorsize;
buf_ops = NULL;
- /*
- * Allocate a (locked) buffer to hold the superblock. This will be kept
- * around at all times to optimize access to the superblock. Therefore,
- * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
- * elevated.
- */
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), XBF_NO_IOACCT, &bp,
- buf_ops);
+ BTOBB(sector_size), &bp, buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
@@ -249,6 +244,10 @@ reread:
/* no need to be quiet anymore, so reset the buf ops */
bp->b_ops = &xfs_sb_buf_ops;
+ /*
+ * Keep a pointer of the sb buffer around instead of caching it in the
+ * buffer cache because we access it frequently.
+ */
mp->m_sb_bp = bp;
xfs_buf_unlock(bp);
return 0;
@@ -416,7 +415,7 @@ xfs_check_sizes(
}
error = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+ XFS_FSS_TO_BB(mp, 1), &bp, NULL);
if (error) {
xfs_warn(mp, "last sector read failed");
return error;
@@ -433,7 +432,7 @@ xfs_check_sizes(
}
error = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+ XFS_FSB_TO_BB(mp, 1), &bp, NULL);
if (error) {
xfs_warn(mp, "log device read failed");
return error;
@@ -464,22 +463,38 @@ xfs_mount_reset_sbqflags(
return xfs_sync_sb(mp, false);
}
+static const char *const xfs_free_pool_name[] = {
+ [XC_FREE_BLOCKS] = "free blocks",
+ [XC_FREE_RTEXTENTS] = "free rt extents",
+ [XC_FREE_RTAVAILABLE] = "available rt extents",
+};
+
uint64_t
-xfs_default_resblks(xfs_mount_t *mp)
+xfs_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
{
- uint64_t resblks;
-
- /*
- * We default to 5% or 8192 fsbs of space reserved, whichever is
- * smaller. This is intended to cover concurrent allocation
- * transactions when we initially hit enospc. These each require a 4
- * block reservation. Hence by default we cover roughly 2000 concurrent
- * allocation reservations.
- */
- resblks = mp->m_sb.sb_dblocks;
- do_div(resblks, 20);
- resblks = min_t(uint64_t, resblks, 8192);
- return resblks;
+ switch (ctr) {
+ case XC_FREE_BLOCKS:
+ /*
+ * Default to 5% or 8192 FSBs of space reserved, whichever is
+ * smaller.
+ *
+ * This is intended to cover concurrent allocation transactions
+ * when we initially hit ENOSPC. These each require a 4 block
+ * reservation. Hence by default we cover roughly 2000
+ * concurrent allocation reservations.
+ */
+ return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL);
+ case XC_FREE_RTEXTENTS:
+ case XC_FREE_RTAVAILABLE:
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+ return xfs_zoned_default_resblks(mp, ctr);
+ return 0;
+ default:
+ ASSERT(0);
+ return 0;
+ }
}
/* Ensure the summary counts are correct. */
@@ -546,7 +561,7 @@ xfs_check_summary_counts(
* If we're mounting the rt volume after recovering the log, recompute
* frextents from the rtbitmap file to fix the inconsistency.
*/
- if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) {
+ if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) {
error = xfs_rtalloc_reinit_frextents(mp);
if (error)
return error;
@@ -652,6 +667,152 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+/* Maximum atomic write IO size that the kernel allows. */
+static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp)
+{
+ return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT));
+}
+
+/*
+ * If the underlying device advertises atomic write support, limit the size of
+ * atomic writes to the greatest power-of-two factor of the group size so
+ * that every atomic write unit aligns with the start of every group. This is
+ * required so that the allocations for an atomic write will always be
+ * aligned compatibly with the alignment requirements of the storage.
+ *
+ * If the device doesn't advertise atomic writes, then there are no alignment
+ * restrictions and the largest out-of-place write we can do ourselves is the
+ * number of blocks that user files can allocate from any group.
+ */
+static xfs_extlen_t
+xfs_calc_group_awu_max(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ struct xfs_groups *g = &mp->m_groups[type];
+ struct xfs_buftarg *btp = xfs_group_type_buftarg(mp, type);
+
+ if (g->blocks == 0)
+ return 0;
+ if (btp && btp->bt_awu_min > 0)
+ return max_pow_of_two_factor(g->blocks);
+ return rounddown_pow_of_two(g->blocks);
+}
+
+/* Compute the maximum atomic write unit size for each section. */
+static inline void
+xfs_calc_atomic_write_unit_max(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ struct xfs_groups *g = &mp->m_groups[type];
+
+ const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
+ const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp);
+ const xfs_extlen_t max_gsize = xfs_calc_group_awu_max(mp, type);
+
+ g->awu_max = min3(max_write, max_ioend, max_gsize);
+ trace_xfs_calc_atomic_write_unit_max(mp, type, max_write, max_ioend,
+ max_gsize, g->awu_max);
+}
+
+/*
+ * Try to set the atomic write maximum to a new value that we got from
+ * userspace via mount option.
+ */
+int
+xfs_set_max_atomic_write_opt(
+ struct xfs_mount *mp,
+ unsigned long long new_max_bytes)
+{
+ const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes);
+ const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp);
+ const xfs_extlen_t max_group =
+ max(mp->m_groups[XG_TYPE_AG].blocks,
+ mp->m_groups[XG_TYPE_RTG].blocks);
+ const xfs_extlen_t max_group_write =
+ max(xfs_calc_group_awu_max(mp, XG_TYPE_AG),
+ xfs_calc_group_awu_max(mp, XG_TYPE_RTG));
+ int error;
+
+ if (new_max_bytes == 0)
+ goto set_limit;
+
+ ASSERT(max_write <= U32_MAX);
+
+ /* generic_atomic_write_valid enforces power of two length */
+ if (!is_power_of_2(new_max_bytes)) {
+ xfs_warn(mp,
+ "max atomic write size of %llu bytes is not a power of 2",
+ new_max_bytes);
+ return -EINVAL;
+ }
+
+ if (new_max_bytes & mp->m_blockmask) {
+ xfs_warn(mp,
+ "max atomic write size of %llu bytes not aligned with fsblock",
+ new_max_bytes);
+ return -EINVAL;
+ }
+
+ if (new_max_fsbs > max_write) {
+ xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max write size %lluk",
+ new_max_bytes >> 10,
+ XFS_FSB_TO_B(mp, max_write) >> 10);
+ return -EINVAL;
+ }
+
+ if (new_max_fsbs > max_group) {
+ xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than allocation group size %lluk",
+ new_max_bytes >> 10,
+ XFS_FSB_TO_B(mp, max_group) >> 10);
+ return -EINVAL;
+ }
+
+ if (new_max_fsbs > max_group_write) {
+ xfs_warn(mp,
+ "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk",
+ new_max_bytes >> 10,
+ XFS_FSB_TO_B(mp, max_group_write) >> 10);
+ return -EINVAL;
+ }
+
+ if (xfs_has_reflink(mp))
+ goto set_limit;
+
+ if (new_max_fsbs == 1) {
+ if (mp->m_ddev_targp->bt_awu_max ||
+ (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_awu_max)) {
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink or HW support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+ } else {
+ xfs_warn(mp,
+ "cannot support atomic writes of size %lluk with no reflink support",
+ new_max_bytes >> 10);
+ return -EINVAL;
+ }
+
+set_limit:
+ error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs);
+ if (error) {
+ xfs_warn(mp,
+ "cannot support completing atomic writes of %lluk",
+ new_max_bytes >> 10);
+ return error;
+ }
+
+ xfs_calc_atomic_write_unit_max(mp, XG_TYPE_AG);
+ xfs_calc_atomic_write_unit_max(mp, XG_TYPE_RTG);
+ mp->m_awu_max_bytes = new_max_bytes;
+ return 0;
+}
+
/* Compute maximum possible height for realtime btree types for this fs. */
static inline void
xfs_rtbtree_compute_maxlevels(
@@ -681,6 +842,7 @@ xfs_mountfs(
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
+ int i;
xfs_sb_mount_common(mp, sbp);
@@ -750,27 +912,15 @@ xfs_mountfs(
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
- super_set_sysfs_name_id(mp->m_super);
-
- error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
- NULL, mp->m_super->s_id);
+ error = xfs_mount_sysfs_init(mp);
if (error)
- goto out;
-
- error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
- &mp->m_kobj, "stats");
- if (error)
- goto out_remove_sysfs;
+ goto out_remove_scrub_stats;
xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs);
- error = xfs_error_sysfs_init(mp);
- if (error)
- goto out_remove_scrub_stats;
-
error = xfs_errortag_init(mp);
if (error)
- goto out_remove_error_sysfs;
+ goto out_remove_sysfs;
error = xfs_uuid_mount(mp);
if (error)
@@ -1034,6 +1184,12 @@ xfs_mountfs(
if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp))
xfs_log_clean(mp);
+ if (xfs_has_zoned(mp)) {
+ error = xfs_mount_zones(mp);
+ if (error)
+ goto out_rtunmount;
+ }
+
/*
* Complete the quota initialisation, post-log-replay component.
*/
@@ -1049,29 +1205,46 @@ xfs_mountfs(
* privileged transactions. This is needed so that transaction
* space required for critical operations can dip into this pool
* when at ENOSPC. This is needed for operations like create with
- * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
- * are not allowed to use this reserved space.
+ * attr, unwritten extent conversion at ENOSPC, garbage collection
+ * etc. Data allocations are not allowed to use this reserved space.
*
* This may drive us straight to ENOSPC on mount, but that implies
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
- if (error)
- xfs_warn(mp,
- "Unable to allocate reserve blocks. Continuing without reserve pool.");
+ for (i = 0; i < XC_FREE_NR; i++) {
+ error = xfs_reserve_blocks(mp, i,
+ xfs_default_resblks(mp, i));
+ if (error)
+ xfs_warn(mp,
+"Unable to allocate reserve blocks. Continuing without reserve pool for %s.",
+ xfs_free_pool_name[i]);
+ }
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
+
+ xfs_zone_gc_start(mp);
}
+ /*
+ * Pre-calculate atomic write unit max. This involves computations
+ * derived from transaction reservations, so we must do this after the
+ * log is fully initialized.
+ */
+ error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes);
+ if (error)
+ goto out_agresv;
+
return 0;
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
out_rtunmount:
xfs_rtunmount_inodes(mp);
out_rele_rip:
@@ -1119,13 +1292,10 @@ xfs_mountfs(
xfs_uuid_unmount(mp);
out_remove_errortag:
xfs_errortag_del(mp);
- out_remove_error_sysfs:
- xfs_error_sysfs_del(mp);
+ out_remove_sysfs:
+ xfs_mount_sysfs_del(mp);
out_remove_scrub_stats:
xchk_stats_unregister(mp->m_scrub_stats);
- xfs_sysfs_del(&mp->m_stats.xs_kobj);
- out_remove_sysfs:
- xfs_sysfs_del(&mp->m_kobj);
out:
return error;
}
@@ -1151,8 +1321,12 @@ xfs_unmountfs(
xfs_inodegc_flush(mp);
xfs_blockgc_stop(mp);
+ if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate))
+ xfs_zone_gc_stop(mp);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
+ if (xfs_has_zoned(mp))
+ xfs_unmount_zones(mp);
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
if (mp->m_metadirip)
@@ -1176,7 +1350,7 @@ xfs_unmountfs(
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
- error = xfs_reserve_blocks(mp, 0);
+ error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
@@ -1198,10 +1372,8 @@ xfs_unmountfs(
xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount);
xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
- xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
- xfs_sysfs_del(&mp->m_stats.xs_kobj);
- xfs_sysfs_del(&mp->m_kobj);
+ xfs_mount_sysfs_del(mp);
}
/*
@@ -1223,52 +1395,67 @@ xfs_fs_writable(
return true;
}
+/*
+ * Estimate the amount of free space that is not available to userspace and is
+ * not explicitly reserved from the incore fdblocks. This includes:
+ *
+ * - The minimum number of blocks needed to support splitting a bmap btree
+ * - The blocks currently in use by the freespace btrees because they record
+ * the actual blocks that will fill per-AG metadata space reservations
+ */
+uint64_t
+xfs_freecounter_unavailable(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ if (ctr != XC_FREE_BLOCKS)
+ return 0;
+ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+}
+
void
xfs_add_freecounter(
struct xfs_mount *mp,
- struct percpu_counter *counter,
+ enum xfs_free_counter ctr,
uint64_t delta)
{
- bool has_resv_pool = (counter == &mp->m_fdblocks);
+ struct xfs_freecounter *counter = &mp->m_free[ctr];
uint64_t res_used;
/*
* If the reserve pool is depleted, put blocks back into it first.
* Most of the time the pool is full.
*/
- if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) {
- percpu_counter_add(counter, delta);
+ if (likely(counter->res_avail == counter->res_total)) {
+ percpu_counter_add(&counter->count, delta);
return;
}
spin_lock(&mp->m_sb_lock);
- res_used = mp->m_resblks - mp->m_resblks_avail;
+ res_used = counter->res_total - counter->res_avail;
if (res_used > delta) {
- mp->m_resblks_avail += delta;
+ counter->res_avail += delta;
} else {
delta -= res_used;
- mp->m_resblks_avail = mp->m_resblks;
- percpu_counter_add(counter, delta);
+ counter->res_avail = counter->res_total;
+ percpu_counter_add(&counter->count, delta);
}
spin_unlock(&mp->m_sb_lock);
}
+
+/* Adjust in-core free blocks or RT extents. */
int
xfs_dec_freecounter(
struct xfs_mount *mp,
- struct percpu_counter *counter,
+ enum xfs_free_counter ctr,
uint64_t delta,
bool rsvd)
{
- int64_t lcounter;
- uint64_t set_aside = 0;
+ struct xfs_freecounter *counter = &mp->m_free[ctr];
s32 batch;
- bool has_resv_pool;
- ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents);
- has_resv_pool = (counter == &mp->m_fdblocks);
- if (rsvd)
- ASSERT(has_resv_pool);
+ ASSERT(ctr < XC_FREE_NR);
/*
* Taking blocks away, need to be more accurate the closer we
@@ -1278,7 +1465,7 @@ xfs_dec_freecounter(
* then make everything serialise as we are real close to
* ENOSPC.
*/
- if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH,
+ if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH,
XFS_FDBLOCKS_BATCH) < 0)
batch = 1;
else
@@ -1295,34 +1482,34 @@ xfs_dec_freecounter(
* problems (i.e. transaction abort, pagecache discards, etc.) than
* slightly premature -ENOSPC.
*/
- if (has_resv_pool)
- set_aside = xfs_fdblocks_unavailable(mp);
- percpu_counter_add_batch(counter, -((int64_t)delta), batch);
- if (__percpu_counter_compare(counter, set_aside,
- XFS_FDBLOCKS_BATCH) >= 0) {
- /* we had space! */
- return 0;
- }
-
- /*
- * lock up the sb for dipping into reserves before releasing the space
- * that took us to ENOSPC.
- */
- spin_lock(&mp->m_sb_lock);
- percpu_counter_add(counter, delta);
- if (!has_resv_pool || !rsvd)
- goto fdblocks_enospc;
-
- lcounter = (long long)mp->m_resblks_avail - delta;
- if (lcounter >= 0) {
- mp->m_resblks_avail = lcounter;
+ percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch);
+ if (__percpu_counter_compare(&counter->count,
+ xfs_freecounter_unavailable(mp, ctr),
+ XFS_FDBLOCKS_BATCH) < 0) {
+ /*
+ * Lock up the sb for dipping into reserves before releasing the
+ * space that took us to ENOSPC.
+ */
+ spin_lock(&mp->m_sb_lock);
+ percpu_counter_add(&counter->count, delta);
+ if (!rsvd)
+ goto fdblocks_enospc;
+ if (delta > counter->res_avail) {
+ if (ctr == XC_FREE_BLOCKS)
+ xfs_warn_once(mp,
+"Reserve blocks depleted! Consider increasing reserve pool size.");
+ goto fdblocks_enospc;
+ }
+ counter->res_avail -= delta;
+ trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
- return 0;
}
- xfs_warn_once(mp,
-"Reserve blocks depleted! Consider increasing reserve pool size.");
+
+ /* we had space! */
+ return 0;
fdblocks_enospc:
+ trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_);
spin_unlock(&mp->m_sb_lock);
return -ENOSPC;
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fbed172d6770..97de44c32272 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -98,11 +98,47 @@ struct xfs_groups {
uint8_t blklog;
/*
+ * Zoned devices can have gaps beyond the usable capacity of a zone and
+ * the end in the LBA/daddr address space. In other words, the hardware
+ * equivalent to the RT groups already takes care of the power of 2
+ * alignment for us. In this case the sparse FSB/RTB address space maps
+ * 1:1 to the device address space.
+ */
+ bool has_daddr_gaps;
+
+ /*
* Mask to extract the group-relative block number from a FSB.
* For a pre-rtgroups filesystem we pretend to have one very large
* rtgroup, so this mask must be 64-bit.
*/
uint64_t blkmask;
+
+ /*
+ * Start of the first group in the device. This is used to support a
+ * RT device following the data device on the same block device for
+ * SMR hard drives.
+ */
+ xfs_fsblock_t start_fsb;
+
+ /*
+ * Maximum length of an atomic write for files stored in this
+ * collection of allocation groups, in fsblocks.
+ */
+ xfs_extlen_t awu_max;
+};
+
+struct xfs_freecounter {
+ /* free blocks for general use: */
+ struct percpu_counter count;
+
+ /* total reserved blocks: */
+ uint64_t res_total;
+
+ /* available reserved blocks: */
+ uint64_t res_avail;
+
+ /* reserved blks @ remount,ro: */
+ uint64_t res_saved;
};
/*
@@ -198,6 +234,12 @@ typedef struct xfs_mount {
bool m_fail_unmount;
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
+ unsigned int m_max_open_zones;
+ unsigned int m_zonegc_low_space;
+ struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */
+
+ /* max_atomic_write mount option value */
+ unsigned long long m_awu_max_bytes;
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
@@ -222,8 +264,8 @@ typedef struct xfs_mount {
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
struct percpu_counter m_icount; /* allocated inodes counter */
struct percpu_counter m_ifree; /* free inodes counter */
- struct percpu_counter m_fdblocks; /* free block counter */
- struct percpu_counter m_frextents; /* free rt extent counter */
+
+ struct xfs_freecounter m_free[XC_FREE_NR];
/*
* Count of data device blocks reserved for delayed allocations,
@@ -245,10 +287,8 @@ typedef struct xfs_mount {
atomic64_t m_allocbt_blks;
struct xfs_groups m_groups[XG_TYPE_MAX];
- uint64_t m_resblks; /* total reserved blocks */
- uint64_t m_resblks_avail;/* available reserved blocks */
- uint64_t m_resblks_save; /* reserved blks @ remount,ro */
struct delayed_work m_reclaim_work; /* background inode reclaim */
+ struct xfs_zone_info *m_zone_info; /* zone allocator information */
struct dentry *m_debugfs; /* debugfs parent */
struct xfs_kobj m_kobj;
struct xfs_kobj m_error_kobj;
@@ -258,10 +298,16 @@ typedef struct xfs_mount {
#ifdef CONFIG_XFS_ONLINE_SCRUB_STATS
struct xchk_stats *m_scrub_stats;
#endif
+ struct xfs_kobj m_zoned_kobj;
xfs_agnumber_t m_agfrotor; /* last ag where space found */
atomic_t m_agirotor; /* last ag dir inode alloced */
atomic_t m_rtgrotor; /* last rtgroup rtpicked */
+ struct mutex m_metafile_resv_lock;
+ uint64_t m_metafile_resv_target;
+ uint64_t m_metafile_resv_used;
+ uint64_t m_metafile_resv_avail;
+
/* Memory shrinker to throttle and reprioritize inodegc */
struct shrinker *m_inodegc_shrinker;
/*
@@ -336,8 +382,10 @@ typedef struct xfs_mount {
#define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */
#define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */
#define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */
+#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */
/* Mount features */
+#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */
#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */
#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */
#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */
@@ -392,6 +440,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR)
__XFS_HAS_FEAT(large_extent_counts, NREXT64)
__XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
__XFS_HAS_FEAT(metadir, METADIR)
+__XFS_HAS_FEAT(zoned, ZONED)
+__XFS_HAS_FEAT(nolifetime, NOLIFETIME)
static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
{
@@ -402,7 +452,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
static inline bool xfs_has_rtsb(const struct xfs_mount *mp)
{
/* all rtgroups filesystems with an rt section have an rtsb */
- return xfs_has_rtgroups(mp) && xfs_has_realtime(mp);
+ return xfs_has_rtgroups(mp) &&
+ xfs_has_realtime(mp) &&
+ !xfs_has_zoned(mp);
}
static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp)
@@ -417,6 +469,16 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp)
xfs_has_reflink(mp);
}
+static inline bool xfs_has_nonzoned(const struct xfs_mount *mp)
+{
+ return !xfs_has_zoned(mp);
+}
+
+static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp)
+{
+ return xfs_has_reflink(mp);
+}
+
/*
* Some features are always on for v5 file systems, allow the compiler to
* eliminiate dead code when building without v4 support.
@@ -496,10 +558,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
*/
#define XFS_OPSTATE_BLOCKGC_ENABLED 6
-/* Kernel has logged a warning about pNFS being used on this fs. */
-#define XFS_OPSTATE_WARNED_PNFS 7
-/* Kernel has logged a warning about online fsck being used on this fs. */
-#define XFS_OPSTATE_WARNED_SCRUB 8
/* Kernel has logged a warning about shrink being used on this fs. */
#define XFS_OPSTATE_WARNED_SHRINK 9
/* Kernel has logged a warning about logged xattr updates being used. */
@@ -512,14 +570,14 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
#define XFS_OPSTATE_USE_LARP 13
/* Kernel has logged a warning about blocksize > pagesize on this fs. */
#define XFS_OPSTATE_WARNED_LBS 14
-/* Kernel has logged a warning about exchange-range being used on this fs. */
-#define XFS_OPSTATE_WARNED_EXCHRANGE 15
-/* Kernel has logged a warning about parent pointers being used on this fs. */
-#define XFS_OPSTATE_WARNED_PPTR 16
/* Kernel has logged a warning about metadata dirs being used on this fs. */
#define XFS_OPSTATE_WARNED_METADIR 17
/* Filesystem should use qflags to determine quotaon status */
#define XFS_OPSTATE_RESUMING_QUOTAON 18
+/* Kernel has logged a warning about zoned RT device being used on this fs. */
+#define XFS_OPSTATE_WARNED_ZONED 19
+/* (Zoned) GC is in progress */
+#define XFS_OPSTATE_ZONEGC_RUNNING 20
#define __XFS_IS_OPSTATE(name, NAME) \
static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
@@ -564,6 +622,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp)
#endif /* CONFIG_XFS_QUOTA */
__XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT)
__XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP)
+__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING)
static inline bool
xfs_should_warn(struct xfs_mount *mp, long nr)
@@ -579,7 +638,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr)
{ (1UL << XFS_OPSTATE_READONLY), "read_only" }, \
{ (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \
{ (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
- { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
{ (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
{ (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
{ (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \
@@ -633,7 +691,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
}
extern void xfs_uuid_table_free(void);
-extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
+uint64_t xfs_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
@@ -646,45 +705,74 @@ extern void xfs_unmountfs(xfs_mount_t *);
*/
#define XFS_FDBLOCKS_BATCH 1024
+uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+
/*
- * Estimate the amount of free space that is not available to userspace and is
- * not explicitly reserved from the incore fdblocks. This includes:
- *
- * - The minimum number of blocks needed to support splitting a bmap btree
- * - The blocks currently in use by the freespace btrees because they record
- * the actual blocks that will fill per-AG metadata space reservations
+ * Sum up the freecount, but never return negative values.
+ */
+static inline s64 xfs_sum_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ return percpu_counter_sum_positive(&mp->m_free[ctr].count);
+}
+
+/*
+ * Same as above, but does return negative values. Mostly useful for
+ * special cases like repair and tracing.
*/
-static inline uint64_t
-xfs_fdblocks_unavailable(
- struct xfs_mount *mp)
+static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
{
- return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
+ return percpu_counter_sum(&mp->m_free[ctr].count);
}
-int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+/*
+ * This just provides and estimate without the cpu-local updates, use
+ * xfs_sum_freecounter for the exact value.
+ */
+static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ return percpu_counter_read_positive(&mp->m_free[ctr].count);
+}
+
+static inline int xfs_compare_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr, s64 rhs, s32 batch)
+{
+ return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch);
+}
+
+static inline void xfs_set_freecounter(struct xfs_mount *mp,
+ enum xfs_free_counter ctr, uint64_t val)
+{
+ percpu_counter_set(&mp->m_free[ctr].count, val);
+}
+
+int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta, bool rsvd);
-void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter,
+void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr,
uint64_t delta);
static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta,
bool reserved)
{
- return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved);
+ return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved);
}
static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta)
{
- xfs_add_freecounter(mp, &mp->m_fdblocks, delta);
+ xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta);
}
static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta)
{
- return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false);
+ return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false);
}
static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta)
{
- xfs_add_freecounter(mp, &mp->m_frextents, delta);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta);
}
extern int xfs_readsb(xfs_mount_t *, int);
@@ -706,5 +794,29 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature);
bool xfs_clear_incompat_log_features(struct xfs_mount *mp);
void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta,
int64_t ind_delta);
+static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta)
+{
+ percpu_counter_add(&mp->m_delalloc_blks, delta);
+}
+
+int xfs_set_max_atomic_write_opt(struct xfs_mount *mp,
+ unsigned long long new_max_bytes);
+
+static inline struct xfs_buftarg *
+xfs_group_type_buftarg(
+ struct xfs_mount *mp,
+ enum xfs_group_type type)
+{
+ switch (type) {
+ case XG_TYPE_AG:
+ return mp->m_ddev_targp;
+ case XG_TYPE_RTG:
+ return mp->m_rtdev_targp;
+ default:
+ ASSERT(0);
+ break;
+ }
+ return NULL;
+}
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index d0f5b403bdbe..866c71d9fbae 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -320,7 +320,7 @@ xfs_mru_cache_create(
xfs_mru_cache_free_func_t free_func)
{
struct xfs_mru_cache *mru = NULL;
- int err = 0, grp;
+ int grp;
unsigned int grp_time;
if (mrup)
@@ -341,8 +341,8 @@ xfs_mru_cache_create(
mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
GFP_KERNEL | __GFP_NOFAIL);
if (!mru->lists) {
- err = -ENOMEM;
- goto exit;
+ kfree(mru);
+ return -ENOMEM;
}
for (grp = 0; grp < mru->grp_count; grp++)
@@ -361,14 +361,7 @@ xfs_mru_cache_create(
mru->free_func = free_func;
mru->data = data;
*mrup = mru;
-
-exit:
- if (err && mru && mru->lists)
- kfree(mru->lists);
- if (err && mru)
- kfree(mru);
-
- return err;
+ return 0;
}
/*
@@ -414,6 +407,8 @@ xfs_mru_cache_destroy(
* To insert an element, call xfs_mru_cache_insert() with the data store, the
* element's key and the client data pointer. This function returns 0 on
* success or ENOMEM if memory for the data element couldn't be allocated.
+ *
+ * The passed in elem is freed through the per-cache free_func on failure.
*/
int
xfs_mru_cache_insert(
@@ -421,14 +416,11 @@ xfs_mru_cache_insert(
unsigned long key,
struct xfs_mru_cache_elem *elem)
{
- int error;
-
- ASSERT(mru && mru->lists);
- if (!mru || !mru->lists)
- return -EINVAL;
+ int error = -EINVAL;
+ error = -ENOMEM;
if (radix_tree_preload(GFP_KERNEL))
- return -ENOMEM;
+ goto out_free;
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
@@ -440,6 +432,12 @@ xfs_mru_cache_insert(
_xfs_mru_cache_list_insert(mru, elem);
spin_unlock(&mru->lock);
+ if (error)
+ goto out_free;
+ return 0;
+
+out_free:
+ mru->free_func(mru->data, elem);
return error;
}
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index ed8d8ed42f0a..fbeddcac4792 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -127,7 +127,7 @@ xfs_dax_notify_failure_freeze(
struct super_block *sb = mp->m_super;
int error;
- error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+ error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL);
if (error)
xfs_emerg(mp, "already frozen by kernel, err=%d", error);
@@ -143,7 +143,7 @@ xfs_dax_notify_failure_thaw(
int error;
if (kernel_frozen) {
- error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+ error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL);
if (error)
xfs_emerg(mp, "still frozen after notify failure, err=%d",
error);
@@ -153,7 +153,7 @@ xfs_dax_notify_failure_thaw(
* Also thaw userspace call anyway because the device is about to be
* removed immediately.
*/
- thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+ thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL);
}
static int
@@ -253,8 +253,7 @@ xfs_dax_notify_dev_failure(
return -EOPNOTSUPP;
}
- error = xfs_dax_translate_range(type == XG_TYPE_RTG ?
- mp->m_rtdev_targp : mp->m_ddev_targp,
+ error = xfs_dax_translate_range(xfs_group_type_buftarg(mp, type),
offset, len, &daddr, &bblen);
if (error)
return error;
@@ -280,10 +279,7 @@ xfs_dax_notify_dev_failure(
kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
}
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- goto out;
-
+ tp = xfs_trans_alloc_empty(mp);
start_gno = xfs_fsb_to_gno(mp, start_bno, type);
end_gno = xfs_fsb_to_gno(mp, end_bno, type);
while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
@@ -354,7 +350,6 @@ xfs_dax_notify_dev_failure(
error = -EFSCORRUPTED;
}
-out:
/* Thaw the fs if it has been frozen before. */
if (mf_flags & MF_MEM_PRE_REMOVE)
xfs_dax_notify_failure_thaw(mp, kernel_frozen);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 6f4479deac6d..afe7497012d4 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -58,8 +58,6 @@ xfs_fs_get_uuid(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS);
-
if (*len < sizeof(uuid_t))
return -EINVAL;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e1ba5af6250f..23ba84ec919a 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -134,6 +134,7 @@ xfs_qm_dqpurge(
dqp->q_flags |= XFS_DQFLAG_FREEING;
+ xfs_qm_dqunpin_wait(dqp);
xfs_dqflock(dqp);
/*
@@ -465,6 +466,7 @@ xfs_qm_dquot_isolate(
struct xfs_dquot *dqp = container_of(item,
struct xfs_dquot, q_lru);
struct xfs_qm_isolate *isol = arg;
+ enum lru_status ret = LRU_SKIP;
if (!xfs_dqlock_nowait(dqp))
goto out_miss_busy;
@@ -478,6 +480,16 @@ xfs_qm_dquot_isolate(
goto out_miss_unlock;
/*
+ * If the dquot is pinned or dirty, rotate it to the end of the LRU to
+ * give some time for it to be cleaned before we try to isolate it
+ * again.
+ */
+ ret = LRU_ROTATE;
+ if (XFS_DQ_IS_DIRTY(dqp) || atomic_read(&dqp->q_pincount) > 0) {
+ goto out_miss_unlock;
+ }
+
+ /*
* This dquot has acquired a reference in the meantime remove it from
* the freelist and try again.
*/
@@ -492,41 +504,14 @@ xfs_qm_dquot_isolate(
}
/*
- * If the dquot is dirty, flush it. If it's already being flushed, just
- * skip it so there is time for the IO to complete before we try to
- * reclaim it again on the next LRU pass.
+ * The dquot may still be under IO, in which case the flush lock will be
+ * held. If we can't get the flush lock now, just skip over the dquot as
+ * if it was dirty.
*/
if (!xfs_dqflock_nowait(dqp))
goto out_miss_unlock;
- if (XFS_DQ_IS_DIRTY(dqp)) {
- struct xfs_buf *bp = NULL;
- int error;
-
- trace_xfs_dqreclaim_dirty(dqp);
-
- /* we have to drop the LRU lock to flush the dquot */
- spin_unlock(&lru->lock);
-
- error = xfs_dquot_use_attached_buf(dqp, &bp);
- if (!bp || error == -EAGAIN) {
- xfs_dqfunlock(dqp);
- goto out_unlock_dirty;
- }
-
- /*
- * dqflush completes dqflock on error, and the delwri ioend
- * does it on success.
- */
- error = xfs_qm_dqflush(dqp, bp);
- if (error)
- goto out_unlock_dirty;
-
- xfs_buf_delwri_queue(bp, &isol->buffers);
- xfs_buf_relse(bp);
- goto out_unlock_dirty;
- }
-
+ ASSERT(!XFS_DQ_IS_DIRTY(dqp));
xfs_dquot_detach_buf(dqp);
xfs_dqfunlock(dqp);
@@ -548,13 +533,7 @@ out_miss_unlock:
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
- return LRU_SKIP;
-
-out_unlock_dirty:
- trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
- xfs_dqunlock(dqp);
- return LRU_RETRY;
+ return ret;
}
static unsigned long
@@ -681,10 +660,7 @@ xfs_qm_load_metadir_qinos(
struct xfs_trans *tp;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_dqinode_load_parent(tp, &qi->qi_dirip);
if (error == -ENOENT) {
/* no quota dir directory, but we'll create one later */
@@ -1486,7 +1462,6 @@ xfs_qm_flush_one(
struct xfs_dquot *dqp,
void *data)
{
- struct xfs_mount *mp = dqp->q_mount;
struct list_head *buffer_list = data;
struct xfs_buf *bp = NULL;
int error = 0;
@@ -1497,34 +1472,8 @@ xfs_qm_flush_one(
if (!XFS_DQ_IS_DIRTY(dqp))
goto out_unlock;
- /*
- * The only way the dquot is already flush locked by the time quotacheck
- * gets here is if reclaim flushed it before the dqadjust walk dirtied
- * it for the final time. Quotacheck collects all dquot bufs in the
- * local delwri queue before dquots are dirtied, so reclaim can't have
- * possibly queued it for I/O. The only way out is to push the buffer to
- * cycle the flush lock.
- */
- if (!xfs_dqflock_nowait(dqp)) {
- /* buf is pinned in-core by delwri list */
- error = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
- if (error)
- goto out_unlock;
-
- if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- error = -EAGAIN;
- xfs_buf_relse(bp);
- goto out_unlock;
- }
- xfs_buf_unlock(bp);
-
- xfs_buf_delwri_pushbuf(bp, buffer_list);
- xfs_buf_rele(bp);
-
- error = -EAGAIN;
- goto out_unlock;
- }
+ xfs_qm_dqunpin_wait(dqp);
+ xfs_dqflock(dqp);
error = xfs_dquot_use_attached_buf(dqp, &bp);
if (error)
@@ -1711,7 +1660,8 @@ xfs_qm_mount_quotas(
* immediately. We only support rtquota if rtgroups are enabled to
* avoid problems with older kernels.
*/
- if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) {
+ if (mp->m_sb.sb_rextents &&
+ (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) {
xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
mp->m_qflags = 0;
goto write_changes;
@@ -1802,10 +1752,7 @@ xfs_qm_qino_load(
struct xfs_inode *dp = NULL;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
if (xfs_has_metadir(mp)) {
error = xfs_dqinode_load_parent(tp, &dp);
if (error)
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 37f1230e7584..245d754f382a 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -78,6 +78,28 @@ xfs_qm_statvfs(
}
}
+STATIC int
+xfs_qm_validate_state_change(
+ struct xfs_mount *mp,
+ uint uqd,
+ uint gqd,
+ uint pqd)
+{
+ int state;
+
+ /* Is quota state changing? */
+ state = ((uqd && !XFS_IS_UQUOTA_ON(mp)) ||
+ (!uqd && XFS_IS_UQUOTA_ON(mp)) ||
+ (gqd && !XFS_IS_GQUOTA_ON(mp)) ||
+ (!gqd && XFS_IS_GQUOTA_ON(mp)) ||
+ (pqd && !XFS_IS_PQUOTA_ON(mp)) ||
+ (!pqd && XFS_IS_PQUOTA_ON(mp)));
+
+ return state &&
+ (xfs_dev_is_read_only(mp, "changing quota state") ||
+ xfs_has_norecovery(mp));
+}
+
int
xfs_qm_newmount(
xfs_mount_t *mp,
@@ -97,24 +119,25 @@ xfs_qm_newmount(
}
/*
- * If the device itself is read-only, we can't allow
- * the user to change the state of quota on the mount -
- * this would generate a transaction on the ro device,
- * which would lead to an I/O error and shutdown
+ * If the device itself is read-only and/or in norecovery
+ * mode, we can't allow the user to change the state of
+ * quota on the mount - this would generate a transaction
+ * on the ro device, which would lead to an I/O error and
+ * shutdown.
*/
- if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
- (!uquotaondisk && XFS_IS_UQUOTA_ON(mp)) ||
- (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
- (!gquotaondisk && XFS_IS_GQUOTA_ON(mp)) ||
- (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
- (!pquotaondisk && XFS_IS_PQUOTA_ON(mp))) &&
- xfs_dev_is_read_only(mp, "changing quota state")) {
- xfs_warn(mp, "please mount with%s%s%s%s.",
- (!quotaondisk ? "out quota" : ""),
- (uquotaondisk ? " usrquota" : ""),
- (gquotaondisk ? " grpquota" : ""),
- (pquotaondisk ? " prjquota" : ""));
+ if (xfs_qm_validate_state_change(mp, uquotaondisk,
+ gquotaondisk, pquotaondisk)) {
+
+ if (xfs_has_metadir(mp))
+ xfs_warn(mp,
+ "metadir enabled, please mount without any quota mount options");
+ else
+ xfs_warn(mp, "please mount with%s%s%s%s.",
+ (!quotaondisk ? "out quota" : ""),
+ (uquotaondisk ? " usrquota" : ""),
+ (gquotaondisk ? " grpquota" : ""),
+ (pquotaondisk ? " prjquota" : ""));
return -EPERM;
}
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe2d7aab8554..3728234699a2 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -78,6 +78,11 @@ xfs_cui_item_size(
*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
}
+unsigned int xfs_cui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
*nbytes += sizeof(struct xfs_cud_log_format);
}
+unsigned int xfs_cud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cud log item. We use only 1 iovec, and we point that
@@ -707,18 +717,18 @@ xlog_recover_cui_commit_pass2(
struct xfs_cui_log_format *cui_formatp;
size_t len;
- cui_formatp = item->ri_buf[0].i_addr;
+ cui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -749,18 +759,18 @@ xlog_recover_rtcui_commit_pass2(
struct xfs_cui_log_format *cui_formatp;
size_t len;
- cui_formatp = item->ri_buf[0].i_addr;
+ cui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_cui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_cui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_cui_log_format_sizeof(cui_formatp->cui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -781,7 +791,7 @@ xlog_recover_rtcui_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -807,10 +817,10 @@ xlog_recover_cud_commit_pass2(
{
struct xfs_cud_log_format *cud_formatp;
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ cud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -833,10 +843,10 @@ xlog_recover_rtcud_commit_pass2(
{
struct xfs_cud_log_format *cud_formatp;
- cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ cud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_cud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index bfee8f30c63c..0fc3f493342b 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
void xfs_refcount_defer_add(struct xfs_trans *tp,
struct xfs_refcount_intent *ri);
+unsigned int xfs_cui_log_space(unsigned int nr);
+unsigned int xfs_cud_log_space(void);
+
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 59f7fc16eb80..3f177b4ec131 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
+ if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-static int
+int
xfs_reflink_convert_cow_locked(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
@@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks(
if (isnullstartblock(del.br_startblock)) {
xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got,
- &del);
+ &del, 0);
} else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -786,35 +786,19 @@ xfs_reflink_update_quota(
* requirements as low as possible.
*/
STATIC int
-xfs_reflink_end_cow_extent(
+xfs_reflink_end_cow_extent_locked(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
- unsigned int resblks;
int nmaps;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
- resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- /*
- * Lock the inode. We have to ijoin without automatic unlock because
- * the lead transaction is the refcountbt record deletion; the data
- * fork update follows as a deferred log item.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
/*
@@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
}
del = got;
@@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
- goto out_cancel;
+ return error;
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
- goto out_cancel;
+ return error;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent(
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
- goto out_cancel;
+ return error;
ASSERT(done);
}
@@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
/* Update the caller about how much progress we made. */
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
+}
-out_cancel:
- xfs_trans_cancel(tp);
+/*
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
+ */
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int error;
+
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -973,6 +985,78 @@ xfs_reflink_end_cow(
}
/*
+ * Fully remap all of the file's data fork at once, which is the critical part
+ * in achieving atomic behaviour.
+ * The regular CoW end path does not use function as to keep the block
+ * reservation per transaction as low as possible.
+ */
+int
+xfs_reflink_end_atomic_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+ /*
+ * Each remapping operation could cause a btree split, so in the worst
+ * case that's one for each block.
+ */
+ resblks = (end_fsb - offset_fsb) *
+ XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ while (end_fsb > offset_fsb && !error) {
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
+ end_fsb);
+ }
+ if (error) {
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ goto out_cancel;
+ }
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/* Compute the largest atomic write that we can complete through software. */
+xfs_extlen_t
+xfs_reflink_max_atomic_cow(
+ struct xfs_mount *mp)
+{
+ /* We cannot do any atomic writes without out of place writes. */
+ if (!xfs_can_sw_atomic_write(mp))
+ return 0;
+
+ /*
+ * Atomic write limits must always be a power-of-2, according to
+ * generic_atomic_write_valid.
+ */
+ return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp));
+}
+
+/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
* metadata. The ondisk metadata does not track which inode created the
* staging extent, so callers must ensure that there are no cached inodes with
@@ -1207,15 +1291,9 @@ xfs_reflink_ag_has_free_space(
if (!xfs_has_rmapbt(mp))
return 0;
if (XFS_IS_REALTIME_INODE(ip)) {
- struct xfs_rtgroup *rtg;
- xfs_rgnumber_t rgno;
-
- rgno = xfs_rtb_to_rgno(mp, fsb);
- rtg = xfs_rtgroup_get(mp, rgno);
- if (xfs_metafile_resv_critical(rtg_rmap(rtg)))
- error = -ENOSPC;
- xfs_rtgroup_put(rtg);
- return error;
+ if (xfs_metafile_resv_critical(mp))
+ return -ENOSPC;
+ return 0;
}
agno = XFS_FSB_TO_AGNO(mp, fsb);
@@ -1538,7 +1616,7 @@ xfs_reflink_zero_posteof(
return 0;
trace_xfs_zero_eof(ip, isize, pos - isize);
- return xfs_zero_range(ip, isize, pos - isize, NULL);
+ return xfs_zero_range(ip, isize, pos - isize, NULL, NULL);
}
/*
@@ -1803,7 +1881,8 @@ xfs_reflink_unshare(
&xfs_dax_write_iomap_ops);
else
error = iomap_file_unshare(inode, offset, len,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops,
+ &xfs_iomap_write_ops);
if (error)
goto out;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index cc4e92278279..36cda724da89 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
@@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
@@ -64,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen,
bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize);
+xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp);
+
#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 89decffe76c8..15f0903f6fd4 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -77,6 +77,11 @@ xfs_rui_item_size(
*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
+unsigned int xfs_rui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
*nbytes += sizeof(struct xfs_rud_log_format);
}
+unsigned int xfs_rud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rud log item. We use only 1 iovec, and we point that
@@ -736,18 +746,18 @@ xlog_recover_rui_commit_pass2(
struct xfs_rui_log_format *rui_formatp;
size_t len;
- rui_formatp = item->ri_buf[0].i_addr;
+ rui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -778,18 +788,18 @@ xlog_recover_rtrui_commit_pass2(
struct xfs_rui_log_format *rui_formatp;
size_t len;
- rui_formatp = item->ri_buf[0].i_addr;
+ rui_formatp = item->ri_buf[0].iov_base;
- if (item->ri_buf[0].i_len < xfs_rui_log_format_sizeof(0)) {
+ if (item->ri_buf[0].iov_len < xfs_rui_log_format_sizeof(0)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
len = xfs_rui_log_format_sizeof(rui_formatp->rui_nextents);
- if (item->ri_buf[0].i_len != len) {
+ if (item->ri_buf[0].iov_len != len) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -810,7 +820,7 @@ xlog_recover_rtrui_commit_pass2(
xfs_lsn_t lsn)
{
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+ item->ri_buf[0].iov_base, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
#endif
@@ -836,10 +846,10 @@ xlog_recover_rud_commit_pass2(
{
struct xfs_rud_log_format *rud_formatp;
- rud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ rud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- rud_formatp, item->ri_buf[0].i_len);
+ rud_formatp, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
@@ -862,10 +872,10 @@ xlog_recover_rtrud_commit_pass2(
{
struct xfs_rud_log_format *rud_formatp;
- rud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_rud_log_format)) {
+ rud_formatp = item->ri_buf[0].iov_base;
+ if (item->ri_buf[0].iov_len != sizeof(struct xfs_rud_log_format)) {
XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, log->l_mp,
- rud_formatp, item->ri_buf[0].i_len);
+ rud_formatp, item->ri_buf[0].iov_len);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 40d331555675..3a99f0117f2d 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+unsigned int xfs_rui_log_space(unsigned int nr);
+unsigned int xfs_rud_log_space(void);
+
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index d8e6d073d64d..6907e871fa15 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -33,6 +33,7 @@
#include "xfs_trace.h"
#include "xfs_rtrefcount_btree.h"
#include "xfs_reflink.h"
+#include "xfs_zone_alloc.h"
/*
* Return whether there are any free extents in the size range given
@@ -663,7 +664,8 @@ xfs_rtunmount_rtg(
for (i = 0; i < XFS_RTGI_MAX; i++)
xfs_rtginode_irele(&rtg->rtg_inodes[i]);
- kvfree(rtg->rtg_rsum_cache);
+ if (!xfs_has_zoned(rtg_mount(rtg)))
+ kvfree(rtg->rtg_rsum_cache);
}
static int
@@ -727,9 +729,7 @@ xfs_rtginode_ensure(
if (rtg->rtg_inodes[type])
return 0;
- error = xfs_trans_alloc_empty(rtg_mount(rtg), &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(rtg_mount(rtg));
error = xfs_rtginode_load(rtg, type, tp);
xfs_trans_cancel(tp);
@@ -837,7 +837,7 @@ xfs_growfs_rt_init_rtsb(
return 0;
error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1),
- 0, &rtsb_bp);
+ &rtsb_bp);
if (error)
return error;
@@ -858,6 +858,84 @@ xfs_growfs_rt_init_rtsb(
return error;
}
+static void
+xfs_growfs_rt_sb_fields(
+ struct xfs_trans *tp,
+ const struct xfs_mount *nmp)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+
+ if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE,
+ nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
+ if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
+ if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS,
+ nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
+ if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS,
+ nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
+ if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
+ if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT,
+ nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
+}
+
+static int
+xfs_growfs_rt_zoned(
+ struct xfs_rtgroup *rtg,
+ xfs_rfsblock_t nrblocks)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_mount *nmp;
+ struct xfs_trans *tp;
+ xfs_rtbxlen_t freed_rtx;
+ int error;
+
+ /*
+ * Calculate new sb and mount fields for this round. Also ensure the
+ * rtg_extents value is uptodate as the rtbitmap code relies on it.
+ */
+ nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks,
+ mp->m_sb.sb_rextsize);
+ if (!nmp)
+ return -ENOMEM;
+ freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents;
+
+ xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg),
+ nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents);
+
+ error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp);
+ if (error)
+ goto out_free;
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_growfs_rt_sb_fields(tp, nmp);
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx);
+
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_free;
+
+ /*
+ * Ensure the mount RT feature flag is now set, and compute new
+ * maxlevels for rt btrees.
+ */
+ mp->m_features |= XFS_FEAT_REALTIME;
+ xfs_rtrmapbt_compute_maxlevels(mp);
+ xfs_rtrefcountbt_compute_maxlevels(mp);
+ xfs_zoned_add_available(mp, freed_rtx);
+out_free:
+ kfree(nmp);
+ return error;
+}
+
static int
xfs_growfs_rt_bmblock(
struct xfs_rtgroup *rtg,
@@ -943,24 +1021,7 @@ xfs_growfs_rt_bmblock(
/*
* Update superblock fields.
*/
- if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE,
- nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize);
- if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS,
- nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks);
- if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS,
- nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks);
- if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS,
- nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents);
- if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG,
- nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog);
- if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount)
- xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT,
- nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount);
+ xfs_growfs_rt_sb_fields(args.tp, nmp);
/*
* Free the new extent.
@@ -1127,6 +1188,11 @@ xfs_growfs_rtg(
goto out_rele;
}
+ if (xfs_has_zoned(mp)) {
+ error = xfs_growfs_rt_zoned(rtg, nrblocks);
+ goto out_rele;
+ }
+
error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks);
if (error)
goto out_rele;
@@ -1144,10 +1210,8 @@ xfs_growfs_rtg(
goto out_error;
}
- if (old_rsum_cache)
- kvfree(old_rsum_cache);
- xfs_rtgroup_rele(rtg);
- return 0;
+ kvfree(old_rsum_cache);
+ goto out_rele;
out_error:
/*
@@ -1193,8 +1257,26 @@ xfs_growfs_check_rtgeom(
kfree(nmp);
+ trace_xfs_growfs_check_rtgeom(mp, min_logfsbs);
+
if (min_logfsbs > mp->m_sb.sb_logblocks)
return -EINVAL;
+
+ if (xfs_has_zoned(mp)) {
+ uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks;
+ uint32_t rem;
+
+ if (rextsize != 1)
+ return -EINVAL;
+ div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem);
+ if (rem) {
+ xfs_warn(mp,
+"new RT volume size (%lld) not aligned to RT group size (%d)",
+ mp->m_sb.sb_rblocks, gblocks);
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -1221,9 +1303,7 @@ xfs_growfs_rt_prep_groups(
if (!mp->m_rtdirip) {
struct xfs_trans *tp;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
+ tp = xfs_trans_alloc_empty(mp);
error = xfs_rtginode_load_parent(tp);
xfs_trans_cancel(tp);
@@ -1249,6 +1329,35 @@ xfs_grow_last_rtg(
}
/*
+ * Read in the last block of the RT device to make sure it is accessible.
+ */
+static int
+xfs_rt_check_size(
+ struct xfs_mount *mp,
+ xfs_rfsblock_t last_block)
+{
+ xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block);
+ struct xfs_buf *bp;
+ int error;
+
+ if (XFS_BB_TO_FSB(mp, daddr) != last_block) {
+ xfs_warn(mp, "RT device size overflow: %llu != %llu",
+ XFS_BB_TO_FSB(mp, daddr), last_block);
+ return -EFBIG;
+ }
+
+ error = xfs_buf_read_uncached(mp->m_rtdev_targp,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr,
+ XFS_FSB_TO_BB(mp, 1), &bp, NULL);
+ if (error)
+ xfs_warn(mp, "cannot read last RT device sector (%lld)",
+ last_block);
+ else
+ xfs_buf_relse(bp);
+ return error;
+}
+
+/*
* Grow the realtime area of the filesystem.
*/
int
@@ -1259,7 +1368,6 @@ xfs_growfs_rt(
xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount;
xfs_rgnumber_t new_rgcount = 1;
xfs_rgnumber_t rgno;
- struct xfs_buf *bp;
xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize;
int error;
@@ -1302,15 +1410,10 @@ xfs_growfs_rt(
error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks);
if (error)
goto out_unlock;
- /*
- * Read in the last block of the device, make sure it exists.
- */
- error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- XFS_FSB_TO_BB(mp, in->newblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+
+ error = xfs_rt_check_size(mp, in->newblocks - 1);
if (error)
goto out_unlock;
- xfs_buf_relse(bp);
/*
* Calculate new parameters. These are the final values to be reached.
@@ -1376,8 +1479,7 @@ xfs_growfs_rt(
error = error2;
/* Reset the rt metadata btree space reservations. */
- xfs_rt_resv_free(mp);
- error2 = xfs_rt_resv_init(mp);
+ error2 = xfs_metafile_resv_init(mp);
if (error2 && error2 != -ENOSPC)
error = error2;
}
@@ -1407,7 +1509,7 @@ xfs_rtmount_readsb(
/* m_blkbb_log is not set up yet */
error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR,
- mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp,
+ mp->m_sb.sb_blocksize >> BBSHIFT, &bp,
&xfs_rtsb_buf_ops);
if (error) {
xfs_warn(mp, "rt sb validate failed with error %d.", error);
@@ -1444,10 +1546,6 @@ int /* error */
xfs_rtmount_init(
struct xfs_mount *mp) /* file system mount structure */
{
- struct xfs_buf *bp; /* buffer for last block of subvolume */
- xfs_daddr_t d; /* address of last block of subvolume */
- int error;
-
if (mp->m_sb.sb_rblocks == 0)
return 0;
if (mp->m_rtdev_targp == NULL) {
@@ -1458,25 +1556,7 @@ xfs_rtmount_init(
mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels);
- /*
- * Check that the realtime section is an ok size.
- */
- d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
- if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
- xfs_warn(mp, "realtime mount -- %llu != %llu",
- (unsigned long long) XFS_BB_TO_FSB(mp, d),
- (unsigned long long) mp->m_sb.sb_rblocks);
- return -EFBIG;
- }
- error = xfs_buf_read_uncached(mp->m_rtdev_targp,
- d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
- if (error) {
- xfs_warn(mp, "realtime device size check failed");
- return error;
- }
- xfs_buf_relse(bp);
- return 0;
+ return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1);
}
static int
@@ -1519,50 +1599,10 @@ xfs_rtalloc_reinit_frextents(
spin_lock(&mp->m_sb_lock);
mp->m_sb.sb_frextents = val;
spin_unlock(&mp->m_sb_lock);
- percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents);
return 0;
}
-/* Free space reservations for rt metadata inodes. */
-void
-xfs_rt_resv_free(
- struct xfs_mount *mp)
-{
- struct xfs_rtgroup *rtg = NULL;
- unsigned int i;
-
- while ((rtg = xfs_rtgroup_next(mp, rtg))) {
- for (i = 0; i < XFS_RTGI_MAX; i++)
- xfs_metafile_resv_free(rtg->rtg_inodes[i]);
- }
-}
-
-/* Reserve space for rt metadata inodes' space expansion. */
-int
-xfs_rt_resv_init(
- struct xfs_mount *mp)
-{
- struct xfs_rtgroup *rtg = NULL;
- xfs_filblks_t ask;
- int error = 0;
-
- while ((rtg = xfs_rtgroup_next(mp, rtg))) {
- int err2;
-
- ask = xfs_rtrmapbt_calc_reserves(mp);
- err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask);
- if (err2 && !error)
- error = err2;
-
- ask = xfs_rtrefcountbt_calc_reserves(mp);
- err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask);
- if (err2 && !error)
- error = err2;
- }
-
- return error;
-}
-
/*
* Read in the bmbt of an rt metadata inode so that we never have to load them
* at runtime. This enables the use of shared ILOCKs for rtbitmap scans. Use
@@ -1613,6 +1653,8 @@ xfs_rtmount_rtg(
}
}
+ if (xfs_has_zoned(mp))
+ return 0;
return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks);
}
@@ -1628,10 +1670,7 @@ xfs_rtmount_inodes(
struct xfs_rtgroup *rtg = NULL;
int error;
- error = xfs_trans_alloc_empty(mp, &tp);
- if (error)
- return error;
-
+ tp = xfs_trans_alloc_empty(mp);
if (xfs_has_rtgroups(mp) && mp->m_sb.sb_rgcount > 0) {
error = xfs_rtginode_load_parent(tp);
if (error)
@@ -2097,6 +2136,8 @@ xfs_bmap_rtalloc(
ap->datatype & XFS_ALLOC_INITIAL_USER_DATA;
int error;
+ ASSERT(!xfs_has_zoned(ap->tp->t_mountp));
+
retry:
error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign);
if (error)
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 0d95b29092c9..78a690b489ed 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -34,9 +34,6 @@ int /* error */
xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
-void xfs_rt_resv_free(struct xfs_mount *mp);
-int xfs_rt_resv_init(struct xfs_mount *mp);
-
/*
* Grow the realtime area of the filesystem.
*/
@@ -65,8 +62,6 @@ xfs_rtmount_init(
}
# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS))
# define xfs_rtunmount_inodes(m)
-# define xfs_rt_resv_free(mp) ((void)0)
-# define xfs_rt_resv_init(mp) (0)
static inline int
xfs_growfs_check_rtgeom(const struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d92d7a07ea89..bb0a82635a77 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -46,6 +46,7 @@
#include "xfs_exchmaps_item.h"
#include "xfs_parent.h"
#include "xfs_rtalloc.h"
+#include "xfs_zone_alloc.h"
#include "scrub/stats.h"
#include "scrub/rcbag_btree.h"
@@ -109,7 +110,8 @@ enum {
Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota,
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
- Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones,
+ Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write,
};
static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -154,6 +156,10 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = {
fsparam_flag("nodiscard", Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+ fsparam_u32("max_open_zones", Opt_max_open_zones),
+ fsparam_flag("lifetime", Opt_lifetime),
+ fsparam_flag("nolifetime", Opt_nolifetime),
+ fsparam_string("max_atomic_write", Opt_max_atomic_write),
{}
};
@@ -182,6 +188,7 @@ xfs_fs_show_options(
{ XFS_FEAT_LARGE_IOSIZE, ",largeio" },
{ XFS_FEAT_DAX_ALWAYS, ",dax=always" },
{ XFS_FEAT_DAX_NEVER, ",dax=never" },
+ { XFS_FEAT_NOLIFETIME, ",nolifetime" },
{ 0, NULL }
};
struct xfs_mount *mp = XFS_M(root->d_sb);
@@ -233,6 +240,12 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
+ if (mp->m_max_open_zones)
+ seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones);
+ if (mp->m_awu_max_bytes)
+ seq_printf(m, ",max_atomic_write=%lluk",
+ mp->m_awu_max_bytes >> 10);
+
return 0;
}
@@ -371,10 +384,11 @@ xfs_blkdev_get(
struct file **bdev_filep)
{
int error = 0;
+ blk_mode_t mode;
- *bdev_filep = bdev_file_open_by_path(name,
- BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
- mp->m_super, &fs_holder_ops);
+ mode = sb_open_mode(mp->m_super->s_flags);
+ *bdev_filep = bdev_file_open_by_path(name, mode,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*bdev_filep)) {
error = PTR_ERR(*bdev_filep);
*bdev_filep = NULL;
@@ -472,21 +486,29 @@ xfs_open_devices(
/*
* Setup xfs_mount buffer target pointers
*/
- error = -ENOMEM;
mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
- if (!mp->m_ddev_targp)
+ if (IS_ERR(mp->m_ddev_targp)) {
+ error = PTR_ERR(mp->m_ddev_targp);
+ mp->m_ddev_targp = NULL;
goto out_close_rtdev;
+ }
if (rtdev_file) {
mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
- if (!mp->m_rtdev_targp)
+ if (IS_ERR(mp->m_rtdev_targp)) {
+ error = PTR_ERR(mp->m_rtdev_targp);
+ mp->m_rtdev_targp = NULL;
goto out_free_ddev_targ;
+ }
}
if (logdev_file && file_bdev(logdev_file) != ddev) {
mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
- if (!mp->m_logdev_targp)
+ if (IS_ERR(mp->m_logdev_targp)) {
+ error = PTR_ERR(mp->m_logdev_targp);
+ mp->m_logdev_targp = NULL;
goto out_free_rtdev_targ;
+ }
} else {
mp->m_logdev_targp = mp->m_ddev_targp;
/* Handle won't be used, drop it */
@@ -519,7 +541,7 @@ xfs_setup_devices(
{
int error;
- error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
+ error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -528,13 +550,21 @@ xfs_setup_devices(
if (xfs_has_sector(mp))
log_sector_size = mp->m_sb.sb_logsectsize;
- error = xfs_setsize_buftarg(mp->m_logdev_targp,
+ error = xfs_configure_buftarg(mp->m_logdev_targp,
log_sector_size);
if (error)
return error;
}
- if (mp->m_rtdev_targp) {
- error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+
+ if (mp->m_sb.sb_rtstart) {
+ if (mp->m_rtdev_targp) {
+ xfs_warn(mp,
+ "can't use internal and external rtdev at the same time");
+ return -EINVAL;
+ }
+ mp->m_rtdev_targp = mp->m_ddev_targp;
+ } else if (mp->m_rtname) {
+ error = xfs_configure_buftarg(mp->m_rtdev_targp,
mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -751,13 +781,24 @@ xfs_fs_drop_inode(
return generic_drop_inode(inode);
}
+STATIC void
+xfs_fs_evict_inode(
+ struct inode *inode)
+{
+ if (IS_DAX(inode))
+ dax_break_layout_final(inode);
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
static void
xfs_mount_free(
struct xfs_mount *mp)
{
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
- if (mp->m_rtdev_targp)
+ if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_rtdev_targp);
if (mp->m_ddev_targp)
xfs_free_buftarg(mp->m_ddev_targp);
@@ -814,6 +855,7 @@ xfs_fs_sync_fs(
if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) {
xfs_inodegc_stop(mp);
xfs_blockgc_stop(mp);
+ xfs_zone_gc_stop(mp);
}
return 0;
@@ -834,10 +876,12 @@ xfs_statfs_data(
struct kstatfs *st)
{
int64_t fdblocks =
- percpu_counter_sum(&mp->m_fdblocks);
+ xfs_sum_freecounter(mp, XC_FREE_BLOCKS);
/* make sure st->f_bfree does not underflow */
- st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp));
+ st->f_bfree = max(0LL,
+ fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS));
+
/*
* sb_dblocks can change during growfs, but nothing cares about reporting
* the old or new value during growfs.
@@ -856,8 +900,9 @@ xfs_statfs_rt(
struct kstatfs *st)
{
st->f_bfree = xfs_rtbxlen_to_blen(mp,
- percpu_counter_sum_positive(&mp->m_frextents));
- st->f_blocks = mp->m_sb.sb_rblocks;
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp,
+ mp->m_free[XC_FREE_RTEXTENTS].res_total);
}
static void
@@ -922,24 +967,32 @@ xfs_fs_statfs(
}
STATIC void
-xfs_save_resvblks(struct xfs_mount *mp)
+xfs_save_resvblks(
+ struct xfs_mount *mp)
{
- mp->m_resblks_save = mp->m_resblks;
- xfs_reserve_blocks(mp, 0);
+ enum xfs_free_counter i;
+
+ for (i = 0; i < XC_FREE_NR; i++) {
+ mp->m_free[i].res_saved = mp->m_free[i].res_total;
+ xfs_reserve_blocks(mp, i, 0);
+ }
}
STATIC void
-xfs_restore_resvblks(struct xfs_mount *mp)
+xfs_restore_resvblks(
+ struct xfs_mount *mp)
{
- uint64_t resblks;
+ uint64_t resblks;
+ enum xfs_free_counter i;
- if (mp->m_resblks_save) {
- resblks = mp->m_resblks_save;
- mp->m_resblks_save = 0;
- } else
- resblks = xfs_default_resblks(mp);
-
- xfs_reserve_blocks(mp, resblks);
+ for (i = 0; i < XC_FREE_NR; i++) {
+ if (mp->m_free[i].res_saved) {
+ resblks = mp->m_free[i].res_saved;
+ mp->m_free[i].res_saved = 0;
+ } else
+ resblks = xfs_default_resblks(mp, i);
+ xfs_reserve_blocks(mp, i, resblks);
+ }
}
/*
@@ -976,6 +1029,7 @@ xfs_fs_freeze(
if (ret && !xfs_is_readonly(mp)) {
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
+ xfs_zone_gc_start(mp);
}
return ret;
@@ -997,6 +1051,7 @@ xfs_fs_unfreeze(
* filesystem.
*/
if (!xfs_is_readonly(mp)) {
+ xfs_zone_gc_start(mp);
xfs_blockgc_start(mp);
xfs_inodegc_start(mp);
}
@@ -1058,6 +1113,19 @@ xfs_finish_flags(
return -EINVAL;
}
+ if (!xfs_has_zoned(mp)) {
+ if (mp->m_max_open_zones) {
+ xfs_warn(mp,
+"max_open_zones mount option only supported on zoned file systems.");
+ return -EINVAL;
+ }
+ if (mp->m_features & XFS_FEAT_NOLIFETIME) {
+ xfs_warn(mp,
+"nolifetime mount option only supported on zoned file systems.");
+ return -EINVAL;
+ }
+ }
+
return 0;
}
@@ -1065,7 +1133,8 @@ static int
xfs_init_percpu_counters(
struct xfs_mount *mp)
{
- int error;
+ int error;
+ int i;
error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
if (error)
@@ -1075,30 +1144,29 @@ xfs_init_percpu_counters(
if (error)
goto free_icount;
- error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
- if (error)
- goto free_ifree;
-
error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL);
if (error)
- goto free_fdblocks;
+ goto free_ifree;
error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL);
if (error)
goto free_delalloc;
- error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL);
- if (error)
- goto free_delalloc_rt;
+ for (i = 0; i < XC_FREE_NR; i++) {
+ error = percpu_counter_init(&mp->m_free[i].count, 0,
+ GFP_KERNEL);
+ if (error)
+ goto free_freecounters;
+ }
return 0;
-free_delalloc_rt:
+free_freecounters:
+ while (--i >= 0)
+ percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
free_delalloc:
percpu_counter_destroy(&mp->m_delalloc_blks);
-free_fdblocks:
- percpu_counter_destroy(&mp->m_fdblocks);
free_ifree:
percpu_counter_destroy(&mp->m_ifree);
free_icount:
@@ -1112,24 +1180,28 @@ xfs_reinit_percpu_counters(
{
percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
- percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
- percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents);
+ xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks);
+ if (!xfs_has_zoned(mp))
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ mp->m_sb.sb_frextents);
}
static void
xfs_destroy_percpu_counters(
struct xfs_mount *mp)
{
+ enum xfs_free_counter i;
+
+ for (i = 0; i < XC_FREE_NR; i++)
+ percpu_counter_destroy(&mp->m_free[i].count);
percpu_counter_destroy(&mp->m_icount);
percpu_counter_destroy(&mp->m_ifree);
- percpu_counter_destroy(&mp->m_fdblocks);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_rtextents) == 0);
percpu_counter_destroy(&mp->m_delalloc_rtextents);
ASSERT(xfs_is_shutdown(mp) ||
percpu_counter_sum(&mp->m_delalloc_blks) == 0);
percpu_counter_destroy(&mp->m_delalloc_blks);
- percpu_counter_destroy(&mp->m_frextents);
}
static int
@@ -1210,11 +1282,24 @@ xfs_fs_shutdown(
xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED);
}
+static int
+xfs_fs_show_stats(
+ struct seq_file *m,
+ struct dentry *root)
+{
+ struct xfs_mount *mp = XFS_M(root->d_sb);
+
+ if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT))
+ xfs_zoned_show_stats(m, mp);
+ return 0;
+}
+
static const struct super_operations xfs_super_operations = {
.alloc_inode = xfs_fs_alloc_inode,
.destroy_inode = xfs_fs_destroy_inode,
.dirty_inode = xfs_fs_dirty_inode,
.drop_inode = xfs_fs_drop_inode,
+ .evict_inode = xfs_fs_evict_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
.freeze_fs = xfs_fs_freeze,
@@ -1224,6 +1309,7 @@ static const struct super_operations xfs_super_operations = {
.nr_cached_objects = xfs_fs_nr_cached_objects,
.free_cached_objects = xfs_fs_free_cached_objects,
.shutdown = xfs_fs_shutdown,
+ .show_stats = xfs_fs_show_stats,
};
static int
@@ -1261,6 +1347,42 @@ suffix_kstrtoint(
return ret;
}
+static int
+suffix_kstrtoull(
+ const char *s,
+ unsigned int base,
+ unsigned long long *res)
+{
+ int last, shift_left_factor = 0;
+ unsigned long long _res;
+ char *value;
+ int ret = 0;
+
+ value = kstrdup(s, GFP_KERNEL);
+ if (!value)
+ return -ENOMEM;
+
+ last = strlen(value) - 1;
+ if (value[last] == 'K' || value[last] == 'k') {
+ shift_left_factor = 10;
+ value[last] = '\0';
+ }
+ if (value[last] == 'M' || value[last] == 'm') {
+ shift_left_factor = 20;
+ value[last] = '\0';
+ }
+ if (value[last] == 'G' || value[last] == 'g') {
+ shift_left_factor = 30;
+ value[last] = '\0';
+ }
+
+ if (kstrtoull(value, base, &_res))
+ ret = -EINVAL;
+ kfree(value);
+ *res = _res << shift_left_factor;
+ return ret;
+}
+
static inline void
xfs_fs_warn_deprecated(
struct fs_context *fc,
@@ -1436,6 +1558,23 @@ xfs_fs_parse_param(
xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true);
parsing_mp->m_features |= XFS_FEAT_NOATTR2;
return 0;
+ case Opt_max_open_zones:
+ parsing_mp->m_max_open_zones = result.uint_32;
+ return 0;
+ case Opt_lifetime:
+ parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME;
+ return 0;
+ case Opt_nolifetime:
+ parsing_mp->m_features |= XFS_FEAT_NOLIFETIME;
+ return 0;
+ case Opt_max_atomic_write:
+ if (suffix_kstrtoull(param->string, 10,
+ &parsing_mp->m_awu_max_bytes)) {
+ xfs_warn(parsing_mp,
+ "max atomic write size must be positive integer");
+ return -EINVAL;
+ }
+ return 0;
default:
xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
@@ -1661,8 +1800,12 @@ xfs_fs_fill_super(
#endif
}
- /* Filesystem claims it needs repair, so refuse the mount. */
- if (xfs_has_needsrepair(mp)) {
+ /*
+ * Filesystem claims it needs repair, so refuse the mount unless
+ * norecovery is also specified, in which case the filesystem can
+ * be mounted with no risk of further damage.
+ */
+ if (xfs_has_needsrepair(mp) && !xfs_has_norecovery(mp)) {
xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair.");
error = -EFSCORRUPTED;
goto out_free_sb;
@@ -1776,8 +1919,17 @@ xfs_fs_fill_super(
mp->m_features &= ~XFS_FEAT_DISCARD;
}
- if (xfs_has_metadir(mp))
+ if (xfs_has_zoned(mp)) {
+ if (!xfs_has_metadir(mp)) {
+ xfs_alert(mp,
+ "metadir feature required for zoned realtime devices.");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+ xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED);
+ } else if (xfs_has_metadir(mp)) {
xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR);
+ }
if (xfs_has_reflink(mp)) {
if (xfs_has_realtime(mp) &&
@@ -1789,19 +1941,19 @@ xfs_fs_fill_super(
goto out_filestream_unmount;
}
+ if (xfs_has_zoned(mp)) {
+ xfs_alert(mp,
+ "reflink not compatible with zoned RT device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
+
if (xfs_globals.always_cow) {
xfs_info(mp, "using DEBUG-only always_cow mode.");
mp->m_always_cow = true;
}
}
-
- if (xfs_has_exchange_range(mp))
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE);
-
- if (xfs_has_parent(mp))
- xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR);
-
/*
* If no quota mount options were provided, maybe we'll try to pick
* up the quota accounting and enforcement flags from the ondisk sb.
@@ -1867,6 +2019,19 @@ xfs_remount_rw(
struct xfs_sb *sbp = &mp->m_sb;
int error;
+ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp &&
+ xfs_readonly_buftarg(mp->m_logdev_targp)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited by read-only logdev");
+ return -EACCES;
+ }
+
+ if (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp)) {
+ xfs_warn(mp,
+ "ro->rw transition prohibited by read-only rtdev");
+ return -EACCES;
+ }
+
if (xfs_has_norecovery(mp)) {
xfs_warn(mp,
"ro->rw transition prohibited on norecovery mount");
@@ -1913,6 +2078,9 @@ xfs_remount_rw(
/* Re-enable the background inode inactivation worker. */
xfs_inodegc_start(mp);
+ /* Restart zone reclaim */
+ xfs_zone_gc_start(mp);
+
return 0;
}
@@ -1957,6 +2125,9 @@ xfs_remount_ro(
*/
xfs_inodegc_stop(mp);
+ /* Stop zone reclaim */
+ xfs_zone_gc_stop(mp);
+
/* Free the per-AG metadata reservation pool. */
xfs_fs_unreserve_ag_blocks(mp);
@@ -2006,6 +2177,29 @@ xfs_fs_reconfigure(
if (error)
return error;
+ /* attr2 -> noattr2 */
+ if (xfs_has_noattr2(new_mp)) {
+ if (xfs_has_crc(mp)) {
+ xfs_warn(mp,
+ "attr2 is always enabled for a V5 filesystem - can't be changed.");
+ return -EINVAL;
+ }
+ mp->m_features &= ~XFS_FEAT_ATTR2;
+ mp->m_features |= XFS_FEAT_NOATTR2;
+ } else if (xfs_has_attr2(new_mp)) {
+ /* noattr2 -> attr2 */
+ mp->m_features &= ~XFS_FEAT_NOATTR2;
+ mp->m_features |= XFS_FEAT_ATTR2;
+ }
+
+ /* Validate new max_atomic_write option before making other changes */
+ if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) {
+ error = xfs_set_max_atomic_write_opt(mp,
+ new_mp->m_awu_max_bytes);
+ if (error)
+ return error;
+ }
+
/* inode32 -> inode64 */
if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) {
mp->m_features &= ~XFS_FEAT_SMALL_INUMS;
@@ -2018,6 +2212,17 @@ xfs_fs_reconfigure(
mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount);
}
+ /*
+ * Now that mp has been modified according to the remount options, we
+ * do a final option validation with xfs_finish_flags() just like it is
+ * just like it is done during mount. We cannot use
+ * done during mount. We cannot use xfs_finish_flags() on new_mp as it
+ * contains only the user given options.
+ */
+ error = xfs_finish_flags(mp);
+ if (error)
+ return error;
+
/* ro -> rw */
if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) {
error = xfs_remount_rw(mp);
@@ -2078,6 +2283,7 @@ xfs_init_fs_context(
for (i = 0; i < XG_TYPE_MAX; i++)
xa_init(&mp->m_groups[i].xa);
mutex_init(&mp->m_growlock);
+ mutex_init(&mp->m_metafile_resv_lock);
INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
mp->m_kobj.kobject.kset = xfs_kset;
@@ -2118,7 +2324,8 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME |
+ FS_LBS,
};
MODULE_ALIAS_FS("xfs");
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index 276696a07040..51646f066c4f 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -29,8 +29,6 @@ typedef struct xfs_param {
xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */
xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
- xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
- xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */
xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 60cb5318fdae..7a5c5ef2db92 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -13,6 +13,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_mount.h"
+#include "xfs_zones.h"
struct xfs_sysfs_attr {
struct attribute attr;
@@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = {
};
ATTRIBUTE_GROUPS(xfs_mp);
-const struct kobj_type xfs_mp_ktype = {
+static const struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
.sysfs_ops = &xfs_sysfs_ops,
.default_groups = xfs_mp_groups,
@@ -568,8 +569,8 @@ retry_timeout_seconds_store(
if (val == -1)
cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
else {
- cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC);
- ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX);
+ cfg->retry_timeout = secs_to_jiffies(val);
+ ASSERT(secs_to_jiffies(val) < LONG_MAX);
}
return count;
}
@@ -686,8 +687,8 @@ xfs_error_sysfs_init_class(
if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER)
cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
else
- cfg->retry_timeout = msecs_to_jiffies(
- init[i].retry_timeout * MSEC_PER_SEC);
+ cfg->retry_timeout =
+ secs_to_jiffies(init[i].retry_timeout);
}
return 0;
@@ -701,45 +702,135 @@ out_error:
return error;
}
+static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj)
+{
+ return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj);
+}
+
+static ssize_t
+max_open_zones_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ /* only report the open zones available for user data */
+ return sysfs_emit(buf, "%u\n",
+ zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES);
+}
+XFS_SYSFS_ATTR_RO(max_open_zones);
+
+static ssize_t
+zonegc_low_space_store(
+ struct kobject *kobj,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ unsigned int val;
+
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val > 100)
+ return -EINVAL;
+
+ zoned_to_mp(kobj)->m_zonegc_low_space = val;
+
+ return count;
+}
+
+static ssize_t
+zonegc_low_space_show(
+ struct kobject *kobj,
+ char *buf)
+{
+ return sysfs_emit(buf, "%u\n",
+ zoned_to_mp(kobj)->m_zonegc_low_space);
+}
+XFS_SYSFS_ATTR_RW(zonegc_low_space);
+
+static struct attribute *xfs_zoned_attrs[] = {
+ ATTR_LIST(max_open_zones),
+ ATTR_LIST(zonegc_low_space),
+ NULL,
+};
+ATTRIBUTE_GROUPS(xfs_zoned);
+
+static const struct kobj_type xfs_zoned_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_groups = xfs_zoned_groups,
+};
+
int
-xfs_error_sysfs_init(
+xfs_mount_sysfs_init(
struct xfs_mount *mp)
{
int error;
+ super_set_sysfs_name_id(mp->m_super);
+
+ /* .../xfs/<dev>/ */
+ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
+ NULL, mp->m_super->s_id);
+ if (error)
+ return error;
+
+ /* .../xfs/<dev>/stats/ */
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
+ if (error)
+ goto out_remove_fsdir;
+
/* .../xfs/<dev>/error/ */
error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype,
&mp->m_kobj, "error");
if (error)
- return error;
+ goto out_remove_stats_dir;
+ /* .../xfs/<dev>/error/fail_at_unmount */
error = sysfs_create_file(&mp->m_error_kobj.kobject,
ATTR_LIST(fail_at_unmount));
if (error)
- goto out_error;
+ goto out_remove_error_dir;
/* .../xfs/<dev>/error/metadata/ */
error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA,
"metadata", &mp->m_error_meta_kobj,
xfs_error_meta_init);
if (error)
- goto out_error;
+ goto out_remove_error_dir;
+
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) {
+ /* .../xfs/<dev>/zoned/ */
+ error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype,
+ &mp->m_kobj, "zoned");
+ if (error)
+ goto out_remove_error_dir;
+ }
return 0;
-out_error:
+out_remove_error_dir:
xfs_sysfs_del(&mp->m_error_kobj);
+out_remove_stats_dir:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
+out_remove_fsdir:
+ xfs_sysfs_del(&mp->m_kobj);
return error;
}
void
-xfs_error_sysfs_del(
+xfs_mount_sysfs_del(
struct xfs_mount *mp)
{
struct xfs_error_cfg *cfg;
int i, j;
+ if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp))
+ xfs_sysfs_del(&mp->m_zoned_kobj);
+
for (i = 0; i < XFS_ERR_CLASS_MAX; i++) {
for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) {
cfg = &mp->m_error_cfg[i][j];
@@ -749,6 +840,8 @@ xfs_error_sysfs_del(
}
xfs_sysfs_del(&mp->m_error_meta_kobj);
xfs_sysfs_del(&mp->m_error_kobj);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
+ xfs_sysfs_del(&mp->m_kobj);
}
struct xfs_error_cfg *
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 148893ebfdef..1622fe80ad3e 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -7,7 +7,6 @@
#ifndef __XFS_SYSFS_H__
#define __XFS_SYSFS_H__
-extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern const struct kobj_type xfs_dbg_ktype; /* debug */
extern const struct kobj_type xfs_log_ktype; /* xlog */
extern const struct kobj_type xfs_stats_ktype; /* stats */
@@ -53,7 +52,7 @@ xfs_sysfs_del(
wait_for_completion(&kobj->complete);
}
-int xfs_error_sysfs_init(struct xfs_mount *mp);
-void xfs_error_sysfs_del(struct xfs_mount *mp);
+int xfs_mount_sysfs_init(struct xfs_mount *mp);
+void xfs_mount_sysfs_del(struct xfs_mount *mp);
#endif /* __XFS_SYSFS_H__ */
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8f530e69c18a..a60556dbd172 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -49,6 +49,8 @@
#include "xfs_metafile.h"
#include "xfs_metadir.h"
#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index b29462363b81..ac344e42846c 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -102,6 +102,7 @@ struct xfs_rmap_intent;
struct xfs_refcount_intent;
struct xfs_metadir_update;
struct xfs_rtgroup;
+struct xfs_open_zone;
#define XFS_ATTR_FILTER_FLAGS \
{ XFS_ATTR_ROOT, "ROOT" }, \
@@ -169,6 +170,96 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
+TRACE_EVENT(xfs_calc_atomic_write_unit_max,
+ TP_PROTO(struct xfs_mount *mp, enum xfs_group_type type,
+ unsigned int max_write, unsigned int max_ioend,
+ unsigned int max_gsize, unsigned int awu_max),
+ TP_ARGS(mp, type, max_write, max_ioend, max_gsize, awu_max),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(enum xfs_group_type, type)
+ __field(unsigned int, max_write)
+ __field(unsigned int, max_ioend)
+ __field(unsigned int, max_gsize)
+ __field(unsigned int, awu_max)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->type = type;
+ __entry->max_write = max_write;
+ __entry->max_ioend = max_ioend;
+ __entry->max_gsize = max_gsize;
+ __entry->awu_max = awu_max;
+ ),
+ TP_printk("dev %d:%d %s max_write %u max_ioend %u max_gsize %u awu_max %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->type, XG_TYPE_STRINGS),
+ __entry->max_write,
+ __entry->max_ioend,
+ __entry->max_gsize,
+ __entry->awu_max)
+);
+
+TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks,
+ TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
+ unsigned int step_size, unsigned int logres,
+ unsigned int blockcount),
+ TP_ARGS(mp, per_intent, step_size, logres, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, step_size)
+ __field(unsigned int, logres)
+ __field(unsigned int, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->per_intent = per_intent;
+ __entry->step_size = step_size;
+ __entry->logres = logres;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->per_intent,
+ __entry->step_size,
+ __entry->logres,
+ __entry->blockcount)
+);
+
+TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry,
+ TP_PROTO(struct xfs_mount *mp, unsigned int per_intent,
+ unsigned int step_size, unsigned int blockcount,
+ unsigned int min_logblocks, unsigned int logres),
+ TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(unsigned int, per_intent)
+ __field(unsigned int, step_size)
+ __field(unsigned int, blockcount)
+ __field(unsigned int, min_logblocks)
+ __field(unsigned int, cur_logblocks)
+ __field(unsigned int, logres)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->per_intent = per_intent;
+ __entry->step_size = step_size;
+ __entry->blockcount = blockcount;
+ __entry->min_logblocks = min_logblocks;
+ __entry->cur_logblocks = mp->m_sb.sb_logblocks;
+ __entry->logres = logres;
+ ),
+ TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->per_intent,
+ __entry->step_size,
+ __entry->blockcount,
+ __entry->min_logblocks,
+ __entry->cur_logblocks,
+ __entry->logres)
+);
+
TRACE_EVENT(xlog_intent_recovery_failed,
TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
int error),
@@ -265,6 +356,153 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab);
DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag);
DEFINE_GROUP_REF_EVENT(xfs_group_rele);
+#ifdef CONFIG_XFS_RT
+DECLARE_EVENT_CLASS(xfs_zone_class,
+ TP_PROTO(struct xfs_rtgroup *rtg),
+ TP_ARGS(rtg),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(unsigned int, nr_open)
+ ),
+ TP_fast_assign(
+ struct xfs_mount *mp = rtg_mount(rtg);
+
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->nr_open = mp->m_zone_info->zi_nr_open_zones;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->nr_open)
+);
+
+#define DEFINE_ZONE_EVENT(name) \
+DEFINE_EVENT(xfs_zone_class, name, \
+ TP_PROTO(struct xfs_rtgroup *rtg), \
+ TP_ARGS(rtg))
+DEFINE_ZONE_EVENT(xfs_zone_emptied);
+DEFINE_ZONE_EVENT(xfs_zone_full);
+DEFINE_ZONE_EVENT(xfs_zone_opened);
+DEFINE_ZONE_EVENT(xfs_zone_reset);
+DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened);
+
+TRACE_EVENT(xfs_zone_free_blocks,
+ TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(rtg, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->rgbno,
+ __entry->len)
+);
+
+DECLARE_EVENT_CLASS(xfs_zone_alloc_class,
+ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno,
+ xfs_extlen_t len),
+ TP_ARGS(oz, rgbno, len),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(xfs_rgblock_t, allocated)
+ __field(xfs_rgblock_t, written)
+ __field(xfs_rgblock_t, rgbno)
+ __field(xfs_extlen_t, len)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(oz->oz_rtg);
+ __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks;
+ __entry->allocated = oz->oz_allocated;
+ __entry->written = oz->oz_written;
+ __entry->rgbno = rgbno;
+ __entry->len = len;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x alloced 0x%x written 0x%x rgbno 0x%x len 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->allocated,
+ __entry->written,
+ __entry->rgbno,
+ __entry->len)
+);
+
+#define DEFINE_ZONE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_zone_alloc_class, name, \
+ TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \
+ xfs_extlen_t len), \
+ TP_ARGS(oz, rgbno, len))
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_skip_blocks);
+DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks);
+
+TRACE_EVENT(xfs_zone_gc_select_victim,
+ TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket),
+ TP_ARGS(rtg, bucket),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgno)
+ __field(xfs_rgblock_t, used)
+ __field(unsigned int, bucket)
+ ),
+ TP_fast_assign(
+ __entry->dev = rtg_mount(rtg)->m_super->s_dev;
+ __entry->rgno = rtg_rgno(rtg);
+ __entry->used = rtg_rmap(rtg)->i_used_blocks;
+ __entry->bucket = bucket;
+ ),
+ TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgno,
+ __entry->used,
+ __entry->bucket)
+);
+
+TRACE_EVENT(xfs_zones_mount,
+ TP_PROTO(struct xfs_mount *mp),
+ TP_ARGS(mp),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_rgnumber_t, rgcount)
+ __field(uint32_t, blocks)
+ __field(unsigned int, max_open_zones)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->rgcount = mp->m_sb.sb_rgcount;
+ __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks;
+ __entry->max_open_zones = mp->m_max_open_zones;
+ ),
+ TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->rgcount,
+ __entry->blocks,
+ __entry->max_open_zones)
+);
+#endif /* CONFIG_XFS_RT */
+
TRACE_EVENT(xfs_inodegc_worker,
TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
TP_ARGS(mp, shrinker_hits),
@@ -538,13 +776,16 @@ DEFINE_BUF_EVENT(xfs_buf_iowait_done);
DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_delwri_pushbuf);
DEFINE_BUF_EVENT(xfs_buf_get_uncached);
DEFINE_BUF_EVENT(xfs_buf_item_relse);
DEFINE_BUF_EVENT(xfs_buf_iodone_async);
DEFINE_BUF_EVENT(xfs_buf_error_relse);
DEFINE_BUF_EVENT(xfs_buf_drain_buftarg);
DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+DEFINE_BUF_EVENT(xfs_buf_backing_folio);
+DEFINE_BUF_EVENT(xfs_buf_backing_kmem);
+DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc);
+DEFINE_BUF_EVENT(xfs_buf_backing_fallback);
/* not really buffer traces, but the buf provides useful information */
DEFINE_BUF_EVENT(xfs_btree_corrupt);
@@ -593,6 +834,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \
DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead);
TRACE_EVENT(xfs_buf_ioerror,
TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip),
@@ -838,7 +1080,9 @@ DEFINE_INODE_EVENT(xfs_get_acl);
#endif
DEFINE_INODE_EVENT(xfs_vm_bmap);
DEFINE_INODE_EVENT(xfs_file_ioctl);
+#ifdef CONFIG_COMPAT
DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+#endif
DEFINE_INODE_EVENT(xfs_ioctl_setattr);
DEFINE_INODE_EVENT(xfs_dir_fsync);
DEFINE_INODE_EVENT(xfs_file_fsync);
@@ -902,6 +1146,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
__field(xfs_ino_t, ino)
__field(int, count)
__field(int, pincount)
+ __field(unsigned long, iflags)
__field(unsigned long, caller_ip)
),
TP_fast_assign(
@@ -909,13 +1154,15 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
__entry->ino = ip->i_ino;
__entry->count = atomic_read(&VFS_I(ip)->i_count);
__entry->pincount = atomic_read(&ip->i_pincount);
+ __entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
),
- TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx count %d pincount %d iflags 0x%lx caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->count,
__entry->pincount,
+ __entry->iflags,
(char *)__entry->caller_ip)
)
@@ -1005,6 +1252,8 @@ DEFINE_IREF_EVENT(xfs_irele);
DEFINE_IREF_EVENT(xfs_inode_pin);
DEFINE_IREF_EVENT(xfs_inode_unpin);
DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+DEFINE_IREF_EVENT(xfs_inode_push_pinned);
+DEFINE_IREF_EVENT(xfs_inode_push_stale);
DECLARE_EVENT_CLASS(xfs_namespace_class,
TP_PROTO(struct xfs_inode *dp, const struct xfs_name *name),
@@ -1148,7 +1397,6 @@ DEFINE_EVENT(xfs_dquot_class, name, \
TP_ARGS(dqp))
DEFINE_DQUOT_EVENT(xfs_dqadjust);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_busy);
DEFINE_DQUOT_EVENT(xfs_dqreclaim_done);
DEFINE_DQUOT_EVENT(xfs_dqattach_found);
@@ -1353,7 +1601,6 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
-DEFINE_LOGGRANT_EVENT(xfs_log_cil_return);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -1409,6 +1656,8 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_mark);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_skip);
DEFINE_LOG_ITEM_EVENT(xfs_cil_whiteout_unpin);
+DEFINE_LOG_ITEM_EVENT(xlog_ail_insert_abort);
+DEFINE_LOG_ITEM_EVENT(xfs_trans_free_abort);
DECLARE_EVENT_CLASS(xfs_ail_class,
TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
@@ -1505,6 +1754,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+TRACE_EVENT(xfs_iomap_atomic_write_cow,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_off_t, offset)
+ __field(ssize_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->offset,
+ __entry->count)
+)
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
@@ -1592,9 +1863,8 @@ DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write);
-DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten);
-DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append);
DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read);
+DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
@@ -1625,31 +1895,6 @@ DEFINE_EVENT(xfs_itrunc_class, name, \
DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
-TRACE_EVENT(xfs_pagecache_inval,
- TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
- TP_ARGS(ip, start, finish),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fsize_t, size)
- __field(xfs_off_t, start)
- __field(xfs_off_t, finish)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->size = ip->i_disk_size;
- __entry->start = start;
- __entry->finish = finish;
- ),
- TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->size,
- __entry->start,
- __entry->finish)
-);
-
TRACE_EVENT(xfs_bunmap,
TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len,
int flags, unsigned long caller_ip),
@@ -1995,14 +2240,12 @@ DEFINE_EVENT(xfs_alloc_class, name, \
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
DEFINE_ALLOC_EVENT(xfs_alloc_cur);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_right);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_left);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup);
DEFINE_ALLOC_EVENT(xfs_alloc_cur_lookup_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
@@ -2197,13 +2440,8 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
DEFINE_ATTR_EVENT(xfs_attr_node_addname);
DEFINE_ATTR_EVENT(xfs_attr_node_get);
DEFINE_ATTR_EVENT(xfs_attr_node_replace);
-DEFINE_ATTR_EVENT(xfs_attr_node_removename);
-
-DEFINE_ATTR_EVENT(xfs_attr_fillstate);
-DEFINE_ATTR_EVENT(xfs_attr_refillstate);
DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
-DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
#define DEFINE_DA_EVENT(name) \
DEFINE_EVENT(xfs_da_class, name, \
@@ -2621,7 +2859,6 @@ DEFINE_EVENT(xfs_rtdiscard_class, name, \
TP_ARGS(mp, rtbno, len))
DEFINE_RTDISCARD_EVENT(xfs_discard_rtextent);
DEFINE_RTDISCARD_EVENT(xfs_discard_rttoosmall);
-DEFINE_RTDISCARD_EVENT(xfs_discard_rtrelax);
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
@@ -3938,36 +4175,6 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
-/* dedupe tracepoints */
-DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
-
-/* ioctl tracepoints */
-TRACE_EVENT(xfs_ioctl_clone,
- TP_PROTO(struct inode *src, struct inode *dest),
- TP_ARGS(src, dest),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(unsigned long, src_ino)
- __field(loff_t, src_isize)
- __field(unsigned long, dest_ino)
- __field(loff_t, dest_isize)
- ),
- TP_fast_assign(
- __entry->dev = src->i_sb->s_dev;
- __entry->src_ino = src->i_ino;
- __entry->src_isize = i_size_read(src);
- __entry->dest_ino = dest->i_ino;
- __entry->dest_isize = i_size_read(dest);
- ),
- TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->src_ino,
- __entry->src_isize,
- __entry->dest_ino,
- __entry->dest_isize)
-);
-
/* unshare tracepoints */
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
@@ -3975,13 +4182,13 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
/* copy on write */
DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
-DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
@@ -4759,7 +4966,6 @@ DEFINE_ICLOG_EVENT(xlog_iclog_switch);
DEFINE_ICLOG_EVENT(xlog_iclog_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_syncing);
DEFINE_ICLOG_EVENT(xlog_iclog_sync_done);
-DEFINE_ICLOG_EVENT(xlog_iclog_want_sync);
DEFINE_ICLOG_EVENT(xlog_iclog_wait_on);
DEFINE_ICLOG_EVENT(xlog_iclog_write);
@@ -4808,7 +5014,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_leaf_addname_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return);
-DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
@@ -5605,11 +5810,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup);
/* metadata inode space reservations */
DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
- TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len),
- TP_ARGS(ip, len),
+ TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len),
+ TP_ARGS(mp, len),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(xfs_ino_t, ino)
__field(unsigned long long, freeblks)
__field(unsigned long long, reserved)
__field(unsigned long long, asked)
@@ -5617,19 +5821,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
__field(unsigned long long, len)
),
TP_fast_assign(
- struct xfs_mount *mp = ip->i_mount;
-
__entry->dev = mp->m_super->s_dev;
- __entry->ino = ip->i_ino;
- __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks);
- __entry->reserved = ip->i_delayed_blks;
- __entry->asked = ip->i_meta_resv_asked;
- __entry->used = ip->i_nblocks;
+ __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS);
+ __entry->reserved = mp->m_metafile_resv_avail;
+ __entry->asked = mp->m_metafile_resv_target;
+ __entry->used = mp->m_metafile_resv_used;
__entry->len = len;
),
- TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu",
+ TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
__entry->freeblks,
__entry->reserved,
__entry->asked,
@@ -5638,14 +5838,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class,
)
#define DEFINE_METAFILE_RESV_EVENT(name) \
DEFINE_EVENT(xfs_metafile_resv_class, name, \
- TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \
- TP_ARGS(ip, len))
+ TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \
+ TP_ARGS(mp, len))
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space);
DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical);
-DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error);
+DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error);
#ifdef CONFIG_XFS_RT
TRACE_EVENT(xfs_growfs_check_rtgeom,
@@ -5668,6 +5868,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom,
);
#endif /* CONFIG_XFS_RT */
+TRACE_DEFINE_ENUM(XC_FREE_BLOCKS);
+TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS);
+TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE);
+
+DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class,
+ TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr,
+ uint64_t delta, unsigned long caller_ip),
+ TP_ARGS(mp, ctr, delta, caller_ip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(enum xfs_free_counter, ctr)
+ __field(uint64_t, delta)
+ __field(uint64_t, avail)
+ __field(uint64_t, total)
+ __field(unsigned long, caller_ip)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->ctr = ctr;
+ __entry->delta = delta;
+ __entry->avail = mp->m_free[ctr].res_avail;
+ __entry->total = mp->m_free[ctr].res_total;
+ __entry->caller_ip = caller_ip;
+ ),
+ TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR),
+ __entry->delta,
+ __entry->avail,
+ __entry->total,
+ (char *)__entry->caller_ip)
+)
+#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \
+DEFINE_EVENT(xfs_freeblocks_resv_class, name, \
+ TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \
+ uint64_t delta, unsigned long caller_ip), \
+ TP_ARGS(mp, ctr, delta, caller_ip))
+DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved);
+DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc);
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c6657072361a..575e7028f423 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,18 +134,14 @@ xfs_trans_dup(
}
/*
- * This is called to reserve free disk blocks and log space for the
- * given transaction. This must be done before allocating any resources
- * within the transaction.
+ * This is called to reserve free disk blocks and log space for the given
+ * transaction before allocating any resources within the transaction.
*
* This will return ENOSPC if there are not enough blocks available.
* It will sleep waiting for available log space.
- * The only valid value for the flags parameter is XFS_RES_LOG_PERM, which
- * is used by long running transactions. If any one of the reservations
- * fails then they will all be backed out.
*
- * This does not do quota reservations. That typically is done by the
- * caller afterwards.
+ * This does not do quota reservations. That typically is done by the caller
+ * afterwards.
*/
static int
xfs_trans_reserve(
@@ -158,10 +154,12 @@ xfs_trans_reserve(
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+ ASSERT(resp->tr_logres > 0);
+
/*
- * Attempt to reserve the needed disk blocks by decrementing
- * the number needed from the number available. This will
- * fail if the count would go below zero.
+ * Attempt to reserve the needed disk blocks by decrementing the number
+ * needed from the number available. This will fail if the count would
+ * go below zero.
*/
if (blocks > 0) {
error = xfs_dec_fdblocks(mp, blocks, rsvd);
@@ -173,42 +171,20 @@ xfs_trans_reserve(
/*
* Reserve the log space needed for this transaction.
*/
- if (resp->tr_logres > 0) {
- bool permanent = false;
-
- ASSERT(tp->t_log_res == 0 ||
- tp->t_log_res == resp->tr_logres);
- ASSERT(tp->t_log_count == 0 ||
- tp->t_log_count == resp->tr_logcount);
-
- if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) {
- tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
- permanent = true;
- } else {
- ASSERT(tp->t_ticket == NULL);
- ASSERT(!(tp->t_flags & XFS_TRANS_PERM_LOG_RES));
- }
-
- if (tp->t_ticket != NULL) {
- ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
- error = xfs_log_regrant(mp, tp->t_ticket);
- } else {
- error = xfs_log_reserve(mp, resp->tr_logres,
- resp->tr_logcount,
- &tp->t_ticket, permanent);
- }
-
- if (error)
- goto undo_blocks;
+ if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES)
+ tp->t_flags |= XFS_TRANS_PERM_LOG_RES;
+ error = xfs_log_reserve(mp, resp->tr_logres, resp->tr_logcount,
+ &tp->t_ticket, (tp->t_flags & XFS_TRANS_PERM_LOG_RES));
+ if (error)
+ goto undo_blocks;
- tp->t_log_res = resp->tr_logres;
- tp->t_log_count = resp->tr_logcount;
- }
+ tp->t_log_res = resp->tr_logres;
+ tp->t_log_count = resp->tr_logcount;
/*
- * Attempt to reserve the needed realtime extents by decrementing
- * the number needed from the number available. This will
- * fail if the count would go below zero.
+ * Attempt to reserve the needed realtime extents by decrementing the
+ * number needed from the number available. This will fail if the
+ * count would go below zero.
*/
if (rtextents > 0) {
error = xfs_dec_frextents(mp, rtextents);
@@ -221,18 +197,11 @@ xfs_trans_reserve(
return 0;
- /*
- * Error cases jump to one of these labels to undo any
- * reservations which have already been performed.
- */
undo_log:
- if (resp->tr_logres > 0) {
- xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
- tp->t_ticket = NULL;
- tp->t_log_res = 0;
- tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
- }
-
+ xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
+ tp->t_ticket = NULL;
+ tp->t_log_res = 0;
+ tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
undo_blocks:
if (blocks > 0) {
xfs_add_fdblocks(mp, blocks);
@@ -241,6 +210,28 @@ undo_blocks:
return error;
}
+static struct xfs_trans *
+__xfs_trans_alloc(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ struct xfs_trans *tp;
+
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || xfs_has_lazysbcount(mp));
+
+ tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
+ if (!(flags & XFS_TRANS_NO_WRITECOUNT))
+ sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
+ tp->t_flags = flags;
+ tp->t_mountp = mp;
+ INIT_LIST_HEAD(&tp->t_items);
+ INIT_LIST_HEAD(&tp->t_busy);
+ INIT_LIST_HEAD(&tp->t_dfops);
+ tp->t_highest_agno = NULLAGNUMBER;
+ return tp;
+}
+
int
xfs_trans_alloc(
struct xfs_mount *mp,
@@ -254,33 +245,16 @@ xfs_trans_alloc(
bool want_retry = true;
int error;
+ ASSERT(resp->tr_logres > 0);
+
/*
* Allocate the handle before we do our freeze accounting and setting up
* GFP_NOFS allocation context so that we avoid lockdep false positives
* by doing GFP_KERNEL allocations inside sb_start_intwrite().
*/
retry:
- tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL);
- if (!(flags & XFS_TRANS_NO_WRITECOUNT))
- sb_start_intwrite(mp->m_super);
- xfs_trans_set_context(tp);
-
- /*
- * Zero-reservation ("empty") transactions can't modify anything, so
- * they're allowed to run while we're frozen.
- */
- WARN_ON(resp->tr_logres > 0 &&
- mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
- ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
- xfs_has_lazysbcount(mp));
-
- tp->t_flags = flags;
- tp->t_mountp = mp;
- INIT_LIST_HEAD(&tp->t_items);
- INIT_LIST_HEAD(&tp->t_busy);
- INIT_LIST_HEAD(&tp->t_dfops);
- tp->t_highest_agno = NULLAGNUMBER;
-
+ tp = __xfs_trans_alloc(mp, flags);
+ WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
error = xfs_trans_reserve(tp, resp, blocks, rtextents);
if (error == -ENOSPC && want_retry) {
xfs_trans_cancel(tp);
@@ -324,14 +298,11 @@ retry:
* where we can be grabbing buffers at the same time that freeze is trying to
* drain the buffer LRU list.
*/
-int
+struct xfs_trans *
xfs_trans_alloc_empty(
- struct xfs_mount *mp,
- struct xfs_trans **tpp)
+ struct xfs_mount *mp)
{
- struct xfs_trans_res resv = {0};
-
- return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+ return __xfs_trans_alloc(mp, XFS_TRANS_NO_WRITECOUNT);
}
/*
@@ -742,8 +713,10 @@ xfs_trans_free_items(
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
- if (abort)
+ if (abort) {
+ trace_xfs_trans_free_abort(lip);
set_bit(XFS_LI_ABORTED, &lip->li_flags);
+ }
if (lip->li_ops->iop_release)
lip->li_ops->iop_release(lip);
}
@@ -1024,51 +997,57 @@ xfs_trans_cancel(
}
/*
- * Roll from one trans in the sequence of PERMANENT transactions to
- * the next: permanent transactions are only flushed out when
- * committed with xfs_trans_commit(), but we still want as soon
- * as possible to let chunks of it go to the log. So we commit the
- * chunk we've been working on and get a new transaction to continue.
+ * Roll from one trans in the sequence of PERMANENT transactions to the next:
+ * permanent transactions are only flushed out when committed with
+ * xfs_trans_commit(), but we still want as soon as possible to let chunks of it
+ * go to the log. So we commit the chunk we've been working on and get a new
+ * transaction to continue.
*/
int
xfs_trans_roll(
struct xfs_trans **tpp)
{
- struct xfs_trans *trans = *tpp;
- struct xfs_trans_res tres;
+ struct xfs_trans *tp = *tpp;
+ unsigned int log_res = tp->t_log_res;
+ unsigned int log_count = tp->t_log_count;
int error;
- trace_xfs_trans_roll(trans, _RET_IP_);
+ trace_xfs_trans_roll(tp, _RET_IP_);
+
+ ASSERT(log_res > 0);
/*
* Copy the critical parameters from one trans to the next.
*/
- tres.tr_logres = trans->t_log_res;
- tres.tr_logcount = trans->t_log_count;
-
- *tpp = xfs_trans_dup(trans);
+ *tpp = xfs_trans_dup(tp);
/*
* Commit the current transaction.
- * If this commit failed, then it'd just unlock those items that
- * are not marked ihold. That also means that a filesystem shutdown
- * is in progress. The caller takes the responsibility to cancel
- * the duplicate transaction that gets returned.
+ *
+ * If this commit failed, then it'd just unlock those items that are not
+ * marked ihold. That also means that a filesystem shutdown is in
+ * progress. The caller takes the responsibility to cancel the
+ * duplicate transaction that gets returned.
*/
- error = __xfs_trans_commit(trans, true);
+ error = __xfs_trans_commit(tp, true);
if (error)
return error;
/*
* Reserve space in the log for the next transaction.
- * This also pushes items in the "AIL", the list of logged items,
- * out to disk if they are taking up space at the tail of the log
- * that we want to use. This requires that either nothing be locked
- * across this call, or that anything that is locked be logged in
- * the prior and the next transactions.
+ *
+ * This also pushes items in the AIL out to disk if they are taking up
+ * space at the tail of the log that we want to use. This requires that
+ * either nothing be locked across this call, or that anything that is
+ * locked be logged in the prior and the next transactions.
*/
- tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
- return xfs_trans_reserve(*tpp, &tres, 0, 0);
+ tp = *tpp;
+ error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
+ if (error)
+ return error;
+ tp->t_log_res = log_res;
+ tp->t_log_count = log_count;
+ return 0;
}
/*
@@ -1144,9 +1123,18 @@ xfs_trans_reserve_more(
unsigned int blocks,
unsigned int rtextents)
{
- struct xfs_trans_res resv = { };
-
- return xfs_trans_reserve(tp, &resv, blocks, rtextents);
+ bool rsvd = tp->t_flags & XFS_TRANS_RESERVE;
+
+ if (blocks && xfs_dec_fdblocks(tp->t_mountp, blocks, rsvd))
+ return -ENOSPC;
+ if (rtextents && xfs_dec_frextents(tp->t_mountp, rtextents)) {
+ if (blocks)
+ xfs_add_fdblocks(tp->t_mountp, blocks);
+ return -ENOSPC;
+ }
+ tp->t_blk_res += blocks;
+ tp->t_rtx_res += rtextents;
+ return 0;
}
/*
@@ -1161,14 +1149,13 @@ xfs_trans_reserve_more_inode(
unsigned int rblocks,
bool force_quota)
{
- struct xfs_trans_res resv = { };
struct xfs_mount *mp = ip->i_mount;
unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
int error;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
+ error = xfs_trans_reserve_more(tp, dblocks, rtx);
if (error)
return error;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2b366851e9a4..7fb860f645a3 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -15,7 +15,6 @@ struct xfs_efd_log_item;
struct xfs_efi_log_item;
struct xfs_inode;
struct xfs_item_ops;
-struct xfs_log_iovec;
struct xfs_mount;
struct xfs_trans;
struct xfs_trans_res;
@@ -168,8 +167,7 @@ int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
struct xfs_trans **tpp);
int xfs_trans_reserve_more(struct xfs_trans *tp,
unsigned int blocks, unsigned int rtextents);
-int xfs_trans_alloc_empty(struct xfs_mount *mp,
- struct xfs_trans **tpp);
+struct xfs_trans *xfs_trans_alloc_empty(struct xfs_mount *mp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
int xfs_trans_get_buf_map(struct xfs_trans *tp, struct xfs_buftarg *target,
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0fcb1828e598..67c328d23e4a 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -315,7 +315,7 @@ xfs_ail_splice(
}
/*
- * Delete the given item from the AIL. Return a pointer to the item.
+ * Delete the given item from the AIL.
*/
static void
xfs_ail_delete(
@@ -777,26 +777,28 @@ xfs_ail_update_finish(
}
/*
- * xfs_trans_ail_update - bulk AIL insertion operation.
+ * xfs_trans_ail_update_bulk - bulk AIL insertion operation.
*
- * @xfs_trans_ail_update takes an array of log items that all need to be
+ * @xfs_trans_ail_update_bulk takes an array of log items that all need to be
* positioned at the same LSN in the AIL. If an item is not in the AIL, it will
- * be added. Otherwise, it will be repositioned by removing it and re-adding
- * it to the AIL. If we move the first item in the AIL, update the log tail to
- * match the new minimum LSN in the AIL.
+ * be added. Otherwise, it will be repositioned by removing it and re-adding
+ * it to the AIL.
*
- * This function takes the AIL lock once to execute the update operations on
- * all the items in the array, and as such should not be called with the AIL
- * lock held. As a result, once we have the AIL lock, we need to check each log
- * item LSN to confirm it needs to be moved forward in the AIL.
+ * If we move the first item in the AIL, update the log tail to match the new
+ * minimum LSN in the AIL.
*
- * To optimise the insert operation, we delete all the items from the AIL in
- * the first pass, moving them into a temporary list, then splice the temporary
- * list into the correct position in the AIL. This avoids needing to do an
- * insert operation on every item.
+ * This function should be called with the AIL lock held.
*
- * This function must be called with the AIL lock held. The lock is dropped
- * before returning.
+ * To optimise the insert operation, we add all items to a temporary list, then
+ * splice this list into the correct position in the AIL.
+ *
+ * Items that are already in the AIL are first deleted from their current
+ * location before being added to the temporary list.
+ *
+ * This avoids needing to do an insert operation on every item.
+ *
+ * The AIL lock is dropped by xfs_ail_update_finish() before returning to
+ * the caller.
*/
void
xfs_trans_ail_update_bulk(
@@ -909,10 +911,9 @@ xfs_trans_ail_delete(
return;
}
- /* xfs_ail_update_finish() drops the AIL lock */
- xfs_clear_li_failed(lip);
+ clear_bit(XFS_LI_FAILED, &lip->li_flags);
tail_lsn = xfs_ail_delete_one(ailp, lip);
- xfs_ail_update_finish(ailp, tail_lsn);
+ xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */
}
int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index bd841df93021..f945f0450b16 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn(
}
#endif
-static inline void
-xfs_clear_li_failed(
- struct xfs_log_item *lip)
-{
- struct xfs_buf *bp = lip->li_buf;
-
- ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags));
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) {
- lip->li_buf = NULL;
- xfs_buf_rele(bp);
- }
-}
-
-static inline void
-xfs_set_li_failed(
- struct xfs_log_item *lip,
- struct xfs_buf *bp)
-{
- lockdep_assert_held(&lip->li_ailp->ail_lock);
-
- if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) {
- xfs_buf_hold(bp);
- lip->li_buf = bp;
- }
-}
-
#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 0f641a9091ec..ac5cecec9aa1 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -243,7 +243,7 @@ __xfs_xattr_put_listent(
offset = context->buffer + context->count;
memcpy(offset, prefix, prefix_len);
offset += prefix_len;
- strncpy(offset, (char *)name, namelen); /* real name */
+ memcpy(offset, (char *)name, namelen); /* real name */
offset += namelen;
*offset = '\0';
diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c
new file mode 100644
index 000000000000..f8bd6d741755
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.c
@@ -0,0 +1,1349 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_error.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_iomap.h"
+#include "xfs_trans.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_refcount.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+#include "xfs_mru_cache.h"
+
+void
+xfs_open_zone_put(
+ struct xfs_open_zone *oz)
+{
+ if (atomic_dec_and_test(&oz->oz_ref)) {
+ xfs_rtgroup_rele(oz->oz_rtg);
+ kfree(oz);
+ }
+}
+
+static inline uint32_t
+xfs_zone_bucket(
+ struct xfs_mount *mp,
+ uint32_t used_blocks)
+{
+ return XFS_ZONE_USED_BUCKETS * used_blocks /
+ mp->m_groups[XG_TYPE_RTG].blocks;
+}
+
+static inline void
+xfs_zone_add_to_bucket(
+ struct xfs_zone_info *zi,
+ xfs_rgnumber_t rgno,
+ uint32_t to_bucket)
+{
+ __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
+ zi->zi_used_bucket_entries[to_bucket]++;
+}
+
+static inline void
+xfs_zone_remove_from_bucket(
+ struct xfs_zone_info *zi,
+ xfs_rgnumber_t rgno,
+ uint32_t from_bucket)
+{
+ __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
+ zi->zi_used_bucket_entries[from_bucket]--;
+}
+
+static void
+xfs_zone_account_reclaimable(
+ struct xfs_rtgroup *rtg,
+ uint32_t freed)
+{
+ struct xfs_group *xg = &rtg->rtg_group;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
+ xfs_rgnumber_t rgno = rtg_rgno(rtg);
+ uint32_t from_bucket = xfs_zone_bucket(mp, used + freed);
+ uint32_t to_bucket = xfs_zone_bucket(mp, used);
+ bool was_full = (used + freed == rtg_blocks(rtg));
+
+ /*
+ * This can be called from log recovery, where the zone_info structure
+ * hasn't been allocated yet. Skip all work as xfs_mount_zones will
+ * add the zones to the right buckets before the file systems becomes
+ * active.
+ */
+ if (!zi)
+ return;
+
+ if (!used) {
+ /*
+ * The zone is now empty, remove it from the bottom bucket and
+ * trigger a reset.
+ */
+ trace_xfs_zone_emptied(rtg);
+
+ if (!was_full)
+ xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ if (!was_full)
+ xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ spin_lock(&zi->zi_reset_list_lock);
+ xg->xg_next_reset = zi->zi_reset_list;
+ zi->zi_reset_list = xg;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (zi->zi_gc_thread)
+ wake_up_process(zi->zi_gc_thread);
+ } else if (was_full) {
+ /*
+ * The zone transitioned from full, mark it up as reclaimable
+ * and wake up GC which might be waiting for zones to reclaim.
+ */
+ spin_lock(&zi->zi_used_buckets_lock);
+ xfs_zone_add_to_bucket(zi, rgno, to_bucket);
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
+ if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
+ wake_up_process(zi->zi_gc_thread);
+ } else if (to_bucket != from_bucket) {
+ /*
+ * Move the zone to a new bucket if it dropped below the
+ * threshold.
+ */
+ spin_lock(&zi->zi_used_buckets_lock);
+ xfs_zone_add_to_bucket(zi, rgno, to_bucket);
+ xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
+ spin_unlock(&zi->zi_used_buckets_lock);
+ }
+}
+
+static void
+xfs_open_zone_mark_full(
+ struct xfs_open_zone *oz)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
+
+ trace_xfs_zone_full(rtg);
+
+ WRITE_ONCE(rtg->rtg_open_zone, NULL);
+
+ spin_lock(&zi->zi_open_zones_lock);
+ if (oz->oz_is_gc) {
+ ASSERT(current == zi->zi_gc_thread);
+ zi->zi_open_gc_zone = NULL;
+ } else {
+ zi->zi_nr_open_zones--;
+ list_del_init(&oz->oz_entry);
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+ xfs_open_zone_put(oz);
+
+ wake_up_all(&zi->zi_zone_wait);
+ if (used < rtg_blocks(rtg))
+ xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
+}
+
+static void
+xfs_zone_record_blocks(
+ struct xfs_trans *tp,
+ struct xfs_open_zone *oz,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_inode *rmapip = rtg_rmap(rtg);
+
+ trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
+ rmapip->i_used_blocks += len;
+ ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+}
+
+/*
+ * Called for blocks that have been written to disk, but not actually linked to
+ * an inode, which can happen when garbage collection races with user data
+ * writes to a file.
+ */
+static void
+xfs_zone_skip_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t len)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+
+ trace_xfs_zone_skip_blocks(oz, 0, len);
+
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ oz->oz_written += len;
+ if (oz->oz_written == rtg_blocks(rtg))
+ xfs_open_zone_mark_full(oz);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+
+ xfs_add_frextents(rtg_mount(rtg), len);
+}
+
+static int
+xfs_zoned_map_extent(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *new,
+ struct xfs_open_zone *oz,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_bmbt_irec data;
+ int nmaps = 1;
+ int error;
+
+ /* Grab the corresponding mapping in the data fork. */
+ error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
+ &nmaps, 0);
+ if (error)
+ return error;
+
+ /*
+ * Cap the update to the existing extent in the data fork because we can
+ * only overwrite one extent at a time.
+ */
+ ASSERT(new->br_blockcount >= data.br_blockcount);
+ new->br_blockcount = data.br_blockcount;
+
+ /*
+ * If a data write raced with this GC write, keep the existing data in
+ * the data fork, mark our newly written GC extent as reclaimable, then
+ * move on to the next extent.
+ */
+ if (old_startblock != NULLFSBLOCK &&
+ old_startblock != data.br_startblock)
+ goto skip;
+
+ trace_xfs_reflink_cow_remap_from(ip, new);
+ trace_xfs_reflink_cow_remap_to(ip, &data);
+
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_REFLINK_END_COW_CNT);
+ if (error)
+ return error;
+
+ if (data.br_startblock != HOLESTARTBLOCK) {
+ ASSERT(data.br_startblock != DELAYSTARTBLOCK);
+ ASSERT(!isnullstartblock(data.br_startblock));
+
+ xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
+ if (xfs_is_reflink_inode(ip)) {
+ xfs_refcount_decrease_extent(tp, true, &data);
+ } else {
+ error = xfs_free_extent_later(tp, data.br_startblock,
+ data.br_blockcount, NULL,
+ XFS_AG_RESV_NONE,
+ XFS_FREE_EXTENT_REALTIME);
+ if (error)
+ return error;
+ }
+ }
+
+ xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);
+
+ /* Map the new blocks into the data fork. */
+ xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
+ return 0;
+
+skip:
+ trace_xfs_reflink_cow_remap_skip(ip, new);
+ xfs_zone_skip_blocks(oz, new->br_blockcount);
+ return 0;
+}
+
+int
+xfs_zoned_end_io(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ xfs_daddr_t daddr,
+ struct xfs_open_zone *oz,
+ xfs_fsblock_t old_startblock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+ struct xfs_bmbt_irec new = {
+ .br_startoff = XFS_B_TO_FSBT(mp, offset),
+ .br_startblock = xfs_daddr_to_rtb(mp, daddr),
+ .br_state = XFS_EXT_NORM,
+ };
+ unsigned int resblks =
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ struct xfs_trans *tp;
+ int error;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ while (new.br_startoff < end_fsb) {
+ new.br_blockcount = end_fsb - new.br_startoff;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ new.br_startoff += new.br_blockcount;
+ new.br_startblock += new.br_blockcount;
+ if (old_startblock != NULLFSBLOCK)
+ old_startblock += new.br_blockcount;
+ }
+
+ return 0;
+}
+
+/*
+ * "Free" blocks allocated in a zone.
+ *
+ * Just decrement the used blocks counter and report the space as freed.
+ */
+int
+xfs_zone_free_blocks(
+ struct xfs_trans *tp,
+ struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno,
+ xfs_filblks_t len)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_inode *rmapip = rtg_rmap(rtg);
+
+ xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);
+
+ if (len > rmapip->i_used_blocks) {
+ xfs_err(mp,
+"trying to free more blocks (%lld) than used counter (%u).",
+ len, rmapip->i_used_blocks);
+ ASSERT(len <= rmapip->i_used_blocks);
+ xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
+ }
+
+ trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);
+
+ rmapip->i_used_blocks -= len;
+ /*
+ * Don't add open zones to the reclaimable buckets. The I/O completion
+ * for writing the last block will take care of accounting for already
+ * unused blocks instead.
+ */
+ if (!READ_ONCE(rtg->rtg_open_zone))
+ xfs_zone_account_reclaimable(rtg, len);
+ xfs_add_frextents(mp, len);
+ xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/*
+ * Check if the zone containing the data just before the offset we are
+ * writing to is still open and has space.
+ */
+static struct xfs_open_zone *
+xfs_last_used_zone(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset);
+ struct xfs_rtgroup *rtg = NULL;
+ struct xfs_open_zone *oz = NULL;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb,
+ &icur, &got)) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return NULL;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock));
+ if (!rtg)
+ return NULL;
+
+ xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+ oz = READ_ONCE(rtg->rtg_open_zone);
+ if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref)))
+ oz = NULL;
+ xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED);
+
+ xfs_rtgroup_rele(rtg);
+ return oz;
+}
+
+static struct xfs_group *
+xfs_find_free_zone(
+ struct xfs_mount *mp,
+ unsigned long start,
+ unsigned long end)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
+ struct xfs_group *xg;
+
+ xas_lock(&xas);
+ xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
+ if (atomic_inc_not_zero(&xg->xg_active_ref))
+ goto found;
+ xas_unlock(&xas);
+ return NULL;
+
+found:
+ xas_clear_mark(&xas, XFS_RTG_FREE);
+ atomic_dec(&zi->zi_nr_free_zones);
+ zi->zi_free_zone_cursor = xg->xg_gno;
+ xas_unlock(&xas);
+ return xg;
+}
+
+static struct xfs_open_zone *
+xfs_init_open_zone(
+ struct xfs_rtgroup *rtg,
+ xfs_rgblock_t write_pointer,
+ enum rw_hint write_hint,
+ bool is_gc)
+{
+ struct xfs_open_zone *oz;
+
+ oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
+ spin_lock_init(&oz->oz_alloc_lock);
+ atomic_set(&oz->oz_ref, 1);
+ oz->oz_rtg = rtg;
+ oz->oz_allocated = write_pointer;
+ oz->oz_written = write_pointer;
+ oz->oz_write_hint = write_hint;
+ oz->oz_is_gc = is_gc;
+
+ /*
+ * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
+ * inode, but we don't really want to take that here because we are
+ * under the zone_list_lock. Ensure the pointer is only set for a fully
+ * initialized open zone structure so that a racy lookup finding it is
+ * fine.
+ */
+ WRITE_ONCE(rtg->rtg_open_zone, oz);
+ return oz;
+}
+
+/*
+ * Find a completely free zone, open it, and return a reference.
+ */
+struct xfs_open_zone *
+xfs_open_zone(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint,
+ bool is_gc)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_group *xg;
+
+ xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
+ if (!xg)
+ xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
+ if (!xg)
+ return NULL;
+
+ set_current_state(TASK_RUNNING);
+ return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
+}
+
+static struct xfs_open_zone *
+xfs_try_open_zone(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz;
+
+ if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
+ return NULL;
+ if (atomic_read(&zi->zi_nr_free_zones) <
+ XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
+ return NULL;
+
+ /*
+ * Increment the open zone count to reserve our slot before dropping
+ * zi_open_zones_lock.
+ */
+ zi->zi_nr_open_zones++;
+ spin_unlock(&zi->zi_open_zones_lock);
+ oz = xfs_open_zone(mp, write_hint, false);
+ spin_lock(&zi->zi_open_zones_lock);
+ if (!oz) {
+ zi->zi_nr_open_zones--;
+ return NULL;
+ }
+
+ atomic_inc(&oz->oz_ref);
+ list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+
+ /*
+ * If this was the last free zone, other waiters might be waiting
+ * on us to write to it as well.
+ */
+ wake_up_all(&zi->zi_zone_wait);
+
+ if (xfs_zoned_need_gc(mp))
+ wake_up_process(zi->zi_gc_thread);
+
+ trace_xfs_zone_opened(oz->oz_rtg);
+ return oz;
+}
+
+/*
+ * For data with short or medium lifetime, try to colocated it into an
+ * already open zone with a matching temperature.
+ */
+static bool
+xfs_colocate_eagerly(
+ enum rw_hint file_hint)
+{
+ switch (file_hint) {
+ case WRITE_LIFE_MEDIUM:
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_NONE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool
+xfs_good_hint_match(
+ struct xfs_open_zone *oz,
+ enum rw_hint file_hint)
+{
+ switch (oz->oz_write_hint) {
+ case WRITE_LIFE_LONG:
+ case WRITE_LIFE_EXTREME:
+ /* colocate long and extreme */
+ if (file_hint == WRITE_LIFE_LONG ||
+ file_hint == WRITE_LIFE_EXTREME)
+ return true;
+ break;
+ case WRITE_LIFE_MEDIUM:
+ /* colocate medium with medium */
+ if (file_hint == WRITE_LIFE_MEDIUM)
+ return true;
+ break;
+ case WRITE_LIFE_SHORT:
+ case WRITE_LIFE_NONE:
+ case WRITE_LIFE_NOT_SET:
+ /* colocate short and none */
+ if (file_hint <= WRITE_LIFE_SHORT)
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool
+xfs_try_use_zone(
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint,
+ struct xfs_open_zone *oz,
+ bool lowspace)
+{
+ if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
+ return false;
+ if (!lowspace && !xfs_good_hint_match(oz, file_hint))
+ return false;
+ if (!atomic_inc_not_zero(&oz->oz_ref))
+ return false;
+
+ /*
+ * If we have a hint set for the data, use that for the zone even if
+ * some data was written already without any hint set, but don't change
+ * the temperature after that as that would make little sense without
+ * tracking per-temperature class written block counts, which is
+ * probably overkill anyway.
+ */
+ if (file_hint != WRITE_LIFE_NOT_SET &&
+ oz->oz_write_hint == WRITE_LIFE_NOT_SET)
+ oz->oz_write_hint = file_hint;
+
+ /*
+ * If we couldn't match by inode or life time we just pick the first
+ * zone with enough space above. For that we want the least busy zone
+ * for some definition of "least" busy. For now this simple LRU
+ * algorithm that rotates every zone to the end of the list will do it,
+ * even if it isn't exactly cache friendly.
+ */
+ if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
+ list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
+ return true;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_lru(
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint,
+ bool lowspace)
+{
+ struct xfs_open_zone *oz;
+
+ lockdep_assert_held(&zi->zi_open_zones_lock);
+
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+ if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
+ return oz;
+
+ cond_resched_lock(&zi->zi_open_zones_lock);
+ return NULL;
+}
+
+static struct xfs_open_zone *
+xfs_select_open_zone_mru(
+ struct xfs_zone_info *zi,
+ enum rw_hint file_hint)
+{
+ struct xfs_open_zone *oz;
+
+ lockdep_assert_held(&zi->zi_open_zones_lock);
+
+ list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
+ if (xfs_try_use_zone(zi, file_hint, oz, false))
+ return oz;
+
+ cond_resched_lock(&zi->zi_open_zones_lock);
+ return NULL;
+}
+
+static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
+{
+ if (xfs_has_nolifetime(ip->i_mount))
+ return WRITE_LIFE_NOT_SET;
+ return VFS_I(ip)->i_write_hint;
+}
+
+/*
+ * Try to pack inodes that are written back after they were closed tight instead
+ * of trying to open new zones for them or spread them to the least recently
+ * used zone. This optimizes the data layout for workloads that untar or copy
+ * a lot of small files. Right now this does not separate multiple such
+ * streams.
+ */
+static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
+{
+ return !inode_is_open_for_write(VFS_I(ip)) &&
+ !(ip->i_diflags & XFS_DIFLAG_APPEND);
+}
+
+static struct xfs_open_zone *
+xfs_select_zone_nowait(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint,
+ bool pack_tight)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz = NULL;
+
+ if (xfs_is_shutdown(mp))
+ return NULL;
+
+ /*
+ * Try to fill up open zones with matching temperature if available. It
+ * is better to try to co-locate data when this is favorable, so we can
+ * activate empty zones when it is statistically better to separate
+ * data.
+ */
+ spin_lock(&zi->zi_open_zones_lock);
+ if (xfs_colocate_eagerly(write_hint))
+ oz = xfs_select_open_zone_lru(zi, write_hint, false);
+ else if (pack_tight)
+ oz = xfs_select_open_zone_mru(zi, write_hint);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * See if we can open a new zone and use that so that data for different
+ * files is mixed as little as possible.
+ */
+ oz = xfs_try_open_zone(mp, write_hint);
+ if (oz)
+ goto out_unlock;
+
+ /*
+ * Try to colocate cold data with other cold data if we failed to open a
+ * new zone for it.
+ */
+ if (write_hint != WRITE_LIFE_NOT_SET &&
+ !xfs_colocate_eagerly(write_hint))
+ oz = xfs_select_open_zone_lru(zi, write_hint, false);
+ if (!oz)
+ oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
+ if (!oz)
+ oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
+out_unlock:
+ spin_unlock(&zi->zi_open_zones_lock);
+ return oz;
+}
+
+static struct xfs_open_zone *
+xfs_select_zone(
+ struct xfs_mount *mp,
+ enum rw_hint write_hint,
+ bool pack_tight)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ DEFINE_WAIT (wait);
+ struct xfs_open_zone *oz;
+
+ oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
+ if (oz)
+ return oz;
+
+ for (;;) {
+ prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
+ oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
+ if (oz || xfs_is_shutdown(mp))
+ break;
+ schedule();
+ }
+ finish_wait(&zi->zi_zone_wait, &wait);
+ return oz;
+}
+
+static unsigned int
+xfs_zone_alloc_blocks(
+ struct xfs_open_zone *oz,
+ xfs_filblks_t count_fsb,
+ sector_t *sector,
+ bool *is_seq)
+{
+ struct xfs_rtgroup *rtg = oz->oz_rtg;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ xfs_rgblock_t allocated;
+
+ spin_lock(&oz->oz_alloc_lock);
+ count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
+ (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
+ if (!count_fsb) {
+ spin_unlock(&oz->oz_alloc_lock);
+ return 0;
+ }
+ allocated = oz->oz_allocated;
+ oz->oz_allocated += count_fsb;
+ spin_unlock(&oz->oz_alloc_lock);
+
+ trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);
+
+ *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
+ if (!*is_seq)
+ *sector += XFS_FSB_TO_BB(mp, allocated);
+ return XFS_FSB_TO_B(mp, count_fsb);
+}
+
+void
+xfs_mark_rtg_boundary(
+ struct iomap_ioend *ioend)
+{
+ struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
+ sector_t sector = ioend->io_bio.bi_iter.bi_sector;
+
+ if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
+ ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+}
+
+/*
+ * Cache the last zone written to for an inode so that it is considered first
+ * for subsequent writes.
+ */
+struct xfs_zone_cache_item {
+ struct xfs_mru_cache_elem mru;
+ struct xfs_open_zone *oz;
+};
+
+static inline struct xfs_zone_cache_item *
+xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
+{
+ return container_of(mru, struct xfs_zone_cache_item, mru);
+}
+
+static void
+xfs_zone_cache_free_func(
+ void *data,
+ struct xfs_mru_cache_elem *mru)
+{
+ struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);
+
+ xfs_open_zone_put(item->oz);
+ kfree(item);
+}
+
+/*
+ * Check if we have a cached last open zone available for the inode and
+ * if yes return a reference to it.
+ */
+static struct xfs_open_zone *
+xfs_cached_zone(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip)
+{
+ struct xfs_mru_cache_elem *mru;
+ struct xfs_open_zone *oz;
+
+ mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
+ if (!mru)
+ return NULL;
+ oz = xfs_zone_cache_item(mru)->oz;
+ if (oz) {
+ /*
+ * GC only steals open zones at mount time, so no GC zones
+ * should end up in the cache.
+ */
+ ASSERT(!oz->oz_is_gc);
+ ASSERT(atomic_read(&oz->oz_ref) > 0);
+ atomic_inc(&oz->oz_ref);
+ }
+ xfs_mru_cache_done(mp->m_zone_cache);
+ return oz;
+}
+
+/*
+ * Update the last used zone cache for a given inode.
+ *
+ * The caller must have a reference on the open zone.
+ */
+static void
+xfs_zone_cache_create_association(
+ struct xfs_inode *ip,
+ struct xfs_open_zone *oz)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_zone_cache_item *item = NULL;
+ struct xfs_mru_cache_elem *mru;
+
+ ASSERT(atomic_read(&oz->oz_ref) > 0);
+ atomic_inc(&oz->oz_ref);
+
+ mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
+ if (mru) {
+ /*
+ * If we have an association already, update it to point to the
+ * new zone.
+ */
+ item = xfs_zone_cache_item(mru);
+ xfs_open_zone_put(item->oz);
+ item->oz = oz;
+ xfs_mru_cache_done(mp->m_zone_cache);
+ return;
+ }
+
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ xfs_open_zone_put(oz);
+ return;
+ }
+ item->oz = oz;
+ xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
+}
+
+static void
+xfs_submit_zoned_bio(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone *oz,
+ bool is_seq)
+{
+ ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
+ ioend->io_private = oz;
+ atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */
+
+ if (is_seq) {
+ ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
+ ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ } else {
+ xfs_mark_rtg_boundary(ioend);
+ }
+
+ submit_bio(&ioend->io_bio);
+}
+
+void
+xfs_zone_alloc_and_submit(
+ struct iomap_ioend *ioend,
+ struct xfs_open_zone **oz)
+{
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
+ enum rw_hint write_hint = xfs_inode_write_hint(ip);
+ bool pack_tight = xfs_zoned_pack_tight(ip);
+ unsigned int alloc_len;
+ struct iomap_ioend *split;
+ bool is_seq;
+
+ if (xfs_is_shutdown(mp))
+ goto out_error;
+
+ /*
+ * If we don't have a cached zone in this write context, see if the
+ * last extent before the one we are writing to points to an active
+ * zone. If so, just continue writing to it.
+ */
+ if (!*oz && ioend->io_offset)
+ *oz = xfs_last_used_zone(ioend);
+ if (!*oz)
+ *oz = xfs_cached_zone(mp, ip);
+
+ if (!*oz) {
+select_zone:
+ *oz = xfs_select_zone(mp, write_hint, pack_tight);
+ if (!*oz)
+ goto out_error;
+
+ xfs_zone_cache_create_association(ip, *oz);
+ }
+
+ alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
+ &ioend->io_sector, &is_seq);
+ if (!alloc_len) {
+ xfs_open_zone_put(*oz);
+ goto select_zone;
+ }
+
+ while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
+ if (IS_ERR(split))
+ goto out_split_error;
+ alloc_len -= split->io_bio.bi_iter.bi_size;
+ xfs_submit_zoned_bio(split, *oz, is_seq);
+ if (!alloc_len) {
+ xfs_open_zone_put(*oz);
+ goto select_zone;
+ }
+ }
+
+ xfs_submit_zoned_bio(ioend, *oz, is_seq);
+ return;
+
+out_split_error:
+ ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
+out_error:
+ bio_io_error(&ioend->io_bio);
+}
+
+/*
+ * Wake up all threads waiting for a zoned space allocation when the file system
+ * is shut down.
+ */
+void
+xfs_zoned_wake_all(
+ struct xfs_mount *mp)
+{
+ /*
+ * Don't wake up if there is no m_zone_info. This is complicated by the
+ * fact that unmount can't atomically clear m_zone_info and thus we need
+ * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE
+ * during log recovery so we can't entirely rely on that either.
+ */
+ if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
+ wake_up_all(&mp->m_zone_info->zi_zone_wait);
+}
+
+/*
+ * Check if @rgbno in @rgb is a potentially valid block. It might still be
+ * unused, but that information is only found in the rmap.
+ */
+bool
+xfs_zone_rgbno_is_valid(
+ struct xfs_rtgroup *rtg,
+ xfs_rgnumber_t rgbno)
+{
+ lockdep_assert_held(&rtg_rmap(rtg)->i_lock);
+
+ if (rtg->rtg_open_zone)
+ return rgbno < rtg->rtg_open_zone->oz_allocated;
+ return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
+ rtg_rgno(rtg), XFS_RTG_FREE);
+}
+
+static void
+xfs_free_open_zones(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz;
+
+ spin_lock(&zi->zi_open_zones_lock);
+ while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
+ struct xfs_open_zone, oz_entry))) {
+ list_del(&oz->oz_entry);
+ xfs_open_zone_put(oz);
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+}
+
+struct xfs_init_zones {
+ struct xfs_mount *mp;
+ uint64_t available;
+ uint64_t reclaimable;
+};
+
+static int
+xfs_init_zone(
+ struct xfs_init_zones *iz,
+ struct xfs_rtgroup *rtg,
+ struct blk_zone *zone)
+{
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t used = rtg_rmap(rtg)->i_used_blocks;
+ xfs_rgblock_t write_pointer, highest_rgbno;
+ int error;
+
+ if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
+ return -EFSCORRUPTED;
+
+ /*
+ * For sequential write required zones we retrieved the hardware write
+ * pointer above.
+ *
+ * For conventional zones or conventional devices we don't have that
+ * luxury. Instead query the rmap to find the highest recorded block
+ * and set the write pointer to the block after that. In case of a
+ * power loss this misses blocks where the data I/O has completed but
+ * not recorded in the rmap yet, and it also rewrites blocks if the most
+ * recently written ones got deleted again before unmount, but this is
+ * the best we can do without hardware support.
+ */
+ if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
+ if (highest_rgbno == NULLRGBLOCK)
+ write_pointer = 0;
+ else
+ write_pointer = highest_rgbno + 1;
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ }
+
+ /*
+ * If there are no used blocks, but the zone is not in empty state yet
+ * we lost power before the zoned reset. In that case finish the work
+ * here.
+ */
+ if (write_pointer == rtg_blocks(rtg) && used == 0) {
+ error = xfs_zone_gc_reset_sync(rtg);
+ if (error)
+ return error;
+ write_pointer = 0;
+ }
+
+ if (write_pointer == 0) {
+ /* zone is empty */
+ atomic_inc(&zi->zi_nr_free_zones);
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ iz->available += rtg_blocks(rtg);
+ } else if (write_pointer < rtg_blocks(rtg)) {
+ /* zone is open */
+ struct xfs_open_zone *oz;
+
+ atomic_inc(&rtg_group(rtg)->xg_active_ref);
+ oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
+ false);
+ list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
+ zi->zi_nr_open_zones++;
+
+ iz->available += (rtg_blocks(rtg) - write_pointer);
+ iz->reclaimable += write_pointer - used;
+ } else if (used < rtg_blocks(rtg)) {
+ /* zone fully written, but has freed blocks */
+ xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
+ iz->reclaimable += (rtg_blocks(rtg) - used);
+ }
+
+ return 0;
+}
+
+static int
+xfs_get_zone_info_cb(
+ struct blk_zone *zone,
+ unsigned int idx,
+ void *data)
+{
+ struct xfs_init_zones *iz = data;
+ struct xfs_mount *mp = iz->mp;
+ xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start);
+ xfs_rgnumber_t rgno;
+ struct xfs_rtgroup *rtg;
+ int error;
+
+ if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
+ xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
+ return -EFSCORRUPTED;
+ }
+
+ rgno = xfs_rtb_to_rgno(mp, zsbno);
+ rtg = xfs_rtgroup_grab(mp, rgno);
+ if (!rtg) {
+ xfs_warn(mp, "realtime group not found for zone %u.", rgno);
+ return -EFSCORRUPTED;
+ }
+ error = xfs_init_zone(iz, rtg, zone);
+ xfs_rtgroup_rele(rtg);
+ return error;
+}
+
+/*
+ * Calculate the max open zone limit based on the of number of backing zones
+ * available.
+ */
+static inline uint32_t
+xfs_max_open_zones(
+ struct xfs_mount *mp)
+{
+ unsigned int max_open, max_open_data_zones;
+
+ /*
+ * We need two zones for every open data zone, one in reserve as we
+ * don't reclaim open zones. One data zone and its spare is included
+ * in XFS_MIN_ZONES to support at least one user data writer.
+ */
+ max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
+ max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;
+
+ /*
+ * Cap the max open limit to 1/4 of available space. Without this we'd
+ * run out of easy reclaim targets too quickly and storage devices don't
+ * handle huge numbers of concurrent write streams overly well.
+ */
+ max_open = min(max_open, mp->m_sb.sb_rgcount / 4);
+
+ return max(XFS_MIN_OPEN_ZONES, max_open);
+}
+
+/*
+ * Normally we use the open zone limit that the device reports. If there is
+ * none let the user pick one from the command line.
+ *
+ * If the device doesn't report an open zone limit and there is no override,
+ * allow to hold about a quarter of the zones open. In theory we could allow
+ * all to be open, but at that point we run into GC deadlocks because we can't
+ * reclaim open zones.
+ *
+ * When used on conventional SSDs a lower open limit is advisable as we'll
+ * otherwise overwhelm the FTL just as much as a conventional block allocator.
+ *
+ * Note: To debug the open zone management code, force max_open to 1 here.
+ */
+static int
+xfs_calc_open_zones(
+ struct xfs_mount *mp)
+{
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ unsigned int bdev_open_zones = bdev_max_open_zones(bdev);
+
+ if (!mp->m_max_open_zones) {
+ if (bdev_open_zones)
+ mp->m_max_open_zones = bdev_open_zones;
+ else
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ }
+
+ if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
+ xfs_notice(mp, "need at least %u open zones.",
+ XFS_MIN_OPEN_ZONES);
+ return -EIO;
+ }
+
+ if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
+ mp->m_max_open_zones = bdev_open_zones;
+ xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
+ bdev_open_zones);
+ }
+
+ if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
+ mp->m_max_open_zones = xfs_max_open_zones(mp);
+ xfs_info(mp,
+"limiting open zones to %u due to total zone count (%u)",
+ mp->m_max_open_zones, mp->m_sb.sb_rgcount);
+ }
+
+ return 0;
+}
+
+static unsigned long *
+xfs_alloc_bucket_bitmap(
+ struct xfs_mount *mp)
+{
+ return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
+ sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
+}
+
+static struct xfs_zone_info *
+xfs_alloc_zone_info(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi;
+ int i;
+
+ zi = kzalloc(sizeof(*zi), GFP_KERNEL);
+ if (!zi)
+ return NULL;
+ INIT_LIST_HEAD(&zi->zi_open_zones);
+ INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
+ spin_lock_init(&zi->zi_reset_list_lock);
+ spin_lock_init(&zi->zi_open_zones_lock);
+ spin_lock_init(&zi->zi_reservation_lock);
+ init_waitqueue_head(&zi->zi_zone_wait);
+ spin_lock_init(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
+ if (!zi->zi_used_bucket_bitmap[i])
+ goto out_free_bitmaps;
+ }
+ return zi;
+
+out_free_bitmaps:
+ while (--i > 0)
+ kvfree(zi->zi_used_bucket_bitmap[i]);
+ kfree(zi);
+ return NULL;
+}
+
+static void
+xfs_free_zone_info(
+ struct xfs_zone_info *zi)
+{
+ int i;
+
+ xfs_free_open_zones(zi);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
+ kvfree(zi->zi_used_bucket_bitmap[i]);
+ kfree(zi);
+}
+
+int
+xfs_mount_zones(
+ struct xfs_mount *mp)
+{
+ struct xfs_init_zones iz = {
+ .mp = mp,
+ };
+ struct xfs_buftarg *bt = mp->m_rtdev_targp;
+ int error;
+
+ if (!bt) {
+ xfs_notice(mp, "RT device missing.");
+ return -EINVAL;
+ }
+
+ if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
+ xfs_notice(mp, "invalid flag combination.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rextsize != 1) {
+ xfs_notice(mp, "zoned file systems do not support rextsize.");
+ return -EFSCORRUPTED;
+ }
+ if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
+ xfs_notice(mp,
+"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
+ return -EFSCORRUPTED;
+ }
+
+ error = xfs_calc_open_zones(mp);
+ if (error)
+ return error;
+
+ mp->m_zone_info = xfs_alloc_zone_info(mp);
+ if (!mp->m_zone_info)
+ return -ENOMEM;
+
+ xfs_info(mp, "%u zones of %u blocks size (%u max open)",
+ mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
+ mp->m_max_open_zones);
+ trace_xfs_zones_mount(mp);
+
+ if (bdev_is_zoned(bt->bt_bdev)) {
+ error = blkdev_report_zones(bt->bt_bdev,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
+ mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
+ if (error < 0)
+ goto out_free_zone_info;
+ } else {
+ struct xfs_rtgroup *rtg = NULL;
+
+ while ((rtg = xfs_rtgroup_next(mp, rtg))) {
+ error = xfs_init_zone(&iz, rtg, NULL);
+ if (error)
+ goto out_free_zone_info;
+ }
+ }
+
+ xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
+ xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
+ iz.available + iz.reclaimable);
+
+ /*
+ * The user may configure GC to free up a percentage of unused blocks.
+ * By default this is 0. GC will always trigger at the minimum level
+ * for keeping max_open_zones available for data placement.
+ */
+ mp->m_zonegc_low_space = 0;
+
+ error = xfs_zone_gc_mount(mp);
+ if (error)
+ goto out_free_zone_info;
+
+ /*
+ * Set up a mru cache to track inode to open zone for data placement
+ * purposes. The magic values for group count and life time is the
+ * same as the defaults for file streams, which seems sane enough.
+ */
+ xfs_mru_cache_create(&mp->m_zone_cache, mp,
+ 5000, 10, xfs_zone_cache_free_func);
+ return 0;
+
+out_free_zone_info:
+ xfs_free_zone_info(mp->m_zone_info);
+ return error;
+}
+
+void
+xfs_unmount_zones(
+ struct xfs_mount *mp)
+{
+ xfs_zone_gc_unmount(mp);
+ xfs_free_zone_info(mp->m_zone_info);
+ xfs_mru_cache_destroy(mp->m_zone_cache);
+}
diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h
new file mode 100644
index 000000000000..4db02816d0fd
--- /dev/null
+++ b/fs/xfs/xfs_zone_alloc.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_ALLOC_H
+#define _XFS_ZONE_ALLOC_H
+
+struct iomap_ioend;
+struct xfs_open_zone;
+
+struct xfs_zone_alloc_ctx {
+ struct xfs_open_zone *open_zone;
+ xfs_filblks_t reserved_blocks;
+};
+
+/*
+ * Grab any available space, even if it is less than what the caller asked for.
+ */
+#define XFS_ZR_GREEDY (1U << 0)
+/*
+ * Only grab instantly available space, don't wait or GC.
+ */
+#define XFS_ZR_NOWAIT (1U << 1)
+/*
+ * Dip into the reserved pool.
+ */
+#define XFS_ZR_RESERVED (1U << 2)
+
+int xfs_zoned_space_reserve(struct xfs_mount *mp, xfs_filblks_t count_fsb,
+ unsigned int flags, struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_space_unreserve(struct xfs_mount *mp,
+ struct xfs_zone_alloc_ctx *ac);
+void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb);
+
+void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend,
+ struct xfs_open_zone **oz);
+int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg,
+ xfs_fsblock_t fsbno, xfs_filblks_t len);
+int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count,
+ xfs_daddr_t daddr, struct xfs_open_zone *oz,
+ xfs_fsblock_t old_startblock);
+void xfs_open_zone_put(struct xfs_open_zone *oz);
+
+void xfs_zoned_wake_all(struct xfs_mount *mp);
+bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno);
+void xfs_mark_rtg_boundary(struct iomap_ioend *ioend);
+
+uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp,
+ enum xfs_free_counter ctr);
+void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp);
+
+#ifdef CONFIG_XFS_RT
+int xfs_mount_zones(struct xfs_mount *mp);
+void xfs_unmount_zones(struct xfs_mount *mp);
+void xfs_zone_gc_start(struct xfs_mount *mp);
+void xfs_zone_gc_stop(struct xfs_mount *mp);
+#else
+static inline int xfs_mount_zones(struct xfs_mount *mp)
+{
+ return -EIO;
+}
+static inline void xfs_unmount_zones(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_start(struct xfs_mount *mp)
+{
+}
+static inline void xfs_zone_gc_stop(struct xfs_mount *mp)
+{
+}
+#endif /* CONFIG_XFS_RT */
+
+#endif /* _XFS_ZONE_ALLOC_H */
diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c
new file mode 100644
index 000000000000..064cd1a857a0
--- /dev/null
+++ b/fs/xfs/xfs_zone_gc.c
@@ -0,0 +1,1178 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_trans.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_rtrmap_btree.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+#include "xfs_trace.h"
+
+/*
+ * Implement Garbage Collection (GC) of partially used zoned.
+ *
+ * To support the purely sequential writes in each zone, zoned XFS needs to be
+ * able to move data remaining in a zone out of it to reset the zone to prepare
+ * for writing to it again.
+ *
+ * This is done by the GC thread implemented in this file. To support that a
+ * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
+ * write the garbage collected data into.
+ *
+ * Whenever the available space is below the chosen threshold, the GC thread
+ * looks for potential non-empty but not fully used zones that are worth
+ * reclaiming. Once found the rmap for the victim zone is queried, and after
+ * a bit of sorting to reduce fragmentation, the still live extents are read
+ * into memory and written to the GC target zone, and the bmap btree of the
+ * files is updated to point to the new location. To avoid taking the IOLOCK
+ * and MMAPLOCK for the entire GC process and thus affecting the latency of
+ * user reads and writes to the files, the GC writes are speculative and the
+ * I/O completion checks that no other writes happened for the affected regions
+ * before remapping.
+ *
+ * Once a zone does not contain any valid data, be that through GC or user
+ * block removal, it is queued for for a zone reset. The reset operation
+ * carefully ensures that the RT device cache is flushed and all transactions
+ * referencing the rmap have been committed to disk.
+ */
+
+/*
+ * Size of each GC scratch pad. This is also the upper bound for each
+ * GC I/O, which helps to keep latency down.
+ */
+#define XFS_GC_CHUNK_SIZE SZ_1M
+
+/*
+ * Scratchpad data to read GCed data into.
+ *
+ * The offset member tracks where the next allocation starts, and freed tracks
+ * the amount of space that is not used anymore.
+ */
+#define XFS_ZONE_GC_NR_SCRATCH 2
+struct xfs_zone_scratch {
+ struct folio *folio;
+ unsigned int offset;
+ unsigned int freed;
+};
+
+/*
+ * Chunk that is read and written for each GC operation.
+ *
+ * Note that for writes to actual zoned devices, the chunk can be split when
+ * reaching the hardware limit.
+ */
+struct xfs_gc_bio {
+ struct xfs_zone_gc_data *data;
+
+ /*
+ * Entry into the reading/writing/resetting list. Only accessed from
+ * the GC thread, so no locking needed.
+ */
+ struct list_head entry;
+
+ /*
+ * State of this gc_bio. Done means the current I/O completed.
+ * Set from the bio end I/O handler, read from the GC thread.
+ */
+ enum {
+ XFS_GC_BIO_NEW,
+ XFS_GC_BIO_DONE,
+ } state;
+
+ /*
+ * Pointer to the inode and byte range in the inode that this
+ * GC chunk is operating on.
+ */
+ struct xfs_inode *ip;
+ loff_t offset;
+ unsigned int len;
+
+ /*
+ * Existing startblock (in the zone to be freed) and newly assigned
+ * daddr in the zone GCed into.
+ */
+ xfs_fsblock_t old_startblock;
+ xfs_daddr_t new_daddr;
+ struct xfs_zone_scratch *scratch;
+
+ /* Are we writing to a sequential write required zone? */
+ bool is_seq;
+
+ /* Open Zone being written to */
+ struct xfs_open_zone *oz;
+
+ /* Bio used for reads and writes, including the bvec used by it */
+ struct bio_vec bv;
+ struct bio bio; /* must be last */
+};
+
+#define XFS_ZONE_GC_RECS 1024
+
+/* iterator, needs to be reinitialized for each victim zone */
+struct xfs_zone_gc_iter {
+ struct xfs_rtgroup *victim_rtg;
+ unsigned int rec_count;
+ unsigned int rec_idx;
+ xfs_agblock_t next_startblock;
+ struct xfs_rmap_irec *recs;
+};
+
+/*
+ * Per-mount GC state.
+ */
+struct xfs_zone_gc_data {
+ struct xfs_mount *mp;
+
+ /* bioset used to allocate the gc_bios */
+ struct bio_set bio_set;
+
+ /*
+ * Scratchpad used, and index to indicated which one is used.
+ */
+ struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
+ unsigned int scratch_idx;
+
+ /*
+ * List of bios currently being read, written and reset.
+ * These lists are only accessed by the GC thread itself, and must only
+ * be processed in order.
+ */
+ struct list_head reading;
+ struct list_head writing;
+ struct list_head resetting;
+
+ /*
+ * Iterator for the victim zone.
+ */
+ struct xfs_zone_gc_iter iter;
+};
+
+/*
+ * We aim to keep enough zones free in stock to fully use the open zone limit
+ * for data placement purposes. Additionally, the m_zonegc_low_space tunable
+ * can be set to make sure a fraction of the unused blocks are available for
+ * writing.
+ */
+bool
+xfs_zoned_need_gc(
+ struct xfs_mount *mp)
+{
+ s64 available, free, threshold;
+ s32 remainder;
+
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
+ return false;
+
+ available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
+
+ if (available <
+ mp->m_groups[XG_TYPE_RTG].blocks *
+ (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
+ return true;
+
+ free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
+
+ threshold = div_s64_rem(free, 100, &remainder);
+ threshold = threshold * mp->m_zonegc_low_space +
+ remainder * div_s64(mp->m_zonegc_low_space, 100);
+
+ if (available < threshold)
+ return true;
+
+ return false;
+}
+
+static struct xfs_zone_gc_data *
+xfs_zone_gc_data_alloc(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_gc_data *data;
+ int i;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return NULL;
+ data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
+ GFP_KERNEL);
+ if (!data->iter.recs)
+ goto out_free_data;
+
+ /*
+ * We actually only need a single bio_vec. It would be nice to have
+ * a flag that only allocates the inline bvecs and not the separate
+ * bvec pool.
+ */
+ if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
+ BIOSET_NEED_BVECS))
+ goto out_free_recs;
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
+ data->scratch[i].folio =
+ folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
+ if (!data->scratch[i].folio)
+ goto out_free_scratch;
+ }
+ INIT_LIST_HEAD(&data->reading);
+ INIT_LIST_HEAD(&data->writing);
+ INIT_LIST_HEAD(&data->resetting);
+ data->mp = mp;
+ return data;
+
+out_free_scratch:
+ while (--i >= 0)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+out_free_recs:
+ kfree(data->iter.recs);
+out_free_data:
+ kfree(data);
+ return NULL;
+}
+
+static void
+xfs_zone_gc_data_free(
+ struct xfs_zone_gc_data *data)
+{
+ int i;
+
+ for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
+ folio_put(data->scratch[i].folio);
+ bioset_exit(&data->bio_set);
+ kfree(data->iter.recs);
+ kfree(data);
+}
+
+static void
+xfs_zone_gc_iter_init(
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rtgroup *victim_rtg)
+
+{
+ iter->next_startblock = 0;
+ iter->rec_count = 0;
+ iter->rec_idx = 0;
+ iter->victim_rtg = victim_rtg;
+}
+
+/*
+ * Query the rmap of the victim zone to gather the records to evacuate.
+ */
+static int
+xfs_zone_gc_query_cb(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *irec,
+ void *private)
+{
+ struct xfs_zone_gc_iter *iter = private;
+
+ ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
+ ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
+ ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
+
+ iter->recs[iter->rec_count] = *irec;
+ if (++iter->rec_count == XFS_ZONE_GC_RECS) {
+ iter->next_startblock =
+ irec->rm_startblock + irec->rm_blockcount;
+ return 1;
+ }
+ return 0;
+}
+
+static int
+xfs_zone_gc_rmap_rec_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_rmap_irec *reca = a;
+ const struct xfs_rmap_irec *recb = b;
+ int diff;
+
+ diff = cmp_int(reca->rm_owner, recb->rm_owner);
+ if (diff)
+ return diff;
+ return cmp_int(reca->rm_offset, recb->rm_offset);
+}
+
+static int
+xfs_zone_gc_query(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter)
+{
+ struct xfs_rtgroup *rtg = iter->victim_rtg;
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_btree_cur *cur;
+ struct xfs_trans *tp;
+ int error;
+
+ ASSERT(iter->next_startblock <= rtg_blocks(rtg));
+ if (iter->next_startblock == rtg_blocks(rtg))
+ goto done;
+
+ ASSERT(iter->next_startblock < rtg_blocks(rtg));
+ ri_low.rm_startblock = iter->next_startblock;
+ memset(&ri_high, 0xFF, sizeof(ri_high));
+
+ iter->rec_idx = 0;
+ iter->rec_count = 0;
+
+ tp = xfs_trans_alloc_empty(mp);
+ xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
+ cur = xfs_rtrmapbt_init_cursor(tp, rtg);
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_zone_gc_query_cb, iter);
+ xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
+ xfs_btree_del_cursor(cur, error < 0 ? error : 0);
+ xfs_trans_cancel(tp);
+
+ if (error < 0)
+ return error;
+
+ /*
+ * Sort the rmap records by inode number and increasing offset to
+ * defragment the mappings.
+ *
+ * This could be further enhanced by an even bigger look ahead window,
+ * but that's better left until we have better detection of changes to
+ * inode mapping to avoid the potential of GCing already dead data.
+ */
+ sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
+ xfs_zone_gc_rmap_rec_cmp, NULL);
+
+ if (error == 0) {
+ /*
+ * We finished iterating through the zone.
+ */
+ iter->next_startblock = rtg_blocks(rtg);
+ if (iter->rec_count == 0)
+ goto done;
+ }
+
+ return 0;
+done:
+ xfs_rtgroup_rele(iter->victim_rtg);
+ iter->victim_rtg = NULL;
+ return 0;
+}
+
+static bool
+xfs_zone_gc_iter_next(
+ struct xfs_mount *mp,
+ struct xfs_zone_gc_iter *iter,
+ struct xfs_rmap_irec *chunk_rec,
+ struct xfs_inode **ipp)
+{
+ struct xfs_rmap_irec *irec;
+ int error;
+
+ if (!iter->victim_rtg)
+ return false;
+
+retry:
+ if (iter->rec_idx == iter->rec_count) {
+ error = xfs_zone_gc_query(mp, iter);
+ if (error)
+ goto fail;
+ if (!iter->victim_rtg)
+ return false;
+ }
+
+ irec = &iter->recs[iter->rec_idx];
+ error = xfs_iget(mp, NULL, irec->rm_owner,
+ XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
+ if (error) {
+ /*
+ * If the inode was already deleted, skip over it.
+ */
+ if (error == -ENOENT) {
+ iter->rec_idx++;
+ goto retry;
+ }
+ goto fail;
+ }
+
+ if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
+ iter->rec_idx++;
+ xfs_irele(*ipp);
+ goto retry;
+ }
+
+ *chunk_rec = *irec;
+ return true;
+
+fail:
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ return false;
+}
+
+static void
+xfs_zone_gc_iter_advance(
+ struct xfs_zone_gc_iter *iter,
+ xfs_extlen_t count_fsb)
+{
+ struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
+
+ irec->rm_offset += count_fsb;
+ irec->rm_startblock += count_fsb;
+ irec->rm_blockcount -= count_fsb;
+ if (!irec->rm_blockcount)
+ iter->rec_idx++;
+}
+
+static struct xfs_rtgroup *
+xfs_zone_gc_pick_victim_from(
+ struct xfs_mount *mp,
+ uint32_t bucket)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ uint32_t victim_used = U32_MAX;
+ struct xfs_rtgroup *victim_rtg = NULL;
+ uint32_t bit;
+
+ if (!zi->zi_used_bucket_entries[bucket])
+ return NULL;
+
+ for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
+ mp->m_sb.sb_rgcount) {
+ struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
+
+ if (!rtg)
+ continue;
+
+ /* skip zones that are just waiting for a reset */
+ if (rtg_rmap(rtg)->i_used_blocks == 0 ||
+ rtg_rmap(rtg)->i_used_blocks >= victim_used) {
+ xfs_rtgroup_rele(rtg);
+ continue;
+ }
+
+ if (victim_rtg)
+ xfs_rtgroup_rele(victim_rtg);
+ victim_rtg = rtg;
+ victim_used = rtg_rmap(rtg)->i_used_blocks;
+
+ /*
+ * Any zone that is less than 1 percent used is fair game for
+ * instant reclaim. All of these zones are in the last
+ * bucket, so avoid the expensive division for the zones
+ * in the other buckets.
+ */
+ if (bucket == 0 &&
+ rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
+ break;
+ }
+
+ return victim_rtg;
+}
+
+/*
+ * Iterate through all zones marked as reclaimable and find a candidate to
+ * reclaim.
+ */
+static bool
+xfs_zone_gc_select_victim(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_gc_iter *iter = &data->iter;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_rtgroup *victim_rtg = NULL;
+ unsigned int bucket;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (iter->victim_rtg)
+ return true;
+
+ /*
+ * Don't start new work if we are asked to stop or park.
+ */
+ if (kthread_should_stop() || kthread_should_park())
+ return false;
+
+ if (!xfs_zoned_need_gc(mp))
+ return false;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
+ victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
+ if (victim_rtg)
+ break;
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ if (!victim_rtg)
+ return false;
+
+ trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
+ xfs_zone_gc_iter_init(iter, victim_rtg);
+ return true;
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_steal_open(
+ struct xfs_zone_info *zi)
+{
+ struct xfs_open_zone *oz, *found = NULL;
+
+ spin_lock(&zi->zi_open_zones_lock);
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
+ if (!found || oz->oz_allocated < found->oz_allocated)
+ found = oz;
+ }
+
+ if (found) {
+ found->oz_is_gc = true;
+ list_del_init(&found->oz_entry);
+ zi->zi_nr_open_zones--;
+ }
+
+ spin_unlock(&zi->zi_open_zones_lock);
+ return found;
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_select_target(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz = zi->zi_open_gc_zone;
+
+ /*
+ * We need to wait for pending writes to finish.
+ */
+ if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
+ return NULL;
+
+ ASSERT(zi->zi_nr_open_zones <=
+ mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
+ oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
+ if (oz)
+ trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+ spin_lock(&zi->zi_open_zones_lock);
+ zi->zi_open_gc_zone = oz;
+ spin_unlock(&zi->zi_open_zones_lock);
+ return oz;
+}
+
+/*
+ * Ensure we have a valid open zone to write the GC data to.
+ *
+ * If the current target zone has space keep writing to it, else first wait for
+ * all pending writes and then pick a new one.
+ */
+static struct xfs_open_zone *
+xfs_zone_gc_ensure_target(
+ struct xfs_mount *mp)
+{
+ struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
+
+ if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
+ return xfs_zone_gc_select_target(mp);
+ return oz;
+}
+
+static unsigned int
+xfs_zone_gc_scratch_available(
+ struct xfs_zone_gc_data *data)
+{
+ return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
+}
+
+static bool
+xfs_zone_gc_space_available(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_open_zone *oz;
+
+ oz = xfs_zone_gc_ensure_target(data->mp);
+ if (!oz)
+ return false;
+ return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
+ xfs_zone_gc_scratch_available(data);
+}
+
+static void
+xfs_zone_gc_end_io(
+ struct bio *bio)
+{
+ struct xfs_gc_bio *chunk =
+ container_of(bio, struct xfs_gc_bio, bio);
+ struct xfs_zone_gc_data *data = chunk->data;
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
+ wake_up_process(data->mp->m_zone_info->zi_gc_thread);
+}
+
+static struct xfs_open_zone *
+xfs_zone_gc_alloc_blocks(
+ struct xfs_zone_gc_data *data,
+ xfs_extlen_t *count_fsb,
+ xfs_daddr_t *daddr,
+ bool *is_seq)
+{
+ struct xfs_mount *mp = data->mp;
+ struct xfs_open_zone *oz;
+
+ oz = xfs_zone_gc_ensure_target(mp);
+ if (!oz)
+ return NULL;
+
+ *count_fsb = min(*count_fsb,
+ XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
+
+ /*
+ * Directly allocate GC blocks from the reserved pool.
+ *
+ * If we'd take them from the normal pool we could be stealing blocks
+ * from a regular writer, which would then have to wait for GC and
+ * deadlock.
+ */
+ spin_lock(&mp->m_sb_lock);
+ *count_fsb = min(*count_fsb,
+ rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
+ *count_fsb = min3(*count_fsb,
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail,
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (!*count_fsb)
+ return NULL;
+
+ *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
+ *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
+ if (!*is_seq)
+ *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
+ oz->oz_allocated += *count_fsb;
+ atomic_inc(&oz->oz_ref);
+ return oz;
+}
+
+static bool
+xfs_zone_gc_start_chunk(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_gc_iter *iter = &data->iter;
+ struct xfs_mount *mp = data->mp;
+ struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
+ struct xfs_open_zone *oz;
+ struct xfs_rmap_irec irec;
+ struct xfs_gc_bio *chunk;
+ struct xfs_inode *ip;
+ struct bio *bio;
+ xfs_daddr_t daddr;
+ bool is_seq;
+
+ if (xfs_is_shutdown(mp))
+ return false;
+
+ if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
+ return false;
+ oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
+ &is_seq);
+ if (!oz) {
+ xfs_irele(ip);
+ return false;
+ }
+
+ bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->ip = ip;
+ chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
+ chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
+ chunk->old_startblock =
+ xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
+ chunk->new_daddr = daddr;
+ chunk->is_seq = is_seq;
+ chunk->scratch = &data->scratch[data->scratch_idx];
+ chunk->data = data;
+ chunk->oz = oz;
+
+ bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
+ bio->bi_end_io = xfs_zone_gc_end_io;
+ bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
+ chunk->scratch->offset);
+ chunk->scratch->offset += chunk->len;
+ if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
+ data->scratch_idx =
+ (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
+ }
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->reading);
+ xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
+
+ submit_bio(bio);
+ return true;
+}
+
+static void
+xfs_zone_gc_free_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ list_del(&chunk->entry);
+ xfs_open_zone_put(chunk->oz);
+ xfs_irele(chunk->ip);
+ bio_put(&chunk->bio);
+}
+
+static void
+xfs_zone_gc_submit_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ if (chunk->is_seq) {
+ chunk->bio.bi_opf &= ~REQ_OP_WRITE;
+ chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
+ }
+ chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
+ chunk->bio.bi_end_io = xfs_zone_gc_end_io;
+ submit_bio(&chunk->bio);
+}
+
+static struct xfs_gc_bio *
+xfs_zone_gc_split_write(
+ struct xfs_zone_gc_data *data,
+ struct xfs_gc_bio *chunk)
+{
+ struct queue_limits *lim =
+ &bdev_get_queue(chunk->bio.bi_bdev)->limits;
+ struct xfs_gc_bio *split_chunk;
+ int split_sectors;
+ unsigned int split_len;
+ struct bio *split;
+ unsigned int nsegs;
+
+ if (!chunk->is_seq)
+ return NULL;
+
+ split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+ if (!split_sectors)
+ return NULL;
+
+ /* ensure the split chunk is still block size aligned */
+ split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
+ data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
+ split_len = split_sectors << SECTOR_SHIFT;
+
+ split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
+ split_chunk = container_of(split, struct xfs_gc_bio, bio);
+ split_chunk->data = data;
+ ihold(VFS_I(chunk->ip));
+ split_chunk->ip = chunk->ip;
+ split_chunk->is_seq = chunk->is_seq;
+ split_chunk->scratch = chunk->scratch;
+ split_chunk->offset = chunk->offset;
+ split_chunk->len = split_len;
+ split_chunk->old_startblock = chunk->old_startblock;
+ split_chunk->new_daddr = chunk->new_daddr;
+ split_chunk->oz = chunk->oz;
+ atomic_inc(&chunk->oz->oz_ref);
+
+ chunk->offset += split_len;
+ chunk->len -= split_len;
+ chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
+
+ /* add right before the original chunk */
+ WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&split_chunk->entry, &chunk->entry);
+ return split_chunk;
+}
+
+static void
+xfs_zone_gc_write_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_zone_gc_data *data = chunk->data;
+ struct xfs_mount *mp = chunk->ip->i_mount;
+ phys_addr_t bvec_paddr =
+ bvec_phys(bio_first_bvec_all(&chunk->bio));
+ struct xfs_gc_bio *split_chunk;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_move_tail(&chunk->entry, &data->writing);
+
+ bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
+ bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
+ offset_in_folio(chunk->scratch->folio, bvec_paddr));
+
+ while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
+ xfs_zone_gc_submit_write(data, split_chunk);
+ xfs_zone_gc_submit_write(data, chunk);
+}
+
+static void
+xfs_zone_gc_finish_chunk(
+ struct xfs_gc_bio *chunk)
+{
+ uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+ struct xfs_inode *ip = chunk->ip;
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ if (chunk->bio.bi_status)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ if (xfs_is_shutdown(mp)) {
+ xfs_zone_gc_free_chunk(chunk);
+ return;
+ }
+
+ chunk->scratch->freed += chunk->len;
+ if (chunk->scratch->freed == chunk->scratch->offset) {
+ chunk->scratch->offset = 0;
+ chunk->scratch->freed = 0;
+ }
+
+ /*
+ * Cycle through the iolock and wait for direct I/O and layouts to
+ * ensure no one is reading from the old mapping before it goes away.
+ *
+ * Note that xfs_zoned_end_io() below checks that no other writer raced
+ * with us to update the mapping by checking that the old startblock
+ * didn't change.
+ */
+ xfs_ilock(ip, iolock);
+ error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
+ if (!error)
+ inode_dio_wait(VFS_I(ip));
+ xfs_iunlock(ip, iolock);
+ if (error)
+ goto free;
+
+ if (chunk->is_seq)
+ chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
+ error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
+ chunk->new_daddr, chunk->oz, chunk->old_startblock);
+free:
+ if (error)
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ xfs_zone_gc_free_chunk(chunk);
+}
+
+static void
+xfs_zone_gc_finish_reset(
+ struct xfs_gc_bio *chunk)
+{
+ struct xfs_rtgroup *rtg = chunk->bio.bi_private;
+ struct xfs_mount *mp = rtg_mount(rtg);
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ if (chunk->bio.bi_status) {
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ goto out;
+ }
+
+ xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
+ atomic_inc(&zi->zi_nr_free_zones);
+
+ xfs_zoned_add_available(mp, rtg_blocks(rtg));
+
+ wake_up_all(&zi->zi_zone_wait);
+out:
+ list_del(&chunk->entry);
+ bio_put(&chunk->bio);
+}
+
+static bool
+xfs_zone_gc_prepare_reset(
+ struct bio *bio,
+ struct xfs_rtgroup *rtg)
+{
+ trace_xfs_zone_reset(rtg);
+
+ ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
+ bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
+ if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
+ if (!bdev_max_discard_sectors(bio->bi_bdev))
+ return false;
+ bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
+ bio->bi_iter.bi_size =
+ XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
+ }
+
+ return true;
+}
+
+int
+xfs_zone_gc_reset_sync(
+ struct xfs_rtgroup *rtg)
+{
+ int error = 0;
+ struct bio bio;
+
+ bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
+ REQ_OP_ZONE_RESET);
+ if (xfs_zone_gc_prepare_reset(&bio, rtg))
+ error = submit_bio_wait(&bio);
+ bio_uninit(&bio);
+
+ return error;
+}
+
+static void
+xfs_zone_gc_reset_zones(
+ struct xfs_zone_gc_data *data,
+ struct xfs_group *reset_list)
+{
+ struct xfs_group *next = reset_list;
+
+ if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
+ xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
+ return;
+ }
+
+ do {
+ struct xfs_rtgroup *rtg = to_rtg(next);
+ struct xfs_gc_bio *chunk;
+ struct bio *bio;
+
+ xfs_log_force_inode(rtg_rmap(rtg));
+
+ next = rtg_group(rtg)->xg_next_reset;
+ rtg_group(rtg)->xg_next_reset = NULL;
+
+ bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
+ 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
+ bio->bi_private = rtg;
+ bio->bi_end_io = xfs_zone_gc_end_io;
+
+ chunk = container_of(bio, struct xfs_gc_bio, bio);
+ chunk->data = data;
+ WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
+ list_add_tail(&chunk->entry, &data->resetting);
+
+ /*
+ * Also use the bio to drive the state machine when neither
+ * zone reset nor discard is supported to keep things simple.
+ */
+ if (xfs_zone_gc_prepare_reset(bio, rtg))
+ submit_bio(bio);
+ else
+ bio_endio(bio);
+ } while (next);
+}
+
+/*
+ * Handle the work to read and write data for GC and to reset the zones,
+ * including handling all completions.
+ *
+ * Note that the order of the chunks is preserved so that we don't undo the
+ * optimal order established by xfs_zone_gc_query().
+ */
+static bool
+xfs_zone_gc_handle_work(
+ struct xfs_zone_gc_data *data)
+{
+ struct xfs_zone_info *zi = data->mp->m_zone_info;
+ struct xfs_gc_bio *chunk, *next;
+ struct xfs_group *reset_list;
+ struct blk_plug plug;
+
+ spin_lock(&zi->zi_reset_list_lock);
+ reset_list = zi->zi_reset_list;
+ zi->zi_reset_list = NULL;
+ spin_unlock(&zi->zi_reset_list_lock);
+
+ if (!xfs_zone_gc_select_victim(data) ||
+ !xfs_zone_gc_space_available(data)) {
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !reset_list)
+ return false;
+ }
+
+ __set_current_state(TASK_RUNNING);
+ try_to_freeze();
+
+ if (reset_list)
+ xfs_zone_gc_reset_zones(data, reset_list);
+
+ list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_reset(chunk);
+ }
+
+ list_for_each_entry_safe(chunk, next, &data->writing, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_finish_chunk(chunk);
+ }
+
+ blk_start_plug(&plug);
+ list_for_each_entry_safe(chunk, next, &data->reading, entry) {
+ if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ break;
+ xfs_zone_gc_write_chunk(chunk);
+ }
+ blk_finish_plug(&plug);
+
+ blk_start_plug(&plug);
+ while (xfs_zone_gc_start_chunk(data))
+ ;
+ blk_finish_plug(&plug);
+ return true;
+}
+
+/*
+ * Note that the current GC algorithm would break reflinks and thus duplicate
+ * data that was shared by multiple owners before. Because of that reflinks
+ * are currently not supported on zoned file systems and can't be created or
+ * mounted.
+ */
+static int
+xfs_zoned_gcd(
+ void *private)
+{
+ struct xfs_zone_gc_data *data = private;
+ struct xfs_mount *mp = data->mp;
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
+ set_freezable();
+
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+ xfs_set_zonegc_running(mp);
+ if (xfs_zone_gc_handle_work(data))
+ continue;
+
+ if (list_empty(&data->reading) &&
+ list_empty(&data->writing) &&
+ list_empty(&data->resetting) &&
+ !zi->zi_reset_list) {
+ xfs_clear_zonegc_running(mp);
+ xfs_zoned_resv_wake_all(mp);
+
+ if (kthread_should_stop()) {
+ __set_current_state(TASK_RUNNING);
+ break;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ kthread_parkme();
+ continue;
+ }
+ }
+
+ schedule();
+ }
+ xfs_clear_zonegc_running(mp);
+
+ if (data->iter.victim_rtg)
+ xfs_rtgroup_rele(data->iter.victim_rtg);
+
+ memalloc_nofs_restore(nofs_flag);
+ xfs_zone_gc_data_free(data);
+ return 0;
+}
+
+void
+xfs_zone_gc_start(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_unpark(mp->m_zone_info->zi_gc_thread);
+}
+
+void
+xfs_zone_gc_stop(
+ struct xfs_mount *mp)
+{
+ if (xfs_has_zoned(mp))
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+}
+
+int
+xfs_zone_gc_mount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_gc_data *data;
+ struct xfs_open_zone *oz;
+ int error;
+
+ /*
+ * If there are no free zones available for GC, pick the open zone with
+ * the least used space to GC into. This should only happen after an
+ * unclean shutdown near ENOSPC while GC was ongoing.
+ *
+ * We also need to do this for the first gc zone allocation if we
+ * unmounted while at the open limit.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
+ zi->zi_nr_open_zones == mp->m_max_open_zones)
+ oz = xfs_zone_gc_steal_open(zi);
+ else
+ oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
+ if (!oz) {
+ xfs_warn(mp, "unable to allocate a zone for gc");
+ error = -EIO;
+ goto out;
+ }
+
+ trace_xfs_zone_gc_target_opened(oz->oz_rtg);
+ zi->zi_open_gc_zone = oz;
+
+ data = xfs_zone_gc_data_alloc(mp);
+ if (!data) {
+ error = -ENOMEM;
+ goto out_put_gc_zone;
+ }
+
+ mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
+ "xfs-zone-gc/%s", mp->m_super->s_id);
+ if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
+ xfs_warn(mp, "unable to create zone gc thread");
+ error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
+ goto out_free_gc_data;
+ }
+
+ /* xfs_zone_gc_start will unpark for rw mounts */
+ kthread_park(mp->m_zone_info->zi_gc_thread);
+ return 0;
+
+out_free_gc_data:
+ kfree(data);
+out_put_gc_zone:
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+out:
+ return error;
+}
+
+void
+xfs_zone_gc_unmount(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+
+ kthread_stop(zi->zi_gc_thread);
+ if (zi->zi_open_gc_zone)
+ xfs_open_zone_put(zi->zi_open_gc_zone);
+}
diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c
new file mode 100644
index 000000000000..07e30c596975
--- /dev/null
+++ b/fs/xfs/xfs_zone_info.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtgroup.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+
+static const char xfs_write_hint_shorthand[6][16] = {
+ "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"};
+
+static inline const char *
+xfs_write_hint_to_str(
+ uint8_t write_hint)
+{
+ if (write_hint > WRITE_LIFE_EXTREME)
+ return "UNKNOWN";
+ return xfs_write_hint_shorthand[write_hint];
+}
+
+static void
+xfs_show_open_zone(
+ struct seq_file *m,
+ struct xfs_open_zone *oz)
+{
+ seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n",
+ rtg_rgno(oz->oz_rtg),
+ oz->oz_allocated, oz->oz_written,
+ rtg_rmap(oz->oz_rtg)->i_used_blocks,
+ xfs_write_hint_to_str(oz->oz_write_hint));
+}
+
+static void
+xfs_show_full_zone_used_distribution(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ unsigned int reclaimable = 0, full, i;
+
+ spin_lock(&zi->zi_used_buckets_lock);
+ for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
+ unsigned int entries = zi->zi_used_bucket_entries[i];
+
+ seq_printf(m, "\t %2u..%2u%%: %u\n",
+ i * (100 / XFS_ZONE_USED_BUCKETS),
+ (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1,
+ entries);
+ reclaimable += entries;
+ }
+ spin_unlock(&zi->zi_used_buckets_lock);
+
+ full = mp->m_sb.sb_rgcount;
+ if (zi->zi_open_gc_zone)
+ full--;
+ full -= zi->zi_nr_open_zones;
+ full -= atomic_read(&zi->zi_nr_free_zones);
+ full -= reclaimable;
+
+ seq_printf(m, "\t 100%%: %u\n", full);
+}
+
+void
+xfs_zoned_show_stats(
+ struct seq_file *m,
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_open_zone *oz;
+
+ seq_puts(m, "\n");
+
+ seq_printf(m, "\tuser free RT blocks: %lld\n",
+ xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ seq_printf(m, "\treserved free RT blocks: %lld\n",
+ mp->m_free[XC_FREE_RTEXTENTS].res_avail);
+ seq_printf(m, "\tuser available RT blocks: %lld\n",
+ xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE));
+ seq_printf(m, "\treserved available RT blocks: %lld\n",
+ mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
+ seq_printf(m, "\tRT reservations required: %d\n",
+ !list_empty_careful(&zi->zi_reclaim_reservations));
+ seq_printf(m, "\tRT GC required: %d\n",
+ xfs_zoned_need_gc(mp));
+
+ seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones));
+ seq_puts(m, "\topen zones:\n");
+ spin_lock(&zi->zi_open_zones_lock);
+ list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
+ xfs_show_open_zone(m, oz);
+ if (zi->zi_open_gc_zone) {
+ seq_puts(m, "\topen gc zone:\n");
+ xfs_show_open_zone(m, zi->zi_open_gc_zone);
+ }
+ spin_unlock(&zi->zi_open_zones_lock);
+ seq_puts(m, "\tused blocks distribution (fully written zones):\n");
+ xfs_show_full_zone_used_distribution(m, mp);
+}
diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h
new file mode 100644
index 000000000000..35e6de3d25ed
--- /dev/null
+++ b/fs/xfs/xfs_zone_priv.h
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _XFS_ZONE_PRIV_H
+#define _XFS_ZONE_PRIV_H
+
+struct xfs_open_zone {
+ /*
+ * Entry in the open zone list and refcount. Protected by
+ * zi_open_zones_lock in struct xfs_zone_info.
+ */
+ struct list_head oz_entry;
+ atomic_t oz_ref;
+
+ /*
+ * oz_allocated is the amount of space already allocated out of the zone
+ * and is protected by oz_alloc_lock.
+ *
+ * For conventional zones it also is the offset of the next write.
+ */
+ spinlock_t oz_alloc_lock;
+ xfs_rgblock_t oz_allocated;
+
+ /*
+ * oz_written is the number of blocks for which we've received a write
+ * completion. oz_written must always be <= oz_allocated and is
+ * protected by the ILOCK of the rmap inode.
+ */
+ xfs_rgblock_t oz_written;
+
+ /*
+ * Write hint (data temperature) assigned to this zone, or
+ * WRITE_LIFE_NOT_SET if none was set.
+ */
+ enum rw_hint oz_write_hint;
+
+ /*
+ * Is this open zone used for garbage collection? There can only be a
+ * single open GC zone, which is pointed to by zi_open_gc_zone in
+ * struct xfs_zone_info. Constant over the life time of an open zone.
+ */
+ bool oz_is_gc;
+
+ /*
+ * Pointer to the RT groups structure for this open zone. Constant over
+ * the life time of an open zone.
+ */
+ struct xfs_rtgroup *oz_rtg;
+};
+
+/*
+ * Number of bitmap buckets to track reclaimable zones. There are 10 buckets
+ * so that each 10% of the usable capacity get their own bucket and GC can
+ * only has to walk the bitmaps of the lesser used zones if there are any.
+ */
+#define XFS_ZONE_USED_BUCKETS 10u
+
+struct xfs_zone_info {
+ /*
+ * List of pending space reservations:
+ */
+ spinlock_t zi_reservation_lock;
+ struct list_head zi_reclaim_reservations;
+
+ /*
+ * List and number of open zones:
+ */
+ spinlock_t zi_open_zones_lock;
+ struct list_head zi_open_zones;
+ unsigned int zi_nr_open_zones;
+
+ /*
+ * Free zone search cursor and number of free zones:
+ */
+ unsigned long zi_free_zone_cursor;
+ atomic_t zi_nr_free_zones;
+
+ /*
+ * Wait queue to wait for free zones or open zone resources to become
+ * available:
+ */
+ wait_queue_head_t zi_zone_wait;
+
+ /*
+ * Pointer to the GC thread, and the current open zone used by GC
+ * (if any).
+ *
+ * zi_open_gc_zone is mostly private to the GC thread, but can be read
+ * for debugging from other threads, in which case zi_open_zones_lock
+ * must be taken to access it.
+ */
+ struct task_struct *zi_gc_thread;
+ struct xfs_open_zone *zi_open_gc_zone;
+
+ /*
+ * List of zones that need a reset:
+ */
+ spinlock_t zi_reset_list_lock;
+ struct xfs_group *zi_reset_list;
+
+ /*
+ * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help
+ * garbage collection to quickly find the best candidate for reclaim.
+ */
+ spinlock_t zi_used_buckets_lock;
+ unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS];
+ unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS];
+
+};
+
+struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp,
+ enum rw_hint write_hint, bool is_gc);
+
+int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg);
+bool xfs_zoned_need_gc(struct xfs_mount *mp);
+int xfs_zone_gc_mount(struct xfs_mount *mp);
+void xfs_zone_gc_unmount(struct xfs_mount *mp);
+
+void xfs_zoned_resv_wake_all(struct xfs_mount *mp);
+
+#endif /* _XFS_ZONE_PRIV_H */
diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c
new file mode 100644
index 000000000000..1313c55b8cbe
--- /dev/null
+++ b/fs/xfs/xfs_zone_space_resv.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023-2025 Christoph Hellwig.
+ * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_zone_alloc.h"
+#include "xfs_zone_priv.h"
+#include "xfs_zones.h"
+
+/*
+ * Note: the zoned allocator does not support a rtextsize > 1, so this code and
+ * the allocator itself uses file system blocks interchangeable with realtime
+ * extents without doing the otherwise required conversions.
+ */
+
+/*
+ * Per-task space reservation.
+ *
+ * Tasks that need to wait for GC to free up space allocate one of these
+ * on-stack and adds it to the per-mount zi_reclaim_reservations lists.
+ * The GC thread will then wake the tasks in order when space becomes available.
+ */
+struct xfs_zone_reservation {
+ struct list_head entry;
+ struct task_struct *task;
+ xfs_filblks_t count_fsb;
+};
+
+/*
+ * Calculate the number of reserved blocks.
+ *
+ * XC_FREE_RTEXTENTS counts the user available capacity, to which the file
+ * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
+ * available for writes without waiting for GC.
+ *
+ * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
+ * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
+ * is further restricted by at least one zone as well as the optional
+ * persistently reserved blocks. This allows the allocator to run more
+ * smoothly by not always triggering GC.
+ */
+uint64_t
+xfs_zoned_default_resblks(
+ struct xfs_mount *mp,
+ enum xfs_free_counter ctr)
+{
+ switch (ctr) {
+ case XC_FREE_RTEXTENTS:
+ return (uint64_t)XFS_RESERVED_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks +
+ mp->m_sb.sb_rtreserved;
+ case XC_FREE_RTAVAILABLE:
+ return (uint64_t)XFS_GC_ZONES *
+ mp->m_groups[XG_TYPE_RTG].blocks;
+ default:
+ ASSERT(0);
+ return 0;
+ }
+}
+
+void
+xfs_zoned_resv_wake_all(
+ struct xfs_mount *mp)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
+ wake_up_process(reservation->task);
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+void
+xfs_zoned_add_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation *reservation;
+
+ if (list_empty_careful(&zi->zi_reclaim_reservations)) {
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ return;
+ }
+
+ spin_lock(&zi->zi_reservation_lock);
+ xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
+ count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
+ list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
+ if (reservation->count_fsb > count_fsb)
+ break;
+ wake_up_process(reservation->task);
+ count_fsb -= reservation->count_fsb;
+
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+}
+
+static int
+xfs_zoned_space_wait_error(
+ struct xfs_mount *mp)
+{
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ return 0;
+}
+
+static int
+xfs_zoned_reserve_available(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb,
+ unsigned int flags)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ struct xfs_zone_reservation reservation = {
+ .task = current,
+ .count_fsb = count_fsb,
+ };
+ int error;
+
+ /*
+ * If there are no waiters, try to directly grab the available blocks
+ * from the percpu counter.
+ *
+ * If the caller wants to dip into the reserved pool also bypass the
+ * wait list. This relies on the fact that we have a very graciously
+ * sized reserved pool that always has enough space. If the reserved
+ * allocations fail we're in trouble.
+ */
+ if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
+ (flags & XFS_ZR_RESERVED))) {
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ return error;
+ }
+
+ if (flags & XFS_ZR_NOWAIT)
+ return -EAGAIN;
+
+ spin_lock(&zi->zi_reservation_lock);
+ list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
+ while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
+ set_current_state(TASK_KILLABLE);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error != -ENOSPC)
+ break;
+
+ /*
+ * Make sure to start GC if it is not running already. As we
+ * check the rtavailable count when filling up zones, GC is
+ * normally already running at this point, but in some setups
+ * with very few zones we may completely run out of non-
+ * reserved blocks in between filling zones.
+ */
+ if (!xfs_is_zonegc_running(mp))
+ wake_up_process(zi->zi_gc_thread);
+
+ /*
+ * If there is no reclaimable group left and we aren't still
+ * processing a pending GC request give up as we're fully out
+ * of space.
+ */
+ if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
+ !xfs_is_zonegc_running(mp))
+ break;
+
+ spin_unlock(&zi->zi_reservation_lock);
+ schedule();
+ spin_lock(&zi->zi_reservation_lock);
+ }
+ list_del(&reservation.entry);
+ spin_unlock(&zi->zi_reservation_lock);
+
+ __set_current_state(TASK_RUNNING);
+ return error;
+}
+
+/*
+ * Implement greedy space allocation for short writes by trying to grab all
+ * that is left after locking out other threads from trying to do the same.
+ *
+ * This isn't exactly optimal and can hopefully be replaced by a proper
+ * percpu_counter primitive one day.
+ */
+static int
+xfs_zoned_reserve_extents_greedy(
+ struct xfs_mount *mp,
+ xfs_filblks_t *count_fsb,
+ unsigned int flags)
+{
+ struct xfs_zone_info *zi = mp->m_zone_info;
+ s64 len = *count_fsb;
+ int error = -ENOSPC;
+
+ spin_lock(&zi->zi_reservation_lock);
+ len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
+ if (len > 0) {
+ *count_fsb = len;
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
+ flags & XFS_ZR_RESERVED);
+ }
+ spin_unlock(&zi->zi_reservation_lock);
+ return error;
+}
+
+int
+xfs_zoned_space_reserve(
+ struct xfs_mount *mp,
+ xfs_filblks_t count_fsb,
+ unsigned int flags,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ int error;
+
+ ASSERT(ac->reserved_blocks == 0);
+ ASSERT(ac->open_zone == NULL);
+
+ error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
+ flags & XFS_ZR_RESERVED);
+ if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
+ error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
+ if (error)
+ return error;
+
+ error = xfs_zoned_reserve_available(mp, count_fsb, flags);
+ if (error) {
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
+ return error;
+ }
+ ac->reserved_blocks = count_fsb;
+ return 0;
+}
+
+void
+xfs_zoned_space_unreserve(
+ struct xfs_mount *mp,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ if (ac->reserved_blocks > 0) {
+ xfs_zoned_add_available(mp, ac->reserved_blocks);
+ xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
+ }
+ if (ac->open_zone)
+ xfs_open_zone_put(ac->open_zone);
+}
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 35166c92420c..fd3a5922f6c3 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -124,37 +124,46 @@ static void zonefs_readahead(struct readahead_control *rac)
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset,
- unsigned int len)
+static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, u64 offset, unsigned len, u64 end_pos)
{
- struct zonefs_zone *z = zonefs_inode_zone(inode);
+ struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);
if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
return -EIO;
- if (WARN_ON_ONCE(offset >= i_size_read(inode)))
+ if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
return -EIO;
/* If the mapping is already OK, nothing needs to be done */
- if (offset >= wpc->iomap.offset &&
- offset < wpc->iomap.offset + wpc->iomap.length)
- return 0;
+ if (offset < wpc->iomap.offset ||
+ offset >= wpc->iomap.offset + wpc->iomap.length) {
+ int error;
+
+ error = zonefs_write_iomap_begin(wpc->inode, offset,
+ z->z_capacity - offset, IOMAP_WRITE,
+ &wpc->iomap, NULL);
+ if (error)
+ return error;
+ }
- return zonefs_write_iomap_begin(inode, offset,
- z->z_capacity - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_write_map_blocks,
+ .writeback_range = zonefs_writeback_range,
+ .writeback_submit = iomap_ioend_writeback_submit,
};
static int zonefs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct iomap_writepage_ctx wpc = { };
+ struct iomap_writepage_ctx wpc = {
+ .inode = mapping->host,
+ .wbc = wbc,
+ .ops = &zonefs_writeback_ops,
+ };
- return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
+ return iomap_writepages(&wpc);
}
static int zonefs_swap_activate(struct swap_info_struct *sis,
@@ -299,7 +308,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
@@ -312,8 +321,10 @@ static const struct vm_operations_struct zonefs_file_vm_ops = {
.page_mkwrite = zonefs_filemap_page_mkwrite,
};
-static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int zonefs_file_mmap_prepare(struct vm_area_desc *desc)
{
+ struct file *file = desc->file;
+
/*
* Conventional zones accept random writes, so their files can support
* shared writable mappings. For sequential zone files, only read
@@ -321,11 +332,11 @@ static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
* ordering between msync() and page cache writeback.
*/
if (zonefs_inode_is_seq(file_inode(file)) &&
- (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
return -EINVAL;
file_accessed(file);
- vma->vm_ops = &zonefs_file_vm_ops;
+ desc->vm_ops = &zonefs_file_vm_ops;
return 0;
}
@@ -563,7 +574,8 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops,
+ NULL, NULL);
if (ret == -EIO)
zonefs_io_error(inode, true);
@@ -850,7 +862,7 @@ const struct file_operations zonefs_file_operations = {
.open = zonefs_file_open,
.release = zonefs_file_release,
.fsync = zonefs_file_fsync,
- .mmap = zonefs_file_mmap,
+ .mmap_prepare = zonefs_file_mmap_prepare,
.llseek = zonefs_file_llseek,
.read_iter = zonefs_file_read_iter,
.write_iter = zonefs_file_write_iter,
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index faf1eb87895d..4dc7f967c861 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1111,28 +1111,20 @@ static int zonefs_read_super(struct super_block *sb)
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_super *super;
u32 crc, stored_crc;
- struct page *page;
- struct bio_vec bio_vec;
- struct bio bio;
int ret;
- page = alloc_page(GFP_KERNEL);
- if (!page)
+ super = kmalloc(ZONEFS_SUPER_SIZE, GFP_KERNEL);
+ if (!super)
return -ENOMEM;
- bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = 0;
- __bio_add_page(&bio, page, PAGE_SIZE, 0);
-
- ret = submit_bio_wait(&bio);
+ ret = bdev_rw_virt(sb->s_bdev, 0, super, ZONEFS_SUPER_SIZE,
+ REQ_OP_READ);
if (ret)
- goto free_page;
-
- super = page_address(page);
+ goto free_super;
ret = -EINVAL;
if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
- goto free_page;
+ goto free_super;
stored_crc = le32_to_cpu(super->s_crc);
super->s_crc = 0;
@@ -1140,14 +1132,14 @@ static int zonefs_read_super(struct super_block *sb)
if (crc != stored_crc) {
zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
crc, stored_crc);
- goto free_page;
+ goto free_super;
}
sbi->s_features = le64_to_cpu(super->s_features);
if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
zonefs_err(sb, "Unknown features set 0x%llx\n",
sbi->s_features);
- goto free_page;
+ goto free_super;
}
if (sbi->s_features & ZONEFS_F_UID) {
@@ -1155,7 +1147,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_uid));
if (!uid_valid(sbi->s_uid)) {
zonefs_err(sb, "Invalid UID feature\n");
- goto free_page;
+ goto free_super;
}
}
@@ -1164,7 +1156,7 @@ static int zonefs_read_super(struct super_block *sb)
le32_to_cpu(super->s_gid));
if (!gid_valid(sbi->s_gid)) {
zonefs_err(sb, "Invalid GID feature\n");
- goto free_page;
+ goto free_super;
}
}
@@ -1173,15 +1165,14 @@ static int zonefs_read_super(struct super_block *sb)
if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
zonefs_err(sb, "Reserved area is being used\n");
- goto free_page;
+ goto free_super;
}
import_uuid(&sbi->s_uuid, super->s_uuid);
ret = 0;
-free_page:
- __free_page(page);
-
+free_super:
+ kfree(super);
return ret;
}